1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
25#include "llvm/ADT/Statistic.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Intrinsics.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
73 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
79
81 "x86-br-merging-base-cost", cl::init(2),
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus be split in multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
90
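// For example, for 'if (a == b && c > d)' the merged form evaluates both
// compares unconditionally and combines their results into a single
// conditional branch, while the split form emits two compare-and-branch
// pairs; the threshold above bounds the extra instructions the merged form
// may cost.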
92 "x86-br-merging-ccmp-bias", cl::init(6),
93 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
94 "supports conditional compare instructions."),
96
97static cl::opt<bool>
98 WidenShift("x86-widen-shift", cl::init(true),
99 cl::desc("Replace narrow shifts with wider shifts."),
100 cl::Hidden);
101
103 "x86-br-merging-likely-bias", cl::init(0),
104 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
105 "that all conditionals will be executed. For example for merging "
106 "the conditionals (a == b && c > d), if its known that a == b is "
107 "likely, then it is likely that if the conditionals are split "
108 "both sides will be executed, so it may be desirable to increase "
109 "the instruction cost threshold. Set to -1 to never merge likely "
110 "branches."),
111 cl::Hidden);
112
114 "x86-br-merging-unlikely-bias", cl::init(-1),
115 cl::desc(
116 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
117 "that all conditionals will be executed. For example for merging "
118 "the conditionals (a == b && c > d), if its known that a == b is "
119 "unlikely, then it is unlikely that if the conditionals are split "
120 "both sides will be executed, so it may be desirable to decrease "
121 "the instruction cost threshold. Set to -1 to never merge unlikely "
122 "branches."),
123 cl::Hidden);
124
126 "mul-constant-optimization", cl::init(true),
127 cl::desc("Replace 'mul x, Const' with more effective instructions like "
128 "SHIFT, LEA, etc."),
129 cl::Hidden);
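// For example, 'x * 9' can be emitted as a single LEA (x + 8*x) and
// 'x * 40' as a shift followed by an LEA, which is usually cheaper than IMUL.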
130
132 const X86Subtarget &STI)
133 : TargetLowering(TM), Subtarget(STI) {
134 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
135 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
136
137 // Set up the TargetLowering object.
138
139 // X86 is weird. It always uses i8 for shift amounts and setcc results.
141 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
143
144 // X86 instruction cache is coherent with its data cache so we can use the
145 // default expansion to a no-op.
147
148 // For 64-bit, since we have so many registers, use the ILP scheduler.
149 // For 32-bit, use the register pressure specific scheduling.
150 // For Atom, always use ILP scheduling.
151 if (Subtarget.isAtom())
153 else if (Subtarget.is64Bit())
155 else
157 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
158 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
159
160 // Bypass expensive divides and use cheaper ones.
161 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
162 if (Subtarget.hasSlowDivide32())
163 addBypassSlowDiv(32, 8);
164 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
165 addBypassSlowDiv(64, 32);
166 }
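// addBypassSlowDiv(W, N) wraps each W-bit divide in a runtime check: when
// both operands happen to fit in N bits, the much cheaper N-bit divide is
// used instead of the full-width one.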
167
168 if (Subtarget.canUseCMPXCHG16B())
170 else if (Subtarget.canUseCMPXCHG8B())
172 else
174
175 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
176
178
179 // Set up the register classes.
180 addRegisterClass(MVT::i8, &X86::GR8RegClass);
181 addRegisterClass(MVT::i16, &X86::GR16RegClass);
182 addRegisterClass(MVT::i32, &X86::GR32RegClass);
183 if (Subtarget.is64Bit())
184 addRegisterClass(MVT::i64, &X86::GR64RegClass);
185
186 for (MVT VT : MVT::integer_valuetypes())
188
189 // We don't accept any truncstore of integer registers.
190 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
191 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
192 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
193 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
194 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
195 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
196
197 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
198
199 // SETOEQ and SETUNE require checking two conditions.
200 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
203 }
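// (UCOMISS/UCOMISD report equality as ZF=1 but also raise PF for unordered
// operands, so an ordered-equal or unordered-not-equal test needs two flag
// checks, e.g. SETE plus SETNP.)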
204
205 // Integer absolute.
206 if (Subtarget.canUseCMOV()) {
207 setOperationAction(ISD::ABS , MVT::i16 , Custom);
208 setOperationAction(ISD::ABS , MVT::i32 , Custom);
209 if (Subtarget.is64Bit())
210 setOperationAction(ISD::ABS , MVT::i64 , Custom);
211 }
212
213 // Absolute difference.
214 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
215 setOperationAction(Op , MVT::i8 , Custom);
216 setOperationAction(Op , MVT::i16 , Custom);
217 setOperationAction(Op , MVT::i32 , Custom);
218 if (Subtarget.is64Bit())
219 setOperationAction(Op , MVT::i64 , Custom);
220 }
221
222 // Signed saturation subtraction.
226 if (Subtarget.is64Bit())
228
229 // Funnel shifts.
230 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
231 // For slow shld targets we only lower for code size.
232 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
233
234 setOperationAction(ShiftOp , MVT::i8 , Custom);
235 setOperationAction(ShiftOp , MVT::i16 , Custom);
236 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
237 if (Subtarget.is64Bit())
238 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
239 }
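// On such targets the custom lowering only forms SHLD/SHRD when optimizing
// for size and otherwise expands the funnel shift to two plain shifts and
// an OR.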
240
241 if (!Subtarget.useSoftFloat()) {
242 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
243 // operation.
248 // We have an algorithm for SSE2, and we turn this into a 64-bit
249 // FILD or VCVTUSI2SS/SD for other targets.
252 // We have an algorithm for SSE2->double, and we turn this into a
253 // 64-bit FILD followed by conditional FADD for other targets.
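// (The conditional FADD adds 2^64 to compensate when FILD interpreted an
// i64 with its sign bit set as a negative value.)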
256
257 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
258 // this operation.
261 // SSE has no i16 to fp conversion, only i32. We promote in the handler
262 // to allow f80 to use i16 and f64 to use i16 with sse1 only
265 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
268 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
269 // are Legal, f80 is custom lowered.
272
273 // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
274 // this operation.
276 // FIXME: This doesn't generate invalid exception when it should. PR44019.
282 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
283 // are Legal, f80 is custom lowered.
286
287 // Handle FP_TO_UINT by promoting the destination to a larger signed
288 // conversion.
290 // FIXME: This doesn't generate invalid exception when it should. PR44019.
293 // FIXME: This doesn't generate invalid exception when it should. PR44019.
299
300 setOperationAction(ISD::LRINT, MVT::f32, Custom);
301 setOperationAction(ISD::LRINT, MVT::f64, Custom);
302 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
303 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
304
305 if (!Subtarget.is64Bit()) {
306 setOperationAction(ISD::LRINT, MVT::i64, Custom);
307 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
308 }
309 }
310
311 if (Subtarget.hasSSE2()) {
312 // Custom lowering for saturating float to int conversions.
313 // We handle promotion to larger result types manually.
314 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
317 }
320 if (Subtarget.is64Bit()) {
323 }
324 }
325 if (Subtarget.hasAVX10_2()) {
330 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
331 MVT::v4i64}) {
334 }
335 if (Subtarget.is64Bit()) {
338 }
339 }
340
341 // Handle address space casts between mixed sized pointers.
342 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
343 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
344
345 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
346 if (!Subtarget.hasSSE2()) {
347 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
348 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
351 if (Subtarget.is64Bit()) {
352 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
353 // Without SSE, i64->f64 goes through memory.
354 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
355 }
356 } else if (!Subtarget.is64Bit())
357 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
358
359 // Scalar integer divide and remainder are lowered to use operations that
360 // produce two results, to match the available instructions. This exposes
361 // the two-result form to trivial CSE, which is able to combine x/y and x%y
362 // into a single instruction.
363 //
364 // Scalar integer multiply-high is also lowered to use two-result
365 // operations, to match the available instructions. However, plain multiply
366 // (low) operations are left as Legal, as there are single-result
367 // instructions for this in x86. Using the two-result multiply instructions
368 // when both high and low results are needed must be arranged by dagcombine.
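// For example, 'q = x / y; r = x % y;' becomes a single DIV/IDIV, which
// already leaves the quotient in EAX and the remainder in EDX for 32-bit
// operands.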
369 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
376 }
377
378 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
379 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
380 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
381 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
382 setOperationAction(ISD::BR_CC, VT, Expand);
384 }
385 if (Subtarget.is64Bit())
390
391 setOperationAction(ISD::FREM , MVT::f32 , Expand);
392 setOperationAction(ISD::FREM , MVT::f64 , Expand);
393 setOperationAction(ISD::FREM , MVT::f80 , Expand);
394 setOperationAction(ISD::FREM , MVT::f128 , Expand);
395
396 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
398 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
399 setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
400 setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
401 setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
402 }
403
404 // Promote the i8 variants and force them on up to i32 which has a shorter
405 // encoding.
406 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
408 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
409 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
410 // promote that too.
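// (Older CPUs simply ignore the REP prefix and execute a plain BSF, so the
// REP BSF encoding is backwards compatible.)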
411 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
413
414 if (!Subtarget.hasBMI()) {
415 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
417 if (Subtarget.is64Bit()) {
418 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
420 }
421 }
422
423 if (Subtarget.hasLZCNT()) {
424 // When promoting the i8 variants, force them to i32 for a shorter
425 // encoding.
426 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
428 } else {
429 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
430 if (VT == MVT::i64 && !Subtarget.is64Bit())
431 continue;
434 }
435 }
436
437 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
438 ISD::STRICT_FP_TO_FP16}) {
439 // Special handling for half-precision floating point conversions.
440 // If we don't have F16C support, then lower half float conversions
441 // into library calls.
443 Op, MVT::f32,
444 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
445 // There's never any support for operations beyond MVT::f32.
446 setOperationAction(Op, MVT::f64, Expand);
447 setOperationAction(Op, MVT::f80, Expand);
448 setOperationAction(Op, MVT::f128, Expand);
449 }
450
451 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
452 setOperationAction(ISD::STRICT_FP_TO_BF16, VT, Expand);
453 setOperationAction(ISD::STRICT_BF16_TO_FP, VT, Expand);
454 }
455
456 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
457 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
458 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
459 setTruncStoreAction(VT, MVT::f16, Expand);
460 setTruncStoreAction(VT, MVT::bf16, Expand);
461
462 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
463 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
464 }
465
469 if (Subtarget.is64Bit())
471 if (Subtarget.hasPOPCNT()) {
472 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
473 // popcntw is longer to encode than popcntl and also has a false dependency
474 // on the dest that popcntl hasn't had since Cannon Lake.
475 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
476 } else {
481 }
482
483 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
484
485 if (!Subtarget.hasMOVBE())
487
488 // X86 wants to expand cmov itself.
489 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
494 }
495 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
496 if (VT == MVT::i64 && !Subtarget.is64Bit())
497 continue;
500 }
501
502 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
505
507 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
508 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
512
513 // Darwin ABI issue.
514 for (auto VT : { MVT::i32, MVT::i64 }) {
515 if (VT == MVT::i64 && !Subtarget.is64Bit())
516 continue;
523 }
524
525 // 64-bit shl, sra, srl (iff 32-bit x86)
526 for (auto VT : { MVT::i32, MVT::i64 }) {
527 if (VT == MVT::i64 && !Subtarget.is64Bit())
528 continue;
532 }
533
534 if (Subtarget.hasSSEPrefetch())
535 setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
536
537 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
538
539 // Expand certain atomics
540 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
541 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
542 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
543 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
544 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
545 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
546 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
547 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
548 }
549
550 if (!Subtarget.is64Bit())
551 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
552
553 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
554 // All CPUs supporting AVX will atomically load/store aligned 128-bit
555 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
556 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
557 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
558 }
559
560 if (Subtarget.canUseCMPXCHG16B())
561 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
562
563 // FIXME - use subtarget debug flags
564 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
565 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
566 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
567 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
568 }
569
572
573 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
574 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
575
576 setOperationAction(ISD::TRAP, MVT::Other, Legal);
577 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
578 if (Subtarget.isTargetPS())
579 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
580 else
581 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
582
583 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
584 setOperationAction(ISD::VASTART , MVT::Other, Custom);
585 setOperationAction(ISD::VAEND , MVT::Other, Expand);
586 bool Is64Bit = Subtarget.is64Bit();
587 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
588 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
589
590 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
591 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
592
593 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
594
595 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
596 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
597 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
598
600
601 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
602 setOperationAction(ISD::FABS, VT, Action);
603 setOperationAction(ISD::FNEG, VT, Action);
605 setOperationAction(ISD::FREM, VT, Action);
606 setOperationAction(ISD::FMA, VT, Action);
607 setOperationAction(ISD::FMINNUM, VT, Action);
608 setOperationAction(ISD::FMAXNUM, VT, Action);
609 setOperationAction(ISD::FMINIMUM, VT, Action);
610 setOperationAction(ISD::FMAXIMUM, VT, Action);
611 setOperationAction(ISD::FMINIMUMNUM, VT, Action);
612 setOperationAction(ISD::FMAXIMUMNUM, VT, Action);
613 setOperationAction(ISD::FSIN, VT, Action);
614 setOperationAction(ISD::FCOS, VT, Action);
615 setOperationAction(ISD::FSINCOS, VT, Action);
616 setOperationAction(ISD::FTAN, VT, Action);
617 setOperationAction(ISD::FSQRT, VT, Action);
618 setOperationAction(ISD::FPOW, VT, Action);
619 setOperationAction(ISD::FPOWI, VT, Action);
620 setOperationAction(ISD::FLOG, VT, Action);
621 setOperationAction(ISD::FLOG2, VT, Action);
622 setOperationAction(ISD::FLOG10, VT, Action);
623 setOperationAction(ISD::FEXP, VT, Action);
624 setOperationAction(ISD::FEXP2, VT, Action);
625 setOperationAction(ISD::FEXP10, VT, Action);
626 setOperationAction(ISD::FCEIL, VT, Action);
627 setOperationAction(ISD::FFLOOR, VT, Action);
628 setOperationAction(ISD::FNEARBYINT, VT, Action);
629 setOperationAction(ISD::FRINT, VT, Action);
630 setOperationAction(ISD::BR_CC, VT, Action);
631 setOperationAction(ISD::SETCC, VT, Action);
634 setOperationAction(ISD::FROUND, VT, Action);
635 setOperationAction(ISD::FROUNDEVEN, VT, Action);
636 setOperationAction(ISD::FTRUNC, VT, Action);
637 setOperationAction(ISD::FLDEXP, VT, Action);
638 };
639
640 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
641 // f16, f32 and f64 use SSE.
642 // Set up the FP register classes.
643 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
644 : &X86::FR16RegClass);
645 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
646 : &X86::FR32RegClass);
647 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
648 : &X86::FR64RegClass);
649
650 // Disable f32->f64 extload as we can only generate this in one instruction
651 // under optsize. So it's easier to pattern match (fpext (load)) for that
652 // case instead of needing to emit 2 instructions for extload in the
653 // non-optsize case.
654 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
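// With the extload disabled, a float-to-double extension of a loaded value
// is matched as (fpext (load)); the load is folded into CVTSS2SD only when
// that is profitable (e.g. when optimizing for size), and otherwise a
// separate load feeds a register-to-register CVTSS2SD.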
655
656 for (auto VT : { MVT::f32, MVT::f64 }) {
657 // Use ANDPD to simulate FABS.
658 setOperationAction(ISD::FABS, VT, Custom);
659
660 // Use XORP to simulate FNEG.
661 setOperationAction(ISD::FNEG, VT, Custom);
662
663 // Use ANDPD and ORPD to simulate FCOPYSIGN.
665
666 // These might be better off as horizontal vector ops.
669
670 // We don't support sin/cos/fmod
671 setOperationAction(ISD::FSIN , VT, Expand);
672 setOperationAction(ISD::FCOS , VT, Expand);
673 setOperationAction(ISD::FSINCOS, VT, Expand);
674 }
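// (FABS clears the sign bit by ANDing with a constant mask, FNEG flips it
// with an XOR, and FCOPYSIGN merges magnitude and sign bits using
// AND/ANDN/OR, keeping everything in the SSE domain.)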
675
676 // Half type will be promoted by default.
677 setF16Action(MVT::f16, Promote);
682 setOperationAction(ISD::FABS, MVT::f16, Custom);
683 setOperationAction(ISD::FNEG, MVT::f16, Custom);
686 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
687 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
688
719 setOperationAction(ISD::LRINT, MVT::f16, Expand);
720 setOperationAction(ISD::LLRINT, MVT::f16, Expand);
721
722 // Lower this to MOVMSK plus an AND.
725
726 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
727 (UseX87 || Is64Bit)) {
728 // Use SSE for f32, x87 for f64.
729 // Set up the FP register classes.
730 addRegisterClass(MVT::f32, &X86::FR32RegClass);
731 if (UseX87)
732 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
733
734 // Use ANDPS to simulate FABS.
735 setOperationAction(ISD::FABS , MVT::f32, Custom);
736
737 // Use XORP to simulate FNEG.
738 setOperationAction(ISD::FNEG , MVT::f32, Custom);
739
740 if (UseX87)
742
743 // Use ANDPS and ORPS to simulate FCOPYSIGN.
744 if (UseX87)
747
748 // We don't support sin/cos/fmod
749 setOperationAction(ISD::FSIN , MVT::f32, Expand);
750 setOperationAction(ISD::FCOS , MVT::f32, Expand);
751 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
752
753 if (UseX87) {
754 // Always expand sin/cos functions even though x87 has an instruction.
755 setOperationAction(ISD::FSIN, MVT::f64, Expand);
756 setOperationAction(ISD::FCOS, MVT::f64, Expand);
757 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
758 }
759 } else if (UseX87) {
760 // f32 and f64 in x87.
761 // Set up the FP register classes.
762 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
763 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
764
765 for (auto VT : { MVT::f32, MVT::f64 }) {
768
769 // Always expand sin/cos functions even though x87 has an instruction.
770 setOperationAction(ISD::FSIN , VT, Expand);
771 setOperationAction(ISD::FCOS , VT, Expand);
772 setOperationAction(ISD::FSINCOS, VT, Expand);
773 }
774 }
775
776 // Expand FP32 immediates into loads from the stack, save special cases.
777 if (isTypeLegal(MVT::f32)) {
778 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
779 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
780 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
781 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
782 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
783 } else // SSE immediates.
784 addLegalFPImmediate(APFloat(+0.0f)); // xorps
785 }
786 // Expand FP64 immediates into loads from the stack, save special cases.
787 if (isTypeLegal(MVT::f64)) {
788 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
789 addLegalFPImmediate(APFloat(+0.0)); // FLD0
790 addLegalFPImmediate(APFloat(+1.0)); // FLD1
791 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
792 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
793 } else // SSE immediates.
794 addLegalFPImmediate(APFloat(+0.0)); // xorpd
795 }
796 // Support fp16 0 immediate.
797 if (isTypeLegal(MVT::f16))
798 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
799
800 // Handle constrained floating-point operations of scalar.
813
814 // We don't support FMA.
817
818 // f80 always uses X87.
819 if (UseX87) {
820 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
823 {
825 addLegalFPImmediate(TmpFlt); // FLD0
826 TmpFlt.changeSign();
827 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
828
829 bool ignored;
830 APFloat TmpFlt2(+1.0);
832 &ignored);
833 addLegalFPImmediate(TmpFlt2); // FLD1
834 TmpFlt2.changeSign();
835 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
836 }
837
838 // Always expand sin/cos functions even though x87 has an instruction.
839 // clang-format off
840 setOperationAction(ISD::FSIN , MVT::f80, Expand);
841 setOperationAction(ISD::FCOS , MVT::f80, Expand);
842 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
843 setOperationAction(ISD::FTAN , MVT::f80, Expand);
844 setOperationAction(ISD::FASIN , MVT::f80, Expand);
845 setOperationAction(ISD::FACOS , MVT::f80, Expand);
846 setOperationAction(ISD::FATAN , MVT::f80, Expand);
847 setOperationAction(ISD::FATAN2 , MVT::f80, Expand);
848 setOperationAction(ISD::FSINH , MVT::f80, Expand);
849 setOperationAction(ISD::FCOSH , MVT::f80, Expand);
850 setOperationAction(ISD::FTANH , MVT::f80, Expand);
851 // clang-format on
852
853 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
854 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
855 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
856 setOperationAction(ISD::FRINT, MVT::f80, Expand);
857 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
858 setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
860 setOperationAction(ISD::LROUND, MVT::f80, LibCall);
861 setOperationAction(ISD::LLROUND, MVT::f80, LibCall);
862 setOperationAction(ISD::LRINT, MVT::f80, Custom);
863 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
864
865 // Handle constrained floating-point operations of scalar.
872 if (isTypeLegal(MVT::f16)) {
873 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
875 } else {
877 }
878 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
879 // as Custom.
881 }
882
883 // f128 uses xmm registers, but most operations require libcalls.
884 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
885 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
886 : &X86::VR128RegClass);
887
888 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
889
900
901 setOperationAction(ISD::FABS, MVT::f128, Custom);
902 setOperationAction(ISD::FNEG, MVT::f128, Custom);
904
905 // clang-format off
906 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
908 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
910 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
911 setOperationAction(ISD::FTAN, MVT::f128, LibCall);
913 // clang-format on
914 // No STRICT_FSINCOS
915 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
917
918 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
920 // We need to custom handle any FP_ROUND with an f128 input, but
921 // LegalizeDAG uses the result type to know when to run a custom handler.
922 // So we have to list all legal floating point result types here.
923 if (isTypeLegal(MVT::f32)) {
926 }
927 if (isTypeLegal(MVT::f64)) {
930 }
931 if (isTypeLegal(MVT::f80)) {
935 }
936
938
939 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
940 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
941 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
942 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
943 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
944 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
945 }
946
947 // Always use a library call for pow.
948 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
949 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
950 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
951 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
952
953 setOperationAction(ISD::FLOG, MVT::f80, Expand);
954 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
955 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
956 setOperationAction(ISD::FEXP, MVT::f80, Expand);
957 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
958 setOperationAction(ISD::FEXP10, MVT::f80, Expand);
959 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
960 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
961
962 // Some FP actions are always expanded for vector types.
963 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
964 MVT::v4f32, MVT::v8f32, MVT::v16f32,
965 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
966 // clang-format off
967 setOperationAction(ISD::FSIN, VT, Expand);
968 setOperationAction(ISD::FSINCOS, VT, Expand);
969 setOperationAction(ISD::FCOS, VT, Expand);
970 setOperationAction(ISD::FTAN, VT, Expand);
973 setOperationAction(ISD::FPOW, VT, Expand);
974 setOperationAction(ISD::FLOG, VT, Expand);
975 setOperationAction(ISD::FLOG2, VT, Expand);
976 setOperationAction(ISD::FLOG10, VT, Expand);
977 setOperationAction(ISD::FEXP, VT, Expand);
978 setOperationAction(ISD::FEXP2, VT, Expand);
979 setOperationAction(ISD::FEXP10, VT, Expand);
980 // clang-format on
981 }
982
983 // First set operation action for all vector types to either promote
984 // (for widening) or expand (for scalarization). Then we will selectively
985 // turn on ones that can be effectively codegen'd.
996 setOperationAction(ISD::FFLOOR, VT, Expand);
997 setOperationAction(ISD::FCEIL, VT, Expand);
998 setOperationAction(ISD::FTRUNC, VT, Expand);
999 setOperationAction(ISD::FRINT, VT, Expand);
1000 setOperationAction(ISD::FNEARBYINT, VT, Expand);
1001 setOperationAction(ISD::FROUNDEVEN, VT, Expand);
1025 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1026 setTruncStoreAction(InnerVT, VT, Expand);
1027
1028 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1029 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1030
1031 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1032 // types, we have to deal with them whether we ask for Expansion or not.
1033 // Setting Expand causes its own optimisation problems though, so leave
1034 // them legal.
1035 if (VT.getVectorElementType() == MVT::i1)
1036 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1037
1038 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1039 // split/scalarized right now.
1040 if (VT.getVectorElementType() == MVT::f16 ||
1041 VT.getVectorElementType() == MVT::bf16)
1042 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1043 }
1044 }
1045
1046 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1047 // with -msoft-float, disable use of MMX as well.
1048 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1049 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1050 // No operations on x86mmx supported, everything uses intrinsics.
1051 }
1052
1053 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1054 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1055 : &X86::VR128RegClass);
1056
1057 setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
1058 setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
1059 setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
1060 setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
1061
1062 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1063 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1071
1072 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1073 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1075
1081 }
1082
1083 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1084 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1085 : &X86::VR128RegClass);
1086
1087 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1088 // registers cannot be used even for integer operations.
1089 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1090 : &X86::VR128RegClass);
1091 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1092 : &X86::VR128RegClass);
1093 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1094 : &X86::VR128RegClass);
1095 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1096 : &X86::VR128RegClass);
1097 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1098 : &X86::VR128RegClass);
1099
1100 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1101 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1102 setOperationAction(ISD::FMINIMUM, VT, Custom);
1103 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1104 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1105 }
1106
1107 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1108 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1113 }
1114
1115 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1116 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1117 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1118
1119 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1120 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1121 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1122 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1123 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1124 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1125 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1127 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1128 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1131
1132 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1133 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1134 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1135
1136 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1138 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1140
1141 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1142 setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
1143
1144 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1145 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1146 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1147 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1148 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1149 }
1150
1161
1166
1167 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1173
1174 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1175 // setcc all the way to isel and prefer SETGT in some isel patterns.
1178 }
1179
1180 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1181 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1186
1187 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1193 }
1194
1195 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1199
1200 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1201 continue;
1202
1205 }
1206 setF16Action(MVT::v8f16, Expand);
1207 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1208 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1209 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1210 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1211 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1212 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1214
1215 // Custom lower v2i64 and v2f64 selects.
1222
1229
1230 // Custom legalize these to avoid over promotion or custom promotion.
1231 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1236 }
1237
1242
1245
1248
1249 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1254
1255 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1259
1260 // We want to legalize this to an f64 load rather than an i64 load on
1261 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1262 // store.
1263 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1264 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1265 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1266 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1267 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1268 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1269
1270 // Add 32-bit vector stores to help vectorization opportunities.
1271 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1272 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1273
1274 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1275 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1276 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1277 if (!Subtarget.hasAVX512())
1278 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1279
1283
1285
1302
1303 // In the customized shift lowering, the legal v4i32/v2i64 cases
1304 // in AVX2 will be recognized.
1305 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1309 if (VT == MVT::v2i64) continue;
1314 }
1315
1321 }
1322
1323 if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
1328
1329 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1331 }
1332 }
1333
1334 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1335 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1336 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1337 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1338
1339 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1342 }
1343
1344 // These might be better off as horizontal vector ops.
1349 }
1350
1351 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1352 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1353 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1355 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1357 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1359 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1361 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1363 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1365
1366 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1367 }
1368
1369 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1370 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1371 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1373 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1375 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1377
1381
1382 // FIXME: Do we need to handle scalar-to-vector here?
1383 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1384 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1385
1386 // We directly match byte blends in the backend as they match the VSELECT
1387 // condition form.
1389
1390 // SSE41 brings specific instructions for doing vector sign extend even in
1391 // cases where we don't have SRA.
1392 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1395 }
1396
1397 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1398 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1399 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1400 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1405 }
1406
1407 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1408 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1409 // do the pre and post work in the vector domain.
1412 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1413 // so that DAG combine doesn't try to turn it into uint_to_fp.
1416 }
1417 }
1418
1419 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1421 }
1422
1423 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1424 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1425 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1428 }
1429
1430 // XOP can efficiently perform BITREVERSE with VPPERM.
1431 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1433 }
1434
1435 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1436 bool HasInt256 = Subtarget.hasInt256();
1437
1438 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1439 : &X86::VR256RegClass);
1440 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1441 : &X86::VR256RegClass);
1442 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1443 : &X86::VR256RegClass);
1444 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1445 : &X86::VR256RegClass);
1446 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1447 : &X86::VR256RegClass);
1448 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1449 : &X86::VR256RegClass);
1450 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1451 : &X86::VR256RegClass);
1452
1453 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1454 setOperationAction(ISD::FFLOOR, VT, Legal);
1456 setOperationAction(ISD::FCEIL, VT, Legal);
1458 setOperationAction(ISD::FTRUNC, VT, Legal);
1460 setOperationAction(ISD::FRINT, VT, Legal);
1462 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1464 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1466
1467 setOperationAction(ISD::FROUND, VT, Custom);
1468
1469 setOperationAction(ISD::FNEG, VT, Custom);
1470 setOperationAction(ISD::FABS, VT, Custom);
1472
1473 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1474 setOperationAction(ISD::FMINIMUM, VT, Custom);
1475 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1476 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1478 }
1479
1480 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1481 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1482
1483 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1484 // even though v8i16 is a legal type.
1485 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1486 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1488 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1492
1495 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1497 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1499
1511
1512 if (!Subtarget.hasAVX512())
1513 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1514
1515 // In the customized shift lowering, the legal v8i32/v4i64 cases
1516 // in AVX2 will be recognized.
1517 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1523 if (VT == MVT::v4i64) continue;
1528 }
1529
1530 // These types need custom splitting if their input is a 128-bit vector.
1535
1539 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1540 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1543
1544 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1548 }
1549
1554
1555 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1560
1561 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1562 // setcc all the way to isel and prefer SETGT in some isel patterns.
1565 }
1566
1567 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1568 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1573
1574 if (Subtarget.hasAnyFMA()) {
1575 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1576 MVT::v2f64, MVT::v4f64 }) {
1579 }
1580 }
1581
1582 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1583 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1584 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1585 }
1586
1587 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1588 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1589 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1591
1592 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1593 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1594 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1595 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1596 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1597 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1598 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1599 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1600
1601 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1602 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1603
1604 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1605 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1606 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1607 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1608 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1609
1610 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1611 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1617 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1622
1623 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1624 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1625 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1629 }
1630
1631 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1634 }
1635
1636 if (HasInt256) {
1637 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1638 // when we have a 256-bit-wide blend with immediate.
1641
1642 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1643 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1644 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1645 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1649 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1650 }
1651 }
1652
1653 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1654 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1655 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1656 setOperationAction(ISD::MSTORE, VT, Legal);
1657 }
1658
1659 // Extract subvector is special because the value type
1660 // (result) is 128-bit but the source is 256-bit wide.
1661 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1662 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1664 }
1665
1666 // Custom lower several nodes for 256-bit types.
1667 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1668 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1677 setOperationAction(ISD::STORE, VT, Custom);
1678 }
1679 setF16Action(MVT::v16f16, Expand);
1680 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1681 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1683 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1684 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1686 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1687
1688 if (HasInt256) {
1690
1691 // Custom legalize 2x32 to get a little better code.
1692 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1693 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1694
1695 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1696 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1697 setOperationAction(ISD::MGATHER, VT, Custom);
1698 }
1699 }
1700
1701 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1702 Subtarget.hasF16C()) {
1703 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1706 }
1707 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1708 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1710 }
1711 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1712 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1713 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1714 }
1715 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1716 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1717 }
1718
1719 // This block controls legalization of the mask vector sizes that are
1720 // available with AVX512. 512-bit vectors are in a separate block controlled
1721 // by useAVX512Regs.
1722 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1723 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1724 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1725 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1726 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1727 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1728
1732
1733 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1734 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1735 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1736 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1737 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1740 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1748
1749 // There is no byte-sized k-register load or store without AVX512DQ.
1750 if (!Subtarget.hasDQI()) {
1751 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1752 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1754 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1755
1756 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1757 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1758 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1759 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1760 }
1761
1762 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1763 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1767 }
1768
1769 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1771
1772 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1776
1783 }
1784
1785 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1787 }
1788 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1789 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1790 setOperationAction(ISD::LRINT, VT, Legal);
1791 setOperationAction(ISD::LLRINT, VT, Legal);
1792 }
1793 }
1794
1795 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1796 // elements. 512-bits can be disabled based on prefer-vector-width and
1797 // required-vector-width function attributes.
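// (For example, a function carrying the "prefer-vector-width"="256"
// attribute keeps to 256-bit registers here unless its required vector
// width exceeds 256 bits.)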
1798 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1799 bool HasBWI = Subtarget.hasBWI();
1800
1801 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1802 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1807 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1808
1809 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1810 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1811 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1812 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1814 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1815 if (HasBWI)
1816 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1817 }
1818
1819 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1820 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1821 setOperationAction(ISD::FMINIMUM, VT, Custom);
1822 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
1823 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
1824 setOperationAction(ISD::FNEG, VT, Custom);
1825 setOperationAction(ISD::FABS, VT, Custom);
1830 }
1831 setOperationAction(ISD::LRINT, MVT::v16f32,
1832 Subtarget.hasDQI() ? Legal : Custom);
1833 setOperationAction(ISD::LRINT, MVT::v8f64,
1834 Subtarget.hasDQI() ? Legal : Custom);
1835 if (Subtarget.hasDQI())
1836 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1837
1838 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1843 }
1844
1845 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1850 }
1851
1856 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1858
1870
1871 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1872 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1873 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1874 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1875 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1876 if (HasBWI)
1877 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1878
1879 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1880 // to 512-bit rather than use the AVX2 instructions so that we can use
1881 // k-masks.
1882 if (!Subtarget.hasVLX()) {
1883 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1884 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1885 setOperationAction(ISD::MLOAD, VT, Custom);
1886 setOperationAction(ISD::MSTORE, VT, Custom);
1887 }
1888 }
1889
1891 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1892 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1902
1903 if (HasBWI) {
1904 // Extends from v64i1 masks to 512-bit vectors.
1908 }
1909
1910 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1911 setOperationAction(ISD::FFLOOR, VT, Legal);
1913 setOperationAction(ISD::FCEIL, VT, Legal);
1915 setOperationAction(ISD::FTRUNC, VT, Legal);
1917 setOperationAction(ISD::FRINT, VT, Legal);
1919 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1921 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1923
1924 setOperationAction(ISD::FROUND, VT, Custom);
1925 }
1926
1927 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1930 }
1931
1932 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1933 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1934 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1935 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1936
1937 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1938 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1939 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1940 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1941
1942 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1943 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1944 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1945 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1947 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1948 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1949 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1950
1951 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1952 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1953
1954 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1964
1965 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1966 // setcc all the way to isel and prefer SETGT in some isel patterns.
1969 }
1970
1971 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1972 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1977
1978 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1985 }
1986
1987 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1988 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1989 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1991 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1992 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1993 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1994 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1999 }
2000
2001 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2002 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2003 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2004 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2005 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2006 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2007
2008 if (Subtarget.hasDQI() || Subtarget.hasFP16())
2012 setOperationAction(Opc, MVT::v8i64, Custom);
2013
2014 if (Subtarget.hasDQI())
2015 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2016
2017 if (Subtarget.hasCDI()) {
2018 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
2019 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2021 }
2022 } // Subtarget.hasCDI()
2023
2024 if (Subtarget.hasVPOPCNTDQ()) {
2025 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2027 }
2028
2029 // Extract subvector is special because the value type
2030 // (result) is 256-bit but the source is 512-bit wide.
2031 // 128-bit was made Legal under AVX1.
2032 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2033 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2035
2036 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2037 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2047 }
2048 setF16Action(MVT::v32f16, Expand);
2051 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2053 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2054 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2055 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2056
2057 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2058 setOperationAction(ISD::MLOAD, VT, Legal);
2059 setOperationAction(ISD::MSTORE, VT, Legal);
2060 setOperationAction(ISD::MGATHER, VT, Custom);
2061 setOperationAction(ISD::MSCATTER, VT, Custom);
2062 }
2063 if (HasBWI) {
2064 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2065 setOperationAction(ISD::MLOAD, VT, Legal);
2066 setOperationAction(ISD::MSTORE, VT, Legal);
2067 }
2068 } else {
2069 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2070 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2071 }
2072
2073 if (Subtarget.hasVBMI2()) {
2074 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2077 }
2078
2079 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2080 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2081 }
2082
2083 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2084 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2086 }// useAVX512Regs
2087
2088 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2089 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2090 MVT::v4i64}) {
2093 }
2094 }
2095
2096 // This block controls legalization for operations that don't have
2097 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2098 // narrower widths.
2099 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2100 // These operations are handled on non-VLX by artificially widening in
2101 // isel patterns.
2102
2106
2107 if (Subtarget.hasDQI()) {
2108 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2109 // v2f32 UINT_TO_FP is already custom under SSE2.
2112 "Unexpected operation action!");
2113 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2118 }
2119
2120 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2126 }
2127
2128 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2131 }
2132
2133 // Custom legalize 2x32 to get a little better code.
2134 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
2135 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2136
2137 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2138 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2139 setOperationAction(ISD::MSCATTER, VT, Custom);
2140
2141 if (Subtarget.hasDQI()) {
2145 setOperationAction(Opc, MVT::v2i64, Custom);
2146 setOperationAction(Opc, MVT::v4i64, Custom);
2147 }
2148 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2149 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2150 }
2151
2152 if (Subtarget.hasCDI()) {
2153 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2155 }
2156 } // Subtarget.hasCDI()
2157
2158 if (Subtarget.hasVPOPCNTDQ()) {
2159 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2161 }
2162
2163 // We can try to convert vectors to different sizes to leverage legal
2164 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2165 // then specialize to Legal below.
2166 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2167 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2168 MVT::v16i16, MVT::v8i8})
2170
2171 // Legal vpcompress depends on various AVX512 extensions.
2172 // Legal in AVX512F
2173 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2175
2176 // Legal in AVX512F + AVX512VL
2177 if (Subtarget.hasVLX())
2178 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2179 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2181
2182 // Legal in AVX512F + AVX512VBMI2
2183 if (Subtarget.hasVBMI2())
2184 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2186
2187 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2188 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2189 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2191 }
2192
2193 // This block controls legalization of v32i1/v64i1, which are available with
2194 // AVX512BW.
2195 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2196 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2197 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2198
2199 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2210 }
2211
2212 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2214
2215 // Extends from v32i1 masks to 256-bit vectors.
2219
2220 for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
2221 MVT::v16f16, MVT::v8f16}) {
2222 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2223 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2224 }
2225
2226 // These operations are handled on non-VLX by artificially widening in
2227 // isel patterns.
2228 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2229
2230 if (Subtarget.hasBITALG()) {
2231 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2233 }
2234 }
2235
2236 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2237 auto setGroup = [&] (MVT VT) {
2246 setOperationAction(ISD::FSQRT, VT, Legal);
2248
2249 setOperationAction(ISD::FFLOOR, VT, Legal);
2251 setOperationAction(ISD::FCEIL, VT, Legal);
2253 setOperationAction(ISD::FTRUNC, VT, Legal);
2255 setOperationAction(ISD::FRINT, VT, Legal);
2257 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2259 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
2261
2262 setOperationAction(ISD::FROUND, VT, Custom);
2263
2264 setOperationAction(ISD::LOAD, VT, Legal);
2265 setOperationAction(ISD::STORE, VT, Legal);
2266
2272
2273 setOperationAction(ISD::FNEG, VT, Custom);
2274 setOperationAction(ISD::FABS, VT, Custom);
2278
2282 };
2283
2284 // AVX512_FP16 scalar operations
2285 setGroup(MVT::f16);
2289 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2291 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2295 setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
2296 setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
2297 setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
2298 setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
2299 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2301 setOperationAction(ISD::LRINT, MVT::f16, Legal);
2302 setOperationAction(ISD::LLRINT, MVT::f16, Legal);
2303
2306
2307 if (Subtarget.useAVX512Regs()) {
2308 setGroup(MVT::v32f16);
2314 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2316 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
2318 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
2321
2326 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2328 MVT::v32i16);
2329 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2331 MVT::v32i16);
2332 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2334 MVT::v32i16);
2335 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2337 MVT::v32i16);
2338
2342
2343 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2344 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2345
2346 setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
2347 setOperationAction(ISD::FMAXIMUM, MVT::v32f16, Custom);
2348 setOperationAction(ISD::FMINIMUMNUM, MVT::v32f16, Custom);
2349 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32f16, Custom);
2350 setOperationAction(ISD::LRINT, MVT::v32f16, Legal);
2351 setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
2352 }
2353
2358
2359 if (Subtarget.hasVLX()) {
2360 setGroup(MVT::v8f16);
2361 setGroup(MVT::v16f16);
2362
2373
2376 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
2378 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
2380
2381 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2384
2388
2389 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2390 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2391 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2392 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2393
2394 // Need to custom widen these to prevent scalarization.
2395 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2396 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2397
2398 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
2399 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Custom);
2400 setOperationAction(ISD::FMINIMUMNUM, MVT::v8f16, Custom);
2401 setOperationAction(ISD::FMAXIMUMNUM, MVT::v8f16, Custom);
2402
2403 setOperationAction(ISD::FMINIMUM, MVT::v16f16, Custom);
2404 setOperationAction(ISD::FMAXIMUM, MVT::v16f16, Custom);
2405 setOperationAction(ISD::FMINIMUMNUM, MVT::v16f16, Custom);
2406 setOperationAction(ISD::FMAXIMUMNUM, MVT::v16f16, Custom);
2407 setOperationAction(ISD::LRINT, MVT::v8f16, Legal);
2408 setOperationAction(ISD::LRINT, MVT::v16f16, Legal);
2409 }
2410 }
2411
2412 if (!Subtarget.useSoftFloat() &&
2413 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2414 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2415 : &X86::VR128RegClass);
2416 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2417 : &X86::VR256RegClass);
2418 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2419 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2420 // Set the operation action Custom to do the customization later.
2423 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2424 setF16Action(VT, Expand);
2425 if (!Subtarget.hasBF16())
2431 }
2432 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2433 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2434 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2435 }
2436 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2437 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2439 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2440 }
2441
2442 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2443 Subtarget.useAVX512Regs()) {
2444 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2445 setF16Action(MVT::v32bf16, Expand);
2446 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2447 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2448 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2450 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2454 }
2455
2456 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2457 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2458 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2459 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2460 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2461 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2462 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2463 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2464 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2465 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2466 setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
2467 setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
2468 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2473 setOperationAction(ISD::FSQRT, VT, Legal);
2476 setOperationAction(ISD::FMINIMUM, VT, Custom);
2477 setOperationAction(ISD::FMAXIMUM, VT, Custom);
2478 setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
2479 setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
2480 }
2481 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2484 }
2485 }
2486
2487 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2488 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2489 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2490 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2491 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2492 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2493
2494 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2495 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2496 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2497 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2498 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2499
2500 if (Subtarget.hasBWI()) {
2501 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2502 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2503 }
2504
2505 if (Subtarget.hasFP16()) {
2506 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2515 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2524 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2529 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2530 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2532 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2534 }
2535 }
2536
2537 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2538 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2539 }
2540
2541 // We want to custom lower some of our intrinsics.
2545 if (!Subtarget.is64Bit()) {
2547 }
2548
2549 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2550 // handle type legalization for these operations here.
2551 //
2552 // FIXME: We really should do custom legalization for addition and
2553 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2554 // than generic legalization for 64-bit multiplication-with-overflow, though.
2555 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2556 if (VT == MVT::i64 && !Subtarget.is64Bit())
2557 continue;
2558 // Add/Sub/Mul with overflow operations are custom lowered.
2565
2566 // Support carry in as value rather than glue.
2572 }
2573
2574 // Combine sin / cos into _sincos_stret if it is available.
2575 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2576 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2577 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2578 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2579 }
2580
2581 if (Subtarget.isTargetWin64()) {
2582 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2583 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2584 setOperationAction(ISD::SREM, MVT::i128, Custom);
2585 setOperationAction(ISD::UREM, MVT::i128, Custom);
2594 }
2595
2596 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2597 // is. We should promote the value to 64 bits to solve this.
2598 // This is what the CRT headers do - `fmodf` is an inline header
2599 // function casting to f64 and calling `fmod`.
2600 if (Subtarget.is32Bit() &&
2601 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2602 // clang-format off
2603 for (ISD::NodeType Op :
2604 {ISD::FACOS, ISD::STRICT_FACOS,
2605 ISD::FASIN, ISD::STRICT_FASIN,
2606 ISD::FATAN, ISD::STRICT_FATAN,
2607 ISD::FATAN2, ISD::STRICT_FATAN2,
2608 ISD::FCEIL, ISD::STRICT_FCEIL,
2609 ISD::FCOS, ISD::STRICT_FCOS,
2610 ISD::FCOSH, ISD::STRICT_FCOSH,
2611 ISD::FEXP, ISD::STRICT_FEXP,
2612 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2614 ISD::FLOG, ISD::STRICT_FLOG,
2615 ISD::FLOG10, ISD::STRICT_FLOG10,
2616 ISD::FPOW, ISD::STRICT_FPOW,
2617 ISD::FSIN, ISD::STRICT_FSIN,
2618 ISD::FSINH, ISD::STRICT_FSINH,
2619 ISD::FTAN, ISD::STRICT_FTAN,
2620 ISD::FTANH, ISD::STRICT_FTANH,
2621 // TODO: Add ISD::STRICT_FMODF too once implemented.
2622 ISD::FMODF})
2623 if (isOperationExpand(Op, MVT::f32))
2624 setOperationAction(Op, MVT::f32, Promote);
2625 // clang-format on
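// For example, with the promotion above a 32-bit MSVC call to acosf(x) is
// promoted to f64 and lowered via the double-precision acos libcall,
// mirroring the fmodf strategy described above.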
2626
2627 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2628 // it, but it's just a wrapper around ldexp.
2629 if (Subtarget.isOSWindows()) {
2630 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2631 if (isOperationExpand(Op, MVT::f32))
2632 setOperationAction(Op, MVT::f32, Promote);
2633 }
2634
2635 // We have target-specific DAG combine patterns for the following nodes:
2643 ISD::BITCAST,
2646 ISD::SHL,
2647 ISD::SRA,
2648 ISD::SRL,
2649 ISD::OR,
2650 ISD::AND,
2656 ISD::ADD,
2657 ISD::FADD,
2658 ISD::FSUB,
2659 ISD::FNEG,
2660 ISD::FMA,
2662 ISD::FMINNUM,
2663 ISD::FMAXNUM,
2664 ISD::SUB,
2665 ISD::LOAD,
2666 ISD::LRINT,
2667 ISD::LLRINT,
2668 ISD::MLOAD,
2669 ISD::STORE,
2670 ISD::MSTORE,
2686 ISD::SETCC,
2687 ISD::MUL,
2688 ISD::XOR,
2689 ISD::MSCATTER,
2690 ISD::MGATHER,
2691 ISD::FP16_TO_FP,
2692 ISD::FP_EXTEND,
2699
2700 computeRegisterProperties(Subtarget.getRegisterInfo());
2701
2702 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2704 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2706 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2708
2709 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2710 // that needs to be benchmarked and balanced with the potential use of vector
2711 // load/store types (PR33329, PR33914).
2714
2715 // Default loop alignment, which can be overridden by -align-loops.
2717
2718 // An out-of-order CPU can speculatively execute past a predictable branch,
2719 // but a conditional move could be stalled by an expensive earlier operation.
2720 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2721 EnableExtLdPromotion = true;
2723
2725
2726 // Default to having -disable-strictnode-mutation on
2727 IsStrictFPEnabled = true;
2728}
2729
2730// This has so far only been implemented for 64-bit MachO.
2732 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2733}
2734
2736 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2737 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2738}
2739
2741 const SDLoc &DL) const {
2742 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2743 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2744 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2745 return SDValue(Node, 0);
2746}
2747
2750 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2751 !Subtarget.hasBWI())
2752 return TypeSplitVector;
2753
2754 // Since v8f16 is legal, widen anything over v4f16.
2755 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2756 VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() &&
2757 VT.getVectorElementType() == MVT::f16)
2758 return TypeSplitVector;
2759
2760 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2761 VT.getVectorElementType() != MVT::i1)
2762 return TypeWidenVector;
2763
2765}
2766
2767FastISel *
2769 const TargetLibraryInfo *libInfo) const {
2770 return X86::createFastISel(funcInfo, libInfo);
2771}
2772
2773//===----------------------------------------------------------------------===//
2774// Other Lowering Hooks
2775//===----------------------------------------------------------------------===//
2776
2778 bool AssumeSingleUse) {
2779 if (!AssumeSingleUse && !Op.hasOneUse())
2780 return false;
2781 if (!ISD::isNormalLoad(Op.getNode()))
2782 return false;
2783
2784 // If this is an unaligned vector, make sure the target supports folding it.
2785 auto *Ld = cast<LoadSDNode>(Op.getNode());
2786 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2787 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2788 return false;
2789
2790 // TODO: If this is a non-temporal load and the target has an instruction
2791 // for it, it should not be folded. See "useNonTemporalLoad()".
2792
2793 return true;
2794}
2795
2797 const X86Subtarget &Subtarget,
2798 bool AssumeSingleUse) {
2799 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2800 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2801 return false;
2802
2803 // We cannot replace a wide volatile load with a broadcast-from-memory,
2804 // because that would narrow the load, which isn't legal for volatiles.
2805 auto *Ld = cast<LoadSDNode>(Op.getNode());
2806 return !Ld->isVolatile() ||
2807 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2808}
2809
2811 if (!Op.hasOneUse())
2812 return false;
2813 // Peek through one-use bitcast users.
2814 SDNode *User = *Op->user_begin();
2815 while (User->getOpcode() == ISD::BITCAST) {
2816 if (!User->hasOneUse())
2817 return false;
2818 User = *User->user_begin();
2819 }
2820 return ISD::isNormalStore(User);
2821}
2822
2824 if (Op.hasOneUse()) {
2825 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2826 return (ISD::ZERO_EXTEND == Opcode);
2827 }
2828 return false;
2829}
2830
2831static bool isLogicOp(unsigned Opcode) {
2832 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2833 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2834}
2835
2836static bool isTargetShuffle(unsigned Opcode) {
2837 switch(Opcode) {
2838 default: return false;
2839 case X86ISD::BLENDI:
2840 case X86ISD::PSHUFB:
2841 case X86ISD::PSHUFD:
2842 case X86ISD::PSHUFHW:
2843 case X86ISD::PSHUFLW:
2844 case X86ISD::SHUFP:
2845 case X86ISD::INSERTPS:
2846 case X86ISD::EXTRQI:
2847 case X86ISD::INSERTQI:
2848 case X86ISD::VALIGN:
2849 case X86ISD::PALIGNR:
2850 case X86ISD::VSHLDQ:
2851 case X86ISD::VSRLDQ:
2852 case X86ISD::MOVLHPS:
2853 case X86ISD::MOVHLPS:
2854 case X86ISD::MOVSHDUP:
2855 case X86ISD::MOVSLDUP:
2856 case X86ISD::MOVDDUP:
2857 case X86ISD::MOVSS:
2858 case X86ISD::MOVSD:
2859 case X86ISD::MOVSH:
2860 case X86ISD::UNPCKL:
2861 case X86ISD::UNPCKH:
2862 case X86ISD::VBROADCAST:
2863 case X86ISD::VPERMILPI:
2864 case X86ISD::VPERMILPV:
2865 case X86ISD::VPERM2X128:
2866 case X86ISD::SHUF128:
2867 case X86ISD::VPERMIL2:
2868 case X86ISD::VPERMI:
2869 case X86ISD::VPPERM:
2870 case X86ISD::VPERMV:
2871 case X86ISD::VPERMV3:
2872 case X86ISD::VZEXT_MOVL:
2873 return true;
2874 }
2875}
2876
2877static bool isTargetShuffleVariableMask(unsigned Opcode) {
2878 switch (Opcode) {
2879 default: return false;
2880 // Target Shuffles.
2881 case X86ISD::PSHUFB:
2882 case X86ISD::VPERMILPV:
2883 case X86ISD::VPERMIL2:
2884 case X86ISD::VPPERM:
2885 case X86ISD::VPERMV:
2886 case X86ISD::VPERMV3:
2887 return true;
2888 // 'Faux' Target Shuffles.
2889 case ISD::OR:
2890 case ISD::AND:
2891 case X86ISD::ANDNP:
2892 return true;
2893 }
2894}
2895
2898 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2900 int ReturnAddrIndex = FuncInfo->getRAIndex();
2901
2902 if (ReturnAddrIndex == 0) {
2903 // Set up a frame object for the return address.
2904 unsigned SlotSize = RegInfo->getSlotSize();
2905 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2906 -(int64_t)SlotSize,
2907 false);
2908 FuncInfo->setRAIndex(ReturnAddrIndex);
2909 }
2910
2911 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2912}
2913
2915 bool HasSymbolicDisplacement) {
2916 // The offset should fit into a 32-bit immediate field.
2917 if (!isInt<32>(Offset))
2918 return false;
2919
2920 // If we don't have a symbolic displacement, we don't have any extra
2921 // restrictions.
2922 if (!HasSymbolicDisplacement)
2923 return true;
2924
2925 // We can fold large offsets in the large code model because we always use
2926 // 64-bit offsets.
2927 if (CM == CodeModel::Large)
2928 return true;
2929
2930 // For the kernel code model we know that all objects reside in the negative
2931 // half of the 32-bit address space. We must not accept negative offsets, since
2932 // they may be just out of range, but we may accept pretty large positive ones.
2933 if (CM == CodeModel::Kernel)
2934 return Offset >= 0;
2935
2936 // For other non-large code models we assume that the last small object lies
2937 // at least 16MB below the 31-bit boundary. We may also accept pretty large
2938 // negative constants knowing that all objects are in the positive half of
2939 // the address space.
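// For example, with a symbolic displacement under these code models an offset
// of 8MiB is accepted below, while 64MiB is rejected.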
2940 return Offset < 16 * 1024 * 1024;
2941}
2942
2943 /// Return true if the condition is a signed comparison operation.
2944static bool isX86CCSigned(X86::CondCode X86CC) {
2945 switch (X86CC) {
2946 default:
2947 llvm_unreachable("Invalid integer condition!");
2948 case X86::COND_E:
2949 case X86::COND_NE:
2950 case X86::COND_B:
2951 case X86::COND_A:
2952 case X86::COND_BE:
2953 case X86::COND_AE:
2954 return false;
2955 case X86::COND_G:
2956 case X86::COND_GE:
2957 case X86::COND_L:
2958 case X86::COND_LE:
2959 return true;
2960 }
2961}
2962
2964 switch (SetCCOpcode) {
2965 // clang-format off
2966 default: llvm_unreachable("Invalid integer condition!");
2967 case ISD::SETEQ: return X86::COND_E;
2968 case ISD::SETGT: return X86::COND_G;
2969 case ISD::SETGE: return X86::COND_GE;
2970 case ISD::SETLT: return X86::COND_L;
2971 case ISD::SETLE: return X86::COND_LE;
2972 case ISD::SETNE: return X86::COND_NE;
2973 case ISD::SETULT: return X86::COND_B;
2974 case ISD::SETUGT: return X86::COND_A;
2975 case ISD::SETULE: return X86::COND_BE;
2976 case ISD::SETUGE: return X86::COND_AE;
2977 // clang-format on
2978 }
2979}
2980
2981 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2982/// condition code, returning the condition code and the LHS/RHS of the
2983/// comparison to make.
2985 bool isFP, SDValue &LHS, SDValue &RHS,
2986 SelectionDAG &DAG) {
2987 if (!isFP) {
2989 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2990 // X > -1 -> X == 0, jump !sign.
2991 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2992 return X86::COND_NS;
2993 }
2994 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2995 // X < 0 -> X == 0, jump on sign.
2996 return X86::COND_S;
2997 }
2998 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2999 // X >= 0 -> X == 0, jump on !sign.
3000 return X86::COND_NS;
3001 }
3002 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3003 // X < 1 -> X <= 0
3004 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3005 return X86::COND_LE;
3006 }
3007 }
3008
3009 return TranslateIntegerX86CC(SetCCOpcode);
3010 }
3011
3012 // First determine if it is required or is profitable to flip the operands.
3013
3014 // If LHS is a foldable load, but RHS is not, flip the condition.
3015 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3016 !ISD::isNON_EXTLoad(RHS.getNode())) {
3017 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3018 std::swap(LHS, RHS);
3019 }
3020
3021 switch (SetCCOpcode) {
3022 default: break;
3023 case ISD::SETOLT:
3024 case ISD::SETOLE:
3025 case ISD::SETUGT:
3026 case ISD::SETUGE:
3027 std::swap(LHS, RHS);
3028 break;
3029 }
3030
3031 // On a floating point condition, the flags are set as follows:
3032 // ZF PF CF op
3033 // 0 | 0 | 0 | X > Y
3034 // 0 | 0 | 1 | X < Y
3035 // 1 | 0 | 0 | X == Y
3036 // 1 | 1 | 1 | unordered
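// Worked example: SETOLT(a, b) has its operands swapped above and returns
// COND_A; the compare then sets flags for (b, a), and the ZF=0/CF=0 row of
// the table holds exactly when a < b with both operands ordered.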
3037 switch (SetCCOpcode) {
3038 // clang-format off
3039 default: llvm_unreachable("Condcode should be pre-legalized away");
3040 case ISD::SETUEQ:
3041 case ISD::SETEQ: return X86::COND_E;
3042 case ISD::SETOLT: // flipped
3043 case ISD::SETOGT:
3044 case ISD::SETGT: return X86::COND_A;
3045 case ISD::SETOLE: // flipped
3046 case ISD::SETOGE:
3047 case ISD::SETGE: return X86::COND_AE;
3048 case ISD::SETUGT: // flipped
3049 case ISD::SETULT:
3050 case ISD::SETLT: return X86::COND_B;
3051 case ISD::SETUGE: // flipped
3052 case ISD::SETULE:
3053 case ISD::SETLE: return X86::COND_BE;
3054 case ISD::SETONE:
3055 case ISD::SETNE: return X86::COND_NE;
3056 case ISD::SETUO: return X86::COND_P;
3057 case ISD::SETO: return X86::COND_NP;
3058 case ISD::SETOEQ:
3059 case ISD::SETUNE: return X86::COND_INVALID;
3060 // clang-format on
3061 }
3062}
3063
3064/// Is there a floating point cmov for the specific X86 condition code?
3065/// Current x86 isa includes the following FP cmov instructions:
3066 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3067static bool hasFPCMov(unsigned X86CC) {
3068 switch (X86CC) {
3069 default:
3070 return false;
3071 case X86::COND_B:
3072 case X86::COND_BE:
3073 case X86::COND_E:
3074 case X86::COND_P:
3075 case X86::COND_A:
3076 case X86::COND_AE:
3077 case X86::COND_NE:
3078 case X86::COND_NP:
3079 return true;
3080 }
3081}
3082
3083static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3084 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3085 VT.is512BitVector();
3086}
3087
3089 const CallInst &I,
3090 MachineFunction &MF,
3091 unsigned Intrinsic) const {
3092 Info.flags = MachineMemOperand::MONone;
3093 Info.offset = 0;
3094
3096 if (!IntrData) {
3097 switch (Intrinsic) {
3098 case Intrinsic::x86_aesenc128kl:
3099 case Intrinsic::x86_aesdec128kl:
3100 Info.opc = ISD::INTRINSIC_W_CHAIN;
3101 Info.ptrVal = I.getArgOperand(1);
3102 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3103 Info.align = Align(1);
3104 Info.flags |= MachineMemOperand::MOLoad;
3105 return true;
3106 case Intrinsic::x86_aesenc256kl:
3107 case Intrinsic::x86_aesdec256kl:
3108 Info.opc = ISD::INTRINSIC_W_CHAIN;
3109 Info.ptrVal = I.getArgOperand(1);
3110 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3111 Info.align = Align(1);
3112 Info.flags |= MachineMemOperand::MOLoad;
3113 return true;
3114 case Intrinsic::x86_aesencwide128kl:
3115 case Intrinsic::x86_aesdecwide128kl:
3116 Info.opc = ISD::INTRINSIC_W_CHAIN;
3117 Info.ptrVal = I.getArgOperand(0);
3118 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3119 Info.align = Align(1);
3120 Info.flags |= MachineMemOperand::MOLoad;
3121 return true;
3122 case Intrinsic::x86_aesencwide256kl:
3123 case Intrinsic::x86_aesdecwide256kl:
3124 Info.opc = ISD::INTRINSIC_W_CHAIN;
3125 Info.ptrVal = I.getArgOperand(0);
3126 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3127 Info.align = Align(1);
3128 Info.flags |= MachineMemOperand::MOLoad;
3129 return true;
3130 case Intrinsic::x86_cmpccxadd32:
3131 case Intrinsic::x86_cmpccxadd64:
3132 case Intrinsic::x86_atomic_bts:
3133 case Intrinsic::x86_atomic_btc:
3134 case Intrinsic::x86_atomic_btr: {
3135 Info.opc = ISD::INTRINSIC_W_CHAIN;
3136 Info.ptrVal = I.getArgOperand(0);
3137 unsigned Size = I.getType()->getScalarSizeInBits();
3138 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3139 Info.align = Align(Size);
3142 return true;
3143 }
3144 case Intrinsic::x86_atomic_bts_rm:
3145 case Intrinsic::x86_atomic_btc_rm:
3146 case Intrinsic::x86_atomic_btr_rm: {
3147 Info.opc = ISD::INTRINSIC_W_CHAIN;
3148 Info.ptrVal = I.getArgOperand(0);
3149 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3150 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3151 Info.align = Align(Size);
3154 return true;
3155 }
3156 case Intrinsic::x86_aadd32:
3157 case Intrinsic::x86_aadd64:
3158 case Intrinsic::x86_aand32:
3159 case Intrinsic::x86_aand64:
3160 case Intrinsic::x86_aor32:
3161 case Intrinsic::x86_aor64:
3162 case Intrinsic::x86_axor32:
3163 case Intrinsic::x86_axor64:
3164 case Intrinsic::x86_atomic_add_cc:
3165 case Intrinsic::x86_atomic_sub_cc:
3166 case Intrinsic::x86_atomic_or_cc:
3167 case Intrinsic::x86_atomic_and_cc:
3168 case Intrinsic::x86_atomic_xor_cc: {
3169 Info.opc = ISD::INTRINSIC_W_CHAIN;
3170 Info.ptrVal = I.getArgOperand(0);
3171 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3172 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3173 Info.align = Align(Size);
3176 return true;
3177 }
3178 }
3179 return false;
3180 }
3181
3182 switch (IntrData->Type) {
3185 case TRUNCATE_TO_MEM_VI32: {
3186 Info.opc = ISD::INTRINSIC_VOID;
3187 Info.ptrVal = I.getArgOperand(0);
3188 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3190 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3191 ScalarVT = MVT::i8;
3192 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3193 ScalarVT = MVT::i16;
3194 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3195 ScalarVT = MVT::i32;
3196
3197 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3198 Info.align = Align(1);
3199 Info.flags |= MachineMemOperand::MOStore;
3200 break;
3201 }
3202 case GATHER:
3203 case GATHER_AVX2: {
3204 Info.opc = ISD::INTRINSIC_W_CHAIN;
3205 Info.ptrVal = nullptr;
3206 MVT DataVT = MVT::getVT(I.getType());
3207 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3208 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3209 IndexVT.getVectorNumElements());
3210 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3211 Info.align = Align(1);
3212 Info.flags |= MachineMemOperand::MOLoad;
3213 break;
3214 }
3215 case SCATTER: {
3216 Info.opc = ISD::INTRINSIC_VOID;
3217 Info.ptrVal = nullptr;
3218 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3219 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3220 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3221 IndexVT.getVectorNumElements());
3222 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3223 Info.align = Align(1);
3224 Info.flags |= MachineMemOperand::MOStore;
3225 break;
3226 }
3227 default:
3228 return false;
3229 }
3230
3231 return true;
3232}
3233
3234/// Returns true if the target can instruction select the
3235/// specified FP immediate natively. If false, the legalizer will
3236/// materialize the FP immediate as a load from a constant pool.
3238 bool ForCodeSize) const {
3239 for (const APFloat &FPImm : LegalFPImmediates)
3240 if (Imm.bitwiseIsEqual(FPImm))
3241 return true;
3242 return false;
3243}
3244
3246 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
3247 std::optional<unsigned> ByteOffset) const {
3248 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3249
3250 auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
3251 while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
3252 N = *N->user_begin();
3253 return N;
3254 };
3255
3256 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3257 // relocation targets a movq or addq instruction: don't let the load shrink.
3258 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3259 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3260 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3261 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3262
3263 // If this is (1) an AVX vector load with (2) multiple uses and either (3)
3264 // all of those uses are extracted directly into a store, so the extract +
3265 // store can be store-folded, or (4) some use is a legal full-width
3266 // instruction, then it's probably not worth splitting the load.
3267 EVT VT = Load->getValueType(0);
3268 if ((VT.is256BitVector() || VT.is512BitVector()) &&
3269 !SDValue(Load, 0).hasOneUse()) {
3270 bool FullWidthUse = false;
3271 bool AllExtractStores = true;
3272 for (SDUse &Use : Load->uses()) {
3273 // Skip uses of the chain value. Result 0 of the node is the load value.
3274 if (Use.getResNo() != 0)
3275 continue;
3276
3277 const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
3278
3279 // If this use is an extract + store, it's probably not worth splitting.
3280 if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
3281 all_of(User->uses(), [&](const SDUse &U) {
3282 const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
3283 return Inner->getOpcode() == ISD::STORE;
3284 }))
3285 continue;
3286
3287 AllExtractStores = false;
3288
3289 // If any use is a full width legal/target bin op, then assume its legal
3290 // and won't split.
3291 if (isBinOp(User->getOpcode()) &&
3292 (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
3293 User->getOpcode() > ISD::BUILTIN_OP_END))
3294 FullWidthUse = true;
3295 }
3296
3297 if (AllExtractStores)
3298 return false;
3299
3300 // If we have a user that uses the full vector width, then the load is
3301 // only worth splitting if the offset isn't 0 (to avoid an
3302 // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
3303 if (FullWidthUse)
3304 return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
3305 }
3306
3307 return true;
3308}
3309
3310/// Returns true if it is beneficial to convert a load of a constant
3311/// to just the constant itself.
3313 Type *Ty) const {
3314 assert(Ty->isIntegerTy());
3315
3316 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3317 if (BitSize == 0 || BitSize > 64)
3318 return false;
3319 return true;
3320}
3321
3323 // If we are using XMM registers in the ABI and the condition of the select is
3324 // a floating-point compare and we have blendv or conditional move, then it is
3325 // cheaper to select instead of doing a cross-register move and creating a
3326 // load that depends on the compare result.
3327 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3328 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3329}
3330
3332 // TODO: It might be a win to ease or lift this restriction, but the generic
3333 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3334 if (VT.isVector() && Subtarget.hasAVX512())
3335 return false;
3336
3337 return true;
3338}
3339
3341 SDValue C) const {
3342 // TODO: We handle scalars using custom code, but generic combining could make
3343 // that unnecessary.
3344 APInt MulC;
3345 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3346 return false;
3347
3348 // Find the type this will be legalized to. Otherwise we might prematurely
3349 // convert this to shl+add/sub and then still have to type legalize those ops.
3350 // Another choice would be to defer the decision for illegal types until
3351 // after type legalization. But constant splat vectors of i64 can't make it
3352 // through type legalization on 32-bit targets so we would need to special
3353 // case vXi64.
3354 while (getTypeAction(Context, VT) != TypeLegal)
3355 VT = getTypeToTransformTo(Context, VT);
3356
3357 // If vector multiply is legal, assume that's faster than shl + add/sub.
3358 // Multiply is a complex op with higher latency and lower throughput in
3359 // most implementations, sub-vXi32 vector multiplies are always fast,
3360 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
3361 // is always going to be slow.
3362 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3363 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3364 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3365 return false;
3366
3367 // shl+add, shl+sub, shl+add+neg
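// e.g. x*9 -> (x << 3) + x, x*7 -> (x << 3) - x, x*-3 -> -((x << 1) + x).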
3368 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3369 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3370}
3371
3373 unsigned Index) const {
3375 return false;
3376
3377 // Mask vectors support all subregister combinations and operations that
3378 // extract half of a vector.
3379 if (ResVT.getVectorElementType() == MVT::i1)
3380 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3381 (Index == ResVT.getVectorNumElements()));
3382
3383 return (Index % ResVT.getVectorNumElements()) == 0;
3384}
3385
3387 unsigned Opc = VecOp.getOpcode();
3388
3389 // Assume target opcodes can't be scalarized.
3390 // TODO - do we have any exceptions?
3391 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3392 return false;
3393
3394 // If the vector op is not supported, try to convert to scalar.
3395 EVT VecVT = VecOp.getValueType();
3397 return true;
3398
3399 // If the vector op is supported, but the scalar op is not, the transform may
3400 // not be worthwhile.
3401 EVT ScalarVT = VecVT.getScalarType();
3402 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3403}
3404
3406 bool) const {
3407 // TODO: Allow vectors?
3408 if (VT.isVector())
3409 return false;
3410 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3411}
3412
3414 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3415 // i32/i64 or can rely on BSF passthrough value.
3416 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3417 Subtarget.hasBitScanPassThrough() ||
3418 (!Ty->isVectorTy() &&
3419 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3420}
3421
3423 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3424 // passthrough value.
3425 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3426 Subtarget.hasBitScanPassThrough();
3427}
3428
3430 // Don't shrink the FP constant pool if SSE2 is available since cvtss2sd is
3431 // more expensive than a straight movsd. On the other hand, it's important to
3432 // shrink long double FP constants since fldt is very slow.
3433 return !Subtarget.hasSSE2() || VT == MVT::f80;
3434}
3435
3437 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3438 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3439}
3440
3442 const SelectionDAG &DAG,
3443 const MachineMemOperand &MMO) const {
3444 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3445 BitcastVT.getVectorElementType() == MVT::i1)
3446 return false;
3447
3448 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3449 return false;
3450
3451 // If both types are legal vectors, it's always ok to convert them.
3452 if (LoadVT.isVector() && BitcastVT.isVector() &&
3453 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3454 return true;
3455
3456 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3457}
3458
3460 const MachineFunction &MF) const {
3461 // Do not merge to float value size (128 bits) if no implicit
3462 // float attribute is set.
3463 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3464
3465 if (NoFloat) {
3466 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3467 return (MemVT.getSizeInBits() <= MaxIntSize);
3468 }
3469 // Make sure we don't merge greater than our preferred vector
3470 // width.
3471 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3472 return false;
3473
3474 return true;
3475}
3476
3478 return Subtarget.hasFastLZCNT();
3479}
3480
3482 const Instruction &AndI) const {
3483 return true;
3484}
3485
3487 EVT VT = Y.getValueType();
3488
3489 if (VT.isVector())
3490 return false;
3491
3492 if (!Subtarget.hasBMI())
3493 return false;
3494
3495 // There are only 32-bit and 64-bit forms for 'andn'.
3496 if (VT != MVT::i32 && VT != MVT::i64)
3497 return false;
3498
3499 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3500}
3501
3503 EVT VT = Y.getValueType();
3504
3505 if (!VT.isVector())
3506 return hasAndNotCompare(Y);
3507
3508 // Vector.
3509
3510 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3511 return false;
3512
3513 if (VT == MVT::v4i32)
3514 return true;
3515
3516 return Subtarget.hasSSE2();
3517}
3518
3520 return X.getValueType().isScalarInteger(); // 'bt'
3521}
3522
3526 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3527 SelectionDAG &DAG) const {
3528 // Does baseline recommend not to perform the fold by default?
3530 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3531 return false;
3532 // For scalars this transform is always beneficial.
3533 if (X.getValueType().isScalarInteger())
3534 return true;
3535 // If all the shift amounts are identical, then the transform is beneficial
3536 // even with rudimentary SSE2 shifts.
3537 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3538 return true;
3539 // If we have AVX2 with its powerful shift operations, then it's also good.
3540 if (Subtarget.hasAVX2())
3541 return true;
3542 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3543 return NewShiftOpcode == ISD::SHL;
3544}
3545
3547 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3548 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3549 if (!VT.isInteger())
3550 return ShiftOpc;
3551
3552 bool PreferRotate = false;
3553 if (VT.isVector()) {
3554 // For vectors, if we have rotate instruction support, then it's definitely
3555 // best. Otherwise it's not clear what's best, so just don't make changes.
3556 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3557 VT.getScalarType() == MVT::i64);
3558 } else {
3559 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3560 // rotate unless we have a zext mask+shr.
3561 PreferRotate = Subtarget.hasBMI2();
3562 if (!PreferRotate) {
3563 unsigned MaskBits =
3564 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3565 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3566 }
3567 }
3568
3569 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3570 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3571
3572 if (PreferRotate && MayTransformRotate)
3573 return ISD::ROTL;
3574
3575 // For vectors we don't really get much benefit from swapping constants
3576 // around. Maybe in the future we could check if the DAG already has the
3577 // flipped node.
3578 if (VT.isVector())
3579 return ShiftOpc;
3580
3581 // See if it's beneficial to swap the shift type.
3582 if (ShiftOpc == ISD::SHL) {
3583 // If the current setup has an imm64 mask, then the inverse will have
3584 // at least an imm32 mask (or be a zext i32 -> i64).
3585 if (VT == MVT::i64)
3586 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3587 : ShiftOpc;
3588
3589 // We can only benefit if the mask requires at least 7 bits. We
3590 // don't want to replace shl by 1, 2 or 3 as those can be implemented
3591 // with lea/add.
3592 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3593 }
3594
3595 if (VT == MVT::i64)
3596 // Keep an exactly 32-bit imm64 mask; this is a zext i32 -> i64, which is
3597 // extremely efficient.
3598 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3599
3600 // Keep small shifts as shl so we can generate add/lea.
3601 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3602 }
3603
3604 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3605 // (PreferRotate will be set in the latter case).
3606 if (PreferRotate || !MayTransformRotate || VT.isVector())
3607 return ShiftOpc;
3608
3609 // Non-vector type and we have a zext mask with SRL.
3610 return ISD::SRL;
3611}
3612
3615 const Value *Lhs,
3616 const Value *Rhs) const {
3617 using namespace llvm::PatternMatch;
3618 int BaseCost = BrMergingBaseCostThresh.getValue();
3619 // With CCMP, branches can be merged in a more efficient way.
3620 if (BaseCost >= 0 && Subtarget.hasCCMP())
3621 BaseCost += BrMergingCcmpBias;
3622 // a == b && a == c is a fast pattern on x86.
3623 if (BaseCost >= 0 && Opc == Instruction::And &&
3626 BaseCost += 1;
3627
3628 // For OR conditions with EQ comparisons, prefer splitting into branches
3629 // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops,
3630 // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed
3631 // comparisons (SLT, SGT) that can be optimized.
3632 if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
3635 return {-1, -1, -1};
3636
3637 return {BaseCost, BrMergingLikelyBias.getValue(),
3638 BrMergingUnlikelyBias.getValue()};
3639}
3640
3642 return N->getOpcode() != ISD::FP_EXTEND;
3643}
3644
3646 const SDNode *N) const {
3647 assert(((N->getOpcode() == ISD::SHL &&
3648 N->getOperand(0).getOpcode() == ISD::SRL) ||
3649 (N->getOpcode() == ISD::SRL &&
3650 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3651 "Expected shift-shift mask");
3652 // TODO: Should we always create i64 masks? Or only folded immediates?
3653 EVT VT = N->getValueType(0);
3654 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3655 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3656 // Only fold if the shift values are equal - so it folds to AND.
3657 // TODO - we should fold if either is a non-uniform vector but we don't do
3658 // the fold for non-splats yet.
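// e.g. for i32, (srl (shl x, 8), 8) can instead be folded to
// (and x, 0x00FFFFFF) when the subtarget has fast shift masks.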
3659 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3660 }
3662}
3663
3665 EVT VT = Y.getValueType();
3666
3667 // For vectors, we don't have a preference, but we probably want a mask.
3668 if (VT.isVector())
3669 return false;
3670
3671 unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
3672 return VT.getScalarSizeInBits() <= MaxWidth;
3673}
3674
3677 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3679 !Subtarget.isOSWindows())
3682 ExpansionFactor);
3683}
3684
3686 // Any legal vector type can be splatted more efficiently than
3687 // loading/spilling from memory.
3688 return isTypeLegal(VT);
3689}
3690
3692 MVT VT = MVT::getIntegerVT(NumBits);
3693 if (isTypeLegal(VT))
3694 return VT;
3695
3696 // PMOVMSKB can handle this.
3697 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3698 return MVT::v16i8;
3699
3700 // VPMOVMSKB can handle this.
3701 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3702 return MVT::v32i8;
3703
3704 // TODO: Allow 64-bit type for 32-bit target.
3705 // TODO: 512-bit types should be allowed, but make sure that those
3706 // cases are handled in combineVectorSizedSetCCEquality().
3707
3709}
3710
3711/// Val is the undef sentinel value or equal to the specified value.
3712static bool isUndefOrEqual(int Val, int CmpVal) {
3713 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3714}
3715
3716/// Return true if every element in Mask is the undef sentinel value or equal to
3717/// the specified value.
3718static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3719 return llvm::all_of(Mask, [CmpVal](int M) {
3720 return (M == SM_SentinelUndef) || (M == CmpVal);
3721 });
3722}
3723
3724/// Return true if every element in Mask, beginning from position Pos and ending
3725/// in Pos+Size is the undef sentinel value or equal to the specified value.
3726static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3727 unsigned Size) {
3728 return llvm::all_of(Mask.slice(Pos, Size),
3729 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3730}
3731
3732/// Val is either the undef or zero sentinel value.
3733static bool isUndefOrZero(int Val) {
3734 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3735}
3736
3737/// Return true if every element in Mask, beginning from position Pos and ending
3738/// in Pos+Size is the undef sentinel value.
3739static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3740 return llvm::all_of(Mask.slice(Pos, Size),
3741 [](int M) { return M == SM_SentinelUndef; });
3742}
3743
3744/// Return true if the mask creates a vector whose lower half is undefined.
3746 unsigned NumElts = Mask.size();
3747 return isUndefInRange(Mask, 0, NumElts / 2);
3748}
3749
3750/// Return true if the mask creates a vector whose upper half is undefined.
3752 unsigned NumElts = Mask.size();
3753 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3754}
3755
3756 /// Return true if Val falls within the specified range [Low, Hi).
3757static bool isInRange(int Val, int Low, int Hi) {
3758 return (Val >= Low && Val < Hi);
3759}
3760
3761/// Return true if the value of any element in Mask falls within the specified
3762 /// range [Low, Hi).
3763static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3764 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3765}
3766
3767/// Return true if the value of any element in Mask is the zero sentinel value.
3768static bool isAnyZero(ArrayRef<int> Mask) {
3769 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3770}
3771
3772/// Return true if Val is undef or if its value falls within the
3773 /// specified range [Low, Hi).
3774static bool isUndefOrInRange(int Val, int Low, int Hi) {
3775 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3776}
3777
3778/// Return true if every element in Mask is undef or if its value
3779 /// falls within the specified range [Low, Hi).
3780static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3781 return llvm::all_of(
3782 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3783}
3784
3785/// Return true if Val is undef, zero or if its value falls within the
3786 /// specified range [Low, Hi).
3787static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3788 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3789}
3790
3791/// Return true if every element in Mask is undef, zero or if its value
3792 /// falls within the specified range [Low, Hi).
3793static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3794 return llvm::all_of(
3795 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3796}
3797
3798 /// Return true if every element in Mask is an in-place blend/select mask or is
3799/// undef.
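/// e.g. with 4 elements, {0, 5, 2, 7} is a blend (lane I comes from lane I of
/// either input), while {1, 5, 2, 7} is not.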
3800[[maybe_unused]] static bool isBlendOrUndef(ArrayRef<int> Mask) {
3801 unsigned NumElts = Mask.size();
3802 for (auto [I, M] : enumerate(Mask))
3803 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3804 return false;
3805 return true;
3806}
3807
3808/// Return true if every element in Mask, beginning
3809/// from position Pos and ending in Pos + Size, falls within the specified
3810/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3811static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3812 unsigned Size, int Low, int Step = 1) {
3813 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3814 if (!isUndefOrEqual(Mask[i], Low))
3815 return false;
3816 return true;
3817}
3818
3819/// Return true if every element in Mask, beginning
3820/// from position Pos and ending in Pos+Size, falls within the specified
3821 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or zero.
3823 unsigned Size, int Low,
3824 int Step = 1) {
3825 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3826 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3827 return false;
3828 return true;
3829}
3830
3831/// Return true if every element in Mask, beginning
3832/// from position Pos and ending in Pos+Size is undef or is zero.
3833static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3834 unsigned Size) {
3835 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3836}
3837
3838/// Return true if every element of a single input is referenced by the shuffle
3839 /// mask, i.e. it just permutes them all.
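/// e.g. {3, 2, 1, 0} references every input element, while {0, 0, 1, 1} leaves
/// elements 2 and 3 unreferenced.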
3841 unsigned NumElts = Mask.size();
3842 APInt DemandedElts = APInt::getZero(NumElts);
3843 for (int M : Mask)
3844 if (isInRange(M, 0, NumElts))
3845 DemandedElts.setBit(M);
3846 return DemandedElts.isAllOnes();
3847}
3848
3849/// Helper function to test whether a shuffle mask could be
3850/// simplified by widening the elements being shuffled.
3851///
3852/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3853/// leaves it in an unspecified state.
3854///
3855/// NOTE: This must handle normal vector shuffle masks and *target* vector
3856/// shuffle masks. The latter have the special property of a '-2' representing
3857/// a zero-ed lane of a vector.
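/// e.g. {0, 1, 6, 7} widens to {0, 3}, while {1, 0, 6, 7} cannot be widened
/// because its first pair is neither undef nor an aligned adjacent pair.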
3859 SmallVectorImpl<int> &WidenedMask) {
3860 WidenedMask.assign(Mask.size() / 2, 0);
3861 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3862 int M0 = Mask[i];
3863 int M1 = Mask[i + 1];
3864
3865 // If both elements are undef, it's trivial.
3866 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3867 WidenedMask[i / 2] = SM_SentinelUndef;
3868 continue;
3869 }
3870
3871 // Check for an undef mask and a mask value properly aligned to fit with
3872 // a pair of values. If we find such a case, use the non-undef mask's value.
3873 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3874 WidenedMask[i / 2] = M1 / 2;
3875 continue;
3876 }
3877 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3878 WidenedMask[i / 2] = M0 / 2;
3879 continue;
3880 }
3881
3882 // When zeroing, we need to spread the zeroing across both lanes to widen.
3883 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3884 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3886 WidenedMask[i / 2] = SM_SentinelZero;
3887 continue;
3888 }
3889 return false;
3890 }
3891
3892 // Finally check if the two mask values are adjacent and aligned with
3893 // a pair.
3894 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3895 WidenedMask[i / 2] = M0 / 2;
3896 continue;
3897 }
3898
3899 // Otherwise we can't safely widen the elements used in this shuffle.
3900 return false;
3901 }
3902 assert(WidenedMask.size() == Mask.size() / 2 &&
3903 "Incorrect size of mask after widening the elements!");
3904
3905 return true;
3906}
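// Worked example (illustrative) for canWidenShuffleElements above; the masks
// are arbitrary sample inputs, not values taken from this file:
//   { 0, 1, 6, 7 }                -> { 0, 3 }              (aligned adjacent pairs)
//   { -1, 3, 4, -1 }              -> { 1, 2 }              (undefs adopt their partner)
//   { SM_SentinelZero, -1, 2, 3 } -> { SM_SentinelZero, 1 } (zero spreads across the pair)
//   { 0, 2, 4, 6 }                -> fails                  (pairs are not adjacent)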
3907
3908static bool canWidenShuffleElements(ArrayRef<int> Mask,
3909 const APInt &Zeroable,
3910 bool V2IsZero,
3911 SmallVectorImpl<int> &WidenedMask) {
3912 // Create an alternative mask with info about zeroable elements.
3913 // Here we do not set undef elements as zeroable.
3914 SmallVector<int, 64> ZeroableMask(Mask);
3915 if (V2IsZero) {
3916 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3917 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3918 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3919 ZeroableMask[i] = SM_SentinelZero;
3920 }
3921 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3922}
3923
3924static bool canWidenShuffleElements(ArrayRef<int> Mask) {
3925 SmallVector<int, 32> WidenedMask;
3926 return canWidenShuffleElements(Mask, WidenedMask);
3927}
3928
3929// Attempt to narrow/widen shuffle mask until it matches the target number of
3930// elements.
3931static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3932 SmallVectorImpl<int> &ScaledMask) {
3933 unsigned NumSrcElts = Mask.size();
3934 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3935 "Illegal shuffle scale factor");
3936
3937 // Narrowing is guaranteed to work.
3938 if (NumDstElts >= NumSrcElts) {
3939 int Scale = NumDstElts / NumSrcElts;
3940 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3941 return true;
3942 }
3943
3944 // We have to repeat the widening until we reach the target size, but we can
3945 // split out the first widening as it sets up ScaledMask for us.
3946 if (canWidenShuffleElements(Mask, ScaledMask)) {
3947 while (ScaledMask.size() > NumDstElts) {
3948 SmallVector<int, 16> WidenedMask;
3949 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3950 return false;
3951 ScaledMask = std::move(WidenedMask);
3952 }
3953 return true;
3954 }
3955
3956 return false;
3957}
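// Worked example (illustrative) for scaleShuffleElements; sample masks only:
//   { 1, -1, 2, 3 } with NumDstElts = 8 narrows each element:
//     -> { 2, 3, -1, -1, 4, 5, 6, 7 }
//   { 0, 1, 6, 7 } with NumDstElts = 2 widens via canWidenShuffleElements:
//     -> { 0, 3 }
//   { 1, 0, 2, 3 } with NumDstElts = 2 fails (the first pair is not adjacent).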
3958
3959static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3960 SmallVector<int, 32> ScaledMask;
3961 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3962}
3963
3964// Helper to grow the shuffle mask for a larger value type.
3965// NOTE: This differs from scaleShuffleElements, which keeps the same overall type size.
3966static void growShuffleMask(ArrayRef<int> SrcMask,
3967 SmallVectorImpl<int> &DstMask,
3968 unsigned SrcSizeInBits, unsigned DstSizeInBits) {
3969 assert(DstMask.empty() && "Expected an empty shuffle mask");
3970 assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
3971 unsigned Scale = DstSizeInBits / SrcSizeInBits;
3972 unsigned NumSrcElts = SrcMask.size();
3973 DstMask.assign(SrcMask.begin(), SrcMask.end());
3974 for (int &M : DstMask) {
3975 if (M < 0)
3976 continue;
3977 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
3978 }
3979 DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
3980}
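// Worked example (illustrative) for growShuffleMask: growing a 2-input
// v4i32 mask from 128-bit to 256-bit operands. Elements referring to the
// second input (M >= 4) are rebased onto the wider operand and the new upper
// elements are left undef:
//   { 0, 5, 2, 7 } -> { 0, 9, 2, 11, -1, -1, -1, -1 }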
3981
3982/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3983bool X86::isZeroNode(SDValue Elt) {
3984 return isNullConstant(Elt) || isNullFPConstant(Elt);
3985}
3986
3987// Build a vector of constants.
3988// Use an UNDEF node if IsMask is set and the mask element (Values[i]) is -1.
3989// Split 64-bit constants into 32-bit halves in 32-bit mode.
3990static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
3991 const SDLoc &dl, bool IsMask = false) {
3992
3993  SmallVector<SDValue, 32> Ops;
3994 bool Split = false;
3995
3996 MVT ConstVecVT = VT;
3997 unsigned NumElts = VT.getVectorNumElements();
3998 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3999 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4000 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4001 Split = true;
4002 }
4003
4004 MVT EltVT = ConstVecVT.getVectorElementType();
4005 for (unsigned i = 0; i < NumElts; ++i) {
4006 bool IsUndef = Values[i] < 0 && IsMask;
4007 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4008 DAG.getConstant(Values[i], dl, EltVT);
4009 Ops.push_back(OpNode);
4010 if (Split)
4011 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4012 DAG.getConstant(0, dl, EltVT));
4013 }
4014 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4015 if (Split)
4016 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4017 return ConstsNode;
4018}
4019
4020static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
4021 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4022 assert(Bits.size() == Undefs.getBitWidth() &&
4023 "Unequal constant and undef arrays");
4025 bool Split = false;
4026
4027 MVT ConstVecVT = VT;
4028 unsigned NumElts = VT.getVectorNumElements();
4029 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4030 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4031 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4032 Split = true;
4033 }
4034
4035 MVT EltVT = ConstVecVT.getVectorElementType();
4036 MVT EltIntVT = EltVT.changeTypeToInteger();
4037 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4038 if (Undefs[i]) {
4039 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4040 continue;
4041 }
4042 const APInt &V = Bits[i];
4043 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4044 if (Split) {
4045 Ops.push_back(DAG.getConstant(V.extractBits(32, 0), dl, EltVT));
4046 Ops.push_back(DAG.getConstant(V.extractBits(32, 32), dl, EltVT));
4047 } else {
4048 Ops.push_back(DAG.getBitcast(EltVT, DAG.getConstant(V, dl, EltIntVT)));
4049 }
4050 }
4051
4052 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4053 return DAG.getBitcast(VT, ConstsNode);
4054}
4055
4056static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
4057 SelectionDAG &DAG, const SDLoc &dl) {
4058 APInt Undefs = APInt::getZero(Bits.size());
4059 return getConstVector(Bits, Undefs, VT, DAG, dl);
4060}
4061
4062/// Returns a vector of specified type with all zero elements.
4063static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4064 SelectionDAG &DAG, const SDLoc &dl) {
4065 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4066 VT.getVectorElementType() == MVT::i1) &&
4067 "Unexpected vector type");
4068
4069 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4070 // type. This ensures they get CSE'd. But if the integer type is not
4071 // available, use a floating-point +0.0 instead.
4072 SDValue Vec;
4073 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4074 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4075 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4076 } else if (VT.isFloatingPoint() &&
4077             TLI.isTypeLegal(VT.getVectorElementType())) {
4078 Vec = DAG.getConstantFP(+0.0, dl, VT);
4079 } else if (VT.getVectorElementType() == MVT::i1) {
4080 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4081 "Unexpected vector type");
4082 Vec = DAG.getConstant(0, dl, VT);
4083 } else {
4084 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4085 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4086 }
4087 return DAG.getBitcast(VT, Vec);
4088}
4089
4090// Helper to determine whether the ops are both subvectors extracted from a
4091// single source. If commuting is allowed they don't have to be in order (Lo/Hi).
4092static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4093 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4094 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4095 LHS.getValueType() != RHS.getValueType() ||
4096 LHS.getOperand(0) != RHS.getOperand(0))
4097 return SDValue();
4098
4099 SDValue Src = LHS.getOperand(0);
4100 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4101 return SDValue();
4102
4103 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4104 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4105 RHS.getConstantOperandAPInt(1) == NumElts) ||
4106 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4107 LHS.getConstantOperandAPInt(1) == NumElts))
4108 return Src;
4109
4110 return SDValue();
4111}
4112
4113static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4114 const SDLoc &dl, unsigned vectorWidth) {
4115 EVT VT = Vec.getValueType();
4116 EVT ElVT = VT.getVectorElementType();
4117 unsigned ResultNumElts =
4118 (VT.getVectorNumElements() * vectorWidth) / VT.getSizeInBits();
4119 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, ResultNumElts);
4120
4121 assert(ResultVT.getSizeInBits() == vectorWidth &&
4122 "Illegal subvector extraction");
4123
4124 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4125 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4126 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4127
4128 // This is the index of the first element of the vectorWidth-bit chunk
4129 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4130 IdxVal &= ~(ElemsPerChunk - 1);
4131
4132 // If the input is a buildvector just emit a smaller one.
4133 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4134 return DAG.getBuildVector(ResultVT, dl,
4135 Vec->ops().slice(IdxVal, ElemsPerChunk));
4136
4137 // Check if we're extracting the upper undef of a widening pattern.
4138 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4139 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4140 isNullConstant(Vec.getOperand(2)))
4141 return DAG.getUNDEF(ResultVT);
4142
4143 return DAG.getExtractSubvector(dl, ResultVT, Vec, IdxVal);
4144}
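// Illustrative example for extractSubVector: extracting 128 bits from a
// 256-bit v8i32 with IdxVal = 3. ElemsPerChunk = 128 / 32 = 4, so IdxVal is
// rounded down to the containing chunk (3 & ~3 = 0) and the v4i32 subvector
// starting at element 0 is returned.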
4145
4146/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4147/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4148/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4149/// instructions or a simple subregister reference. Idx is an index in the
4150/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4151/// lowering EXTRACT_VECTOR_ELT operations easier.
4152static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4153 SelectionDAG &DAG, const SDLoc &dl) {
4154  assert((Vec.getValueType().is256BitVector() ||
4155 Vec.getValueType().is512BitVector()) &&
4156 "Unexpected vector size!");
4157 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4158}
4159
4160/// Generate a DAG to grab 256-bits from a 512-bit vector.
4161static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4162 SelectionDAG &DAG, const SDLoc &dl) {
4163 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4164 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4165}
4166
4167static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4168 SelectionDAG &DAG, const SDLoc &dl,
4169 unsigned vectorWidth) {
4170 assert((vectorWidth == 128 || vectorWidth == 256) &&
4171 "Unsupported vector width");
4172 // Inserting UNDEF leaves Result unchanged.
4173 if (Vec.isUndef())
4174 return Result;
4175
4176 // Insert the relevant vectorWidth bits.
4177 EVT VT = Vec.getValueType();
4178 unsigned ElemsPerChunk = vectorWidth / VT.getScalarSizeInBits();
4179 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4180
4181 // This is the index of the first element of the vectorWidth-bit chunk
4182 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4183 IdxVal &= ~(ElemsPerChunk - 1);
4184 return DAG.getInsertSubvector(dl, Result, Vec, IdxVal);
4185}
4186
4187/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4188/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4189/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4190/// simple superregister reference. Idx is an index in the 128 bits
4191/// we want. It need not be aligned to a 128-bit boundary. That makes
4192/// lowering INSERT_VECTOR_ELT operations easier.
4193static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4194 SelectionDAG &DAG, const SDLoc &dl) {
4195 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4196 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4197}
4198
4199/// Widen a vector to a larger size with the same scalar type, with the new
4200/// elements either zero or undef.
4201static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4202 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4203 const SDLoc &dl) {
4204 EVT VecVT = Vec.getValueType();
4206 VecVT.getScalarType() == VT.getScalarType() &&
4207 "Unsupported vector widening type");
4208 // If the upper 128-bits of a build vector are already undef/zero, then try to
4209 // widen from the lower 128-bits.
4210 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4211 unsigned NumSrcElts = VecVT.getVectorNumElements();
4212 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4213 if (all_of(Hi, [&](SDValue V) {
4214 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4215 }))
4216 Vec = extract128BitVector(Vec, 0, DAG, dl);
4217 }
4218 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4219 : DAG.getUNDEF(VT);
4220 return DAG.getInsertSubvector(dl, Res, Vec, 0);
4221}
4222
4223/// Widen a vector to a larger size with the same scalar type, with the new
4224/// elements either zero or undef.
4225static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4226 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4227 const SDLoc &dl, unsigned WideSizeInBits) {
4228 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4229 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4230 "Unsupported vector widening type");
4231 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4232 MVT SVT = Vec.getSimpleValueType().getScalarType();
4233 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4234 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4235}
4236
4237/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4238/// and bitcast with integer types.
4239static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4240 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4241 unsigned NumElts = VT.getVectorNumElements();
4242 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4243 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4244 return VT;
4245}
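// Illustrative example for widenMaskVectorType: a v4i1 mask widens to v16i1
// without AVX512DQ (only v16i1 has the required KSHIFT/bitcast support there)
// and to v8i1 when AVX512DQ is available.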
4246
4247/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4248/// bitcast with integer types.
4249static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4250 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4251 const SDLoc &dl) {
4252 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4253 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4254}
4255
4256// Helper function to collect subvector ops that are concatenated together,
4257// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4258// The subvectors in Ops are guaranteed to be the same type.
4259static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
4260 SelectionDAG &DAG) {
4261 assert(Ops.empty() && "Expected an empty ops vector");
4262
4263 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4264 Ops.append(N->op_begin(), N->op_end());
4265 return true;
4266 }
4267
4268 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4269 SDValue Src = N->getOperand(0);
4270 SDValue Sub = N->getOperand(1);
4271 const APInt &Idx = N->getConstantOperandAPInt(2);
4272 EVT VT = Src.getValueType();
4273 EVT SubVT = Sub.getValueType();
4274
4275 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4276 // insert_subvector(undef, x, lo)
4277 if (Idx == 0 && Src.isUndef()) {
4278 Ops.push_back(Sub);
4279 Ops.push_back(DAG.getUNDEF(SubVT));
4280 return true;
4281 }
4282 if (Idx == (VT.getVectorNumElements() / 2)) {
4283 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4284 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4285 Src.getOperand(1).getValueType() == SubVT &&
4286 isNullConstant(Src.getOperand(2))) {
4287 // Attempt to recurse into inner (matching) concats.
4288 SDValue Lo = Src.getOperand(1);
4289 SDValue Hi = Sub;
4290 SmallVector<SDValue, 2> LoOps, HiOps;
4291 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4292 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4293 LoOps.size() == HiOps.size()) {
4294 Ops.append(LoOps);
4295 Ops.append(HiOps);
4296 return true;
4297 }
4298 Ops.push_back(Lo);
4299 Ops.push_back(Hi);
4300 return true;
4301 }
4302 // insert_subvector(x, extract_subvector(x, lo), hi)
4303 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4304 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4305 Ops.append(2, Sub);
4306 return true;
4307 }
4308 // insert_subvector(undef, x, hi)
4309 if (Src.isUndef()) {
4310 Ops.push_back(DAG.getUNDEF(SubVT));
4311 Ops.push_back(Sub);
4312 return true;
4313 }
4314 }
4315 }
4316 }
4317
4318 if (N->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4319 EVT VT = N->getValueType(0);
4320 SDValue Src = N->getOperand(0);
4321 uint64_t Idx = N->getConstantOperandVal(1);
4322
4323 // Collect all the subvectors from the source vector and slice off the
4324 // extraction.
4325    SmallVector<SDValue, 4> SrcOps;
4326 if (collectConcatOps(Src.getNode(), SrcOps, DAG) &&
4327 VT.getSizeInBits() > SrcOps[0].getValueSizeInBits() &&
4328 (VT.getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4329 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4330 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4331 unsigned NumSubs = VT.getSizeInBits() / SrcOps[0].getValueSizeInBits();
4332 Ops.append(SrcOps.begin() + SubIdx, SrcOps.begin() + SubIdx + NumSubs);
4333 return true;
4334 }
4335 }
4336
4337 assert(Ops.empty() && "Expected an empty ops vector");
4338 return false;
4339}
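// Illustrative example for collectConcatOps: a node built as
//   insert_subvector(insert_subvector(undef, X, 0), Y, N/2)
// fills Ops with { X, Y }, while a plain concat_vectors(A, B, C, D) fills
// Ops with { A, B, C, D }.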
4340
4341// Helper to check if \p V can be split into subvectors and the upper subvectors
4342// are all undef, in which case return the concatenation of the lower subvectors.
4343static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL,
4344 SelectionDAG &DAG) {
4345 SmallVector<SDValue> SubOps;
4346 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4347 return SDValue();
4348
4349 unsigned NumSubOps = SubOps.size();
4350 unsigned HalfNumSubOps = NumSubOps / 2;
4351 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4352
4353 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4354 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4355 return SDValue();
4356
4357 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4358 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4359 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4360}
4361
4362// Helper to check if we can access all the constituent subvectors without any
4363// extract ops.
4364static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG) {
4365  SmallVector<SDValue> Ops;
4366 return collectConcatOps(V.getNode(), Ops, DAG);
4367}
4368
4369static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4370 const SDLoc &dl) {
4371 EVT VT = Op.getValueType();
4372 unsigned NumElems = VT.getVectorNumElements();
4373 unsigned SizeInBits = VT.getSizeInBits();
4374 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4375 "Can't split odd sized vector");
4376
4377  SmallVector<SDValue, 4> SubOps;
4378 if (collectConcatOps(Op.getNode(), SubOps, DAG)) {
4379 assert((SubOps.size() % 2) == 0 && "Can't split odd sized vector concat");
4380 unsigned HalfOps = SubOps.size() / 2;
4381 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
4382 SmallVector<SDValue, 2> LoOps(SubOps.begin(), SubOps.begin() + HalfOps);
4383 SmallVector<SDValue, 2> HiOps(SubOps.begin() + HalfOps, SubOps.end());
4384 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
4385 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
4386 return std::make_pair(Lo, Hi);
4387 }
4388
4389 // If this is a splat value (with no undefs) then use the lower subvector,
4390 // which should be a free extraction.
4391 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4392 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4393 return std::make_pair(Lo, Lo);
4394
4395 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4396 return std::make_pair(Lo, Hi);
4397}
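// Illustrative example for splitVector: splitting a v8i32 value yields two
// v4i32 halves. If the input is itself a concatenation the halves are rebuilt
// from the collected subvectors, and a no-undef splat reuses the low half for
// both results so the extraction stays free.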
4398
4399/// Break an operation into 2 half sized ops and then concatenate the results.
4400static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4401 unsigned NumOps = Op.getNumOperands();
4402 EVT VT = Op.getValueType();
4403
4404 // Extract the LHS Lo/Hi vectors
4405  SmallVector<SDValue, 4> LoOps(NumOps);
4406  SmallVector<SDValue, 4> HiOps(NumOps);
4407 for (unsigned I = 0; I != NumOps; ++I) {
4408 SDValue SrcOp = Op.getOperand(I);
4409 if (!SrcOp.getValueType().isVector()) {
4410 LoOps[I] = HiOps[I] = SrcOp;
4411 continue;
4412 }
4413 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4414 }
4415
4416 EVT LoVT, HiVT;
4417 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4418 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4419 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4420 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4421}
4422
4423/// Break a unary integer operation into 2 half-sized ops and then
4424/// concatenate the result back.
4425static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4426 const SDLoc &dl) {
4427 // Make sure we only try to split 256/512-bit types to avoid creating
4428 // narrow vectors.
4429 [[maybe_unused]] EVT VT = Op.getValueType();
4430 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4431 Op.getOperand(0).getValueType().is512BitVector()) &&
4432 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4433 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4434 VT.getVectorNumElements() &&
4435 "Unexpected VTs!");
4436 return splitVectorOp(Op, DAG, dl);
4437}
4438
4439/// Break a binary integer operation into 2 half sized ops and then
4440/// concatenate the result back.
4441static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4442 const SDLoc &dl) {
4443 // Assert that all the types match.
4444 [[maybe_unused]] EVT VT = Op.getValueType();
4445 assert(Op.getOperand(0).getValueType() == VT &&
4446 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4447 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4448 return splitVectorOp(Op, DAG, dl);
4449}
4450
4451// Helper for splitting the operands of an operation to a legal target size and
4452// applying a function on each part.
4453// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4454// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4455// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4456// The argument Builder is a function that will be applied on each split part:
4457// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4458template <typename F>
4459static SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4460 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4461 F Builder, bool CheckBWI = true,
4462 bool AllowAVX512 = true) {
4463 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4464 unsigned NumSubs = 1;
4465 if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
4466 (!CheckBWI && Subtarget.useAVX512Regs()))) {
4467 if (VT.getSizeInBits() > 512) {
4468 NumSubs = VT.getSizeInBits() / 512;
4469 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4470 }
4471 } else if (Subtarget.hasAVX2()) {
4472 if (VT.getSizeInBits() > 256) {
4473 NumSubs = VT.getSizeInBits() / 256;
4474 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4475 }
4476 } else {
4477 if (VT.getSizeInBits() > 128) {
4478 NumSubs = VT.getSizeInBits() / 128;
4479 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4480 }
4481 }
4482
4483 if (NumSubs == 1)
4484 return Builder(DAG, DL, Ops);
4485
4486  SmallVector<SDValue, 4> Subs;
4487 for (unsigned i = 0; i != NumSubs; ++i) {
4488    SmallVector<SDValue, 2> SubOps;
4489 for (SDValue Op : Ops) {
4490 EVT OpVT = Op.getValueType();
4491 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4492 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4493 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4494 }
4495 Subs.push_back(Builder(DAG, DL, SubOps));
4496 }
4497 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4498}
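// Minimal usage sketch for SplitOpsAndApply (illustrative; VT, Op0, Op1, DL
// and Subtarget are placeholders, not values taken from this file): split a
// wide integer add into target-legal pieces and concatenate the results.
//
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue Res =
//       SplitOpsAndApply(DAG, Subtarget, DL, VT, {Op0, Op1}, AddBuilder);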
4499
4500// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4501// targets.
4502static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4503                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4504 const X86Subtarget &Subtarget) {
4505 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4506 MVT SVT = VT.getScalarType();
4507
4508 // If we have a 32/64 splatted constant, splat it to DstTy to
4509 // encourage a foldable broadcast'd operand.
4510 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4511 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4512 // AVX512 broadcasts 32/64-bit operands.
4513 // TODO: Support float once getAVX512Node is used by fp-ops.
4514 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4516 return SDValue();
4517 // If we're not widening, don't bother if we're not bitcasting.
4518 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4519 return SDValue();
4520    if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4521 APInt SplatValue, SplatUndef;
4522 unsigned SplatBitSize;
4523 bool HasAnyUndefs;
4524 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4525 HasAnyUndefs, OpEltSizeInBits) &&
4526 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4527 return DAG.getConstant(SplatValue, DL, DstVT);
4528 }
4529 return SDValue();
4530 };
4531
4532 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4533
4534 MVT DstVT = VT;
4535 if (Widen)
4536 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4537
4538 // Canonicalize src operands.
4539 SmallVector<SDValue> SrcOps(Ops);
4540 for (SDValue &Op : SrcOps) {
4541 MVT OpVT = Op.getSimpleValueType();
4542 // Just pass through scalar operands.
4543 if (!OpVT.isVector())
4544 continue;
4545 assert(OpVT == VT && "Vector type mismatch");
4546
4547 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4548 Op = BroadcastOp;
4549 continue;
4550 }
4551
4552 // Just widen the subvector by inserting into an undef wide vector.
4553 if (Widen)
4554 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4555 }
4556
4557 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4558
4559 // Perform the 512-bit op then extract the bottom subvector.
4560 if (Widen)
4561 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4562 return Res;
4563}
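// Illustrative example for getAVX512Node: on an AVX512F-only target (no VLX),
// a 256-bit node is widened by inserting its vector operands into undef
// 512-bit vectors, the operation is performed at 512 bits, and the low
// 256 bits are extracted as the result. Splatted 32/64-bit integer constant
// operands are rebuilt as splat constants to encourage a broadcast fold.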
4564
4565/// Insert i1-subvector to i1-vector.
4566static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4567 const X86Subtarget &Subtarget) {
4568
4569 SDLoc dl(Op);
4570 SDValue Vec = Op.getOperand(0);
4571 SDValue SubVec = Op.getOperand(1);
4572 SDValue Idx = Op.getOperand(2);
4573 unsigned IdxVal = Op.getConstantOperandVal(2);
4574
4575 // Inserting undef is a nop. We can just return the original vector.
4576 if (SubVec.isUndef())
4577 return Vec;
4578
4579 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4580 return Op;
4581
4582 MVT OpVT = Op.getSimpleValueType();
4583 unsigned NumElems = OpVT.getVectorNumElements();
4584 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4585
4586 // Extend to natively supported kshift.
4587 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4588
4589 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4590 // if necessary.
4591 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4592 // May need to promote to a legal type.
4593 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4594 DAG.getConstant(0, dl, WideOpVT),
4595 SubVec, Idx);
4596 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4597 }
4598
4599 MVT SubVecVT = SubVec.getSimpleValueType();
4600 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4601 assert(IdxVal + SubVecNumElems <= NumElems &&
4602 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4603 "Unexpected index value in INSERT_SUBVECTOR");
4604
4605 SDValue Undef = DAG.getUNDEF(WideOpVT);
4606
4607 if (IdxVal == 0) {
4608 // Zero lower bits of the Vec
4609 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4610 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4611 ZeroIdx);
4612 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4613 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4614 // Merge them together, SubVec should be zero extended.
4615 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4616 DAG.getConstant(0, dl, WideOpVT),
4617 SubVec, ZeroIdx);
4618 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4619 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4620 }
4621
4622 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4623 Undef, SubVec, ZeroIdx);
4624
4625 if (Vec.isUndef()) {
4626 assert(IdxVal != 0 && "Unexpected index");
4627 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4628 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4629 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4630 }
4631
4632  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4633 assert(IdxVal != 0 && "Unexpected index");
4634 // If upper elements of Vec are known undef, then just shift into place.
4635 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4636 [](SDValue V) { return V.isUndef(); })) {
4637 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4638 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4639 } else {
4640 NumElems = WideOpVT.getVectorNumElements();
4641 unsigned ShiftLeft = NumElems - SubVecNumElems;
4642 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4643 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4644 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4645 if (ShiftRight != 0)
4646 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4647 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4648 }
4649 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4650 }
4651
4652 // Simple case when we put subvector in the upper part
4653 if (IdxVal + SubVecNumElems == NumElems) {
4654 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4655 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4656 if (SubVecNumElems * 2 == NumElems) {
4657 // Special case, use legal zero extending insert_subvector. This allows
4658 // isel to optimize when bits are known zero.
4659 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4660 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4661 DAG.getConstant(0, dl, WideOpVT),
4662 Vec, ZeroIdx);
4663 } else {
4664 // Otherwise use explicit shifts to zero the bits.
4665 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4666 Undef, Vec, ZeroIdx);
4667 NumElems = WideOpVT.getVectorNumElements();
4668 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4669 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4670 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4671 }
4672 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4673 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4674 }
4675
4676 // Inserting into the middle is more complicated.
4677
4678 NumElems = WideOpVT.getVectorNumElements();
4679
4680 // Widen the vector if needed.
4681 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4682
4683 unsigned ShiftLeft = NumElems - SubVecNumElems;
4684 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4685
4686 // Do an optimization for the most frequently used types.
4687 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4688 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4689 Mask0.flipAllBits();
4690 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4691 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4692 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4693 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4694 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4695 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4696 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4697 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4698
4699 // Reduce to original width if needed.
4700 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4701 }
4702
4703 // Clear the upper bits of the subvector and move it to its insert position.
4704 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4705 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4706 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4707 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4708
4709 // Isolate the bits below the insertion point.
4710 unsigned LowShift = NumElems - IdxVal;
4711 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4712 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4713 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4714 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4715
4716 // Isolate the bits after the last inserted bit.
4717 unsigned HighShift = IdxVal + SubVecNumElems;
4718 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4719 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4720 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4721 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4722
4723 // Now OR all 3 pieces together.
4724 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4725 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4726
4727 // Reduce to original width if needed.
4728 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4729}
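// Illustrative example of the general (middle-insert) path of
// insert1BitVector, assuming AVX512DQ so a v8i1 vector stays at v8i1:
// inserting a v2i1 subvector at index 2 gives NumElems = 8,
// SubVecNumElems = 2, ShiftLeft = 6 and ShiftRight = 4, so the subvector is
// shifted into bit positions [2,4) and the original vector is ANDed with
// ~0b00001100 before the pieces are OR'd back together.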
4730
4731static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4732 const SDLoc &dl) {
4733 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4734 EVT SubVT = V1.getValueType();
4735 EVT SubSVT = SubVT.getScalarType();
4736 unsigned SubNumElts = SubVT.getVectorNumElements();
4737 unsigned SubVectorWidth = SubVT.getSizeInBits();
4738 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4739 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4740 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4741}
4742
4743/// Returns a vector of specified type with all bits set.
4744/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4745/// Then bitcast to their original type, ensuring they get CSE'd.
4746static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4747 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4748 "Expected a 128/256/512-bit vector type");
4749 unsigned NumElts = VT.getSizeInBits() / 32;
4750 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4751 return DAG.getBitcast(VT, Vec);
4752}
4753
4754static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4755 SDValue In, SelectionDAG &DAG) {
4756 EVT InVT = In.getValueType();
4757 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4758
4759 // Canonicalize Opcode to general extension version.
4760 switch (Opcode) {
4761 case ISD::ANY_EXTEND:
4763 Opcode = ISD::ANY_EXTEND;
4764 break;
4765 case ISD::SIGN_EXTEND:
4767 Opcode = ISD::SIGN_EXTEND;
4768 break;
4769 case ISD::ZERO_EXTEND:
4771 Opcode = ISD::ZERO_EXTEND;
4772 break;
4773 default:
4774 llvm_unreachable("Unknown extension opcode");
4775 }
4776
4777 // For 256-bit vectors, we only need the lower (128-bit) input half.
4778 // For 512-bit vectors, we only need the lower input half or quarter.
4779 if (InVT.getSizeInBits() > 128) {
4780 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4781 "Expected VTs to be the same size!");
4782 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4783 In = extractSubVector(In, 0, DAG, DL,
4784 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4785 InVT = In.getValueType();
4786 }
4787
4788 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4789 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4790
4791 return DAG.getNode(Opcode, DL, VT, In);
4792}
4793
4794// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4795static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4796 SDValue Mask, SelectionDAG &DAG) {
4797 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4798 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4799 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4800}
4801
4802static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4803 bool Lo, bool Unary) {
4804 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4805 "Illegal vector type to unpack");
4806 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4807 int NumElts = VT.getVectorNumElements();
4808 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4809 for (int i = 0; i < NumElts; ++i) {
4810 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4811 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4812 Pos += (Unary ? 0 : NumElts * (i % 2));
4813 Pos += (Lo ? 0 : NumEltsInLane / 2);
4814 Mask.push_back(Pos);
4815 }
4816}
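// Worked example (illustrative) for createUnpackShuffleMask with MVT::v8i16
// (a single 128-bit lane):
//   Lo, binary: { 0, 8, 1, 9, 2, 10, 3, 11 }   (matches PUNPCKLWD)
//   Hi, binary: { 4, 12, 5, 13, 6, 14, 7, 15 } (matches PUNPCKHWD)
//   Lo, unary:  { 0, 0, 1, 1, 2, 2, 3, 3 }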
4817
4818/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4819/// imposed by AVX and specific to the unary pattern. Example:
4820/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4821/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4822static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4823 bool Lo) {
4824 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4825 int NumElts = VT.getVectorNumElements();
4826 for (int i = 0; i < NumElts; ++i) {
4827 int Pos = i / 2;
4828 Pos += (Lo ? 0 : NumElts / 2);
4829 Mask.push_back(Pos);
4830 }
4831}
4832
4833// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4834static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4835 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4836  if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4837      (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4838 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4839 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4840 int M = Mask[I];
4841 if (M < 0)
4842 continue;
4843 SDValue V = (M < NumElts) ? V1 : V2;
4844 if (V.isUndef())
4845 continue;
4846 Ops[I] = V.getOperand(M % NumElts);
4847 }
4848 return DAG.getBuildVector(VT, dl, Ops);
4849 }
4850
4851 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4852}
4853
4854/// Returns a vector_shuffle node for an unpackl operation.
4855static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4856 SDValue V1, SDValue V2) {
4857  SmallVector<int, 8> Mask;
4858 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4859 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4860}
4861
4862/// Returns a vector_shuffle node for an unpackh operation.
4863static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4864 SDValue V1, SDValue V2) {
4865  SmallVector<int, 8> Mask;
4866 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4867 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4868}
4869
4870/// Returns a node that packs the LHS + RHS nodes together at half width.
4871/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4872/// TODO: Add subvector splitting if/when we have a need for it.
4873static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4874 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4875 bool PackHiHalf = false) {
4876 MVT OpVT = LHS.getSimpleValueType();
4877 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4878 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4879 assert(OpVT == RHS.getSimpleValueType() &&
4880 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4881 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4882 "Unexpected PACK operand types");
4883 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4884 "Unexpected PACK result type");
4885
4886 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4887 if (EltSizeInBits == 32) {
4888 SmallVector<int> PackMask;
4889 int Offset = PackHiHalf ? 1 : 0;
4890 int NumElts = VT.getVectorNumElements();
4891 for (int I = 0; I != NumElts; I += 4) {
4892 PackMask.push_back(I + Offset);
4893 PackMask.push_back(I + Offset + 2);
4894 PackMask.push_back(I + Offset + NumElts);
4895 PackMask.push_back(I + Offset + NumElts + 2);
4896 }
4897 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4898 DAG.getBitcast(VT, RHS), PackMask);
4899 }
4900
4901 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4902 if (!PackHiHalf) {
4903 if (UsePackUS &&
4904 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4905 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4906 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4907
4908 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4909 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4910 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4911 }
4912
4913 // Fallback to sign/zero extending the requested half and pack.
4914 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4915 if (UsePackUS) {
4916 if (PackHiHalf) {
4917 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4918 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4919 } else {
4920 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4921 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4922 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4923 }
4924 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4925 }
4926
4927 if (!PackHiHalf) {
4928 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4929 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4930 }
4931 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4932 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4933 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4934}
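// Illustrative example for getPack: packing two v4i32 operands into a v8i16
// result. If the known bits show every 32-bit element already fits in
// 16 bits, a single PACKUS/PACKSS is emitted directly; otherwise the inputs
// are masked (for PACKUS) or shifted (for PACKSS) so the requested half is
// isolated before packing.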
4935
4936/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4937/// This produces a shuffle where the low element of V2 is swizzled into the
4938/// zero/undef vector, landing at element Idx.
4939/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4940static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4941 bool IsZero,
4942 const X86Subtarget &Subtarget,
4943 SelectionDAG &DAG) {
4944 MVT VT = V2.getSimpleValueType();
4945 SDValue V1 = IsZero
4946 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4947 int NumElems = VT.getVectorNumElements();
4948 SmallVector<int, 16> MaskVec(NumElems);
4949 for (int i = 0; i != NumElems; ++i)
4950 // If this is the insertion idx, put the low elt of V2 here.
4951 MaskVec[i] = (i == Idx) ? NumElems : i;
4952 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4953}
4954
4955static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4956 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4957 Ptr.getOpcode() == X86ISD::WrapperRIP)
4958 Ptr = Ptr.getOperand(0);
4959  return dyn_cast<ConstantPoolSDNode>(Ptr);
4960}
4961
4962// TODO: Add support for non-zero offsets.
4963static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4964  auto *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4965 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4966 return nullptr;
4967 return CNode->getConstVal();
4968}
4969
4970static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4971 if (!Load || !ISD::isNormalLoad(Load))
4972 return nullptr;
4973 return getTargetConstantFromBasePtr(Load->getBasePtr());
4974}
4975
4976static const Constant *getTargetConstantFromNode(SDValue Op) {
4977  Op = peekThroughBitcasts(Op);
4978  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4979}
4980
4981const Constant *
4982X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4983 assert(LD && "Unexpected null LoadSDNode");
4984 return getTargetConstantFromNode(LD);
4985}
4986
4988 // Do not fold (vselect not(C), X, 0s) to (vselect C, Os, X)
4989 SDValue Cond = N->getOperand(0);
4990 SDValue RHS = N->getOperand(2);
4991 EVT CondVT = Cond.getValueType();
4992 return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
4993 CondVT.getVectorElementType() == MVT::i1 &&
4994 ISD::isBuildVectorAllZeros(RHS.getNode());
4995}
4996
4997// Extract raw constant bits from constant pools.
4998static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4999 APInt &UndefElts,
5000 SmallVectorImpl<APInt> &EltBits,
5001 bool AllowWholeUndefs = true,
5002 bool AllowPartialUndefs = false) {
5003 assert(EltBits.empty() && "Expected an empty EltBits vector");
5004
5005  Op = peekThroughBitcasts(Op);
5006
5007 EVT VT = Op.getValueType();
5008 unsigned SizeInBits = VT.getSizeInBits();
5009 unsigned NumElts = SizeInBits / EltSizeInBits;
5010
5011 // Can't split constant.
5012 if ((SizeInBits % EltSizeInBits) != 0)
5013 return false;
5014
5015 // Bitcast a source array of element bits to the target size.
5016 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5017 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5018 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5019 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5020 "Constant bit sizes don't match");
5021
5022 // Don't split if we don't allow undef bits.
5023 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5024 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5025 return false;
5026
5027 // If we're already the right size, don't bother bitcasting.
5028 if (NumSrcElts == NumElts) {
5029 UndefElts = UndefSrcElts;
5030 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5031 return true;
5032 }
5033
5034 // Extract all the undef/constant element data and pack into single bitsets.
5035 APInt UndefBits(SizeInBits, 0);
5036 APInt MaskBits(SizeInBits, 0);
5037
5038 for (unsigned i = 0; i != NumSrcElts; ++i) {
5039 unsigned BitOffset = i * SrcEltSizeInBits;
5040 if (UndefSrcElts[i])
5041 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5042 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5043 }
5044
5045 // Split the undef/constant single bitset data into the target elements.
5046 UndefElts = APInt(NumElts, 0);
5047 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5048
5049 for (unsigned i = 0; i != NumElts; ++i) {
5050 unsigned BitOffset = i * EltSizeInBits;
5051 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5052
5053 // Only treat an element as UNDEF if all bits are UNDEF.
5054 if (UndefEltBits.isAllOnes()) {
5055 if (!AllowWholeUndefs)
5056 return false;
5057 UndefElts.setBit(i);
5058 continue;
5059 }
5060
5061 // If only some bits are UNDEF then treat them as zero (or bail if not
5062 // supported).
5063 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5064 return false;
5065
5066 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
5067 }
5068 return true;
5069 };
5070
5071 // Collect constant bits and insert into mask/undef bit masks.
5072 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5073 unsigned UndefBitIndex) {
5074 if (!Cst)
5075 return false;
5076 if (isa<UndefValue>(Cst)) {
5077 Undefs.setBit(UndefBitIndex);
5078 return true;
5079 }
5080 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5081 Mask = CInt->getValue();
5082 return true;
5083 }
5084 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5085 Mask = CFP->getValueAPF().bitcastToAPInt();
5086 return true;
5087 }
5088 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
5089 Type *Ty = CDS->getType();
5090 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
5091 Type *EltTy = CDS->getElementType();
5092 bool IsInteger = EltTy->isIntegerTy();
5093 bool IsFP =
5094 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
5095 if (!IsInteger && !IsFP)
5096 return false;
5097 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
5098 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
5099 if (IsInteger)
5100 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
5101 else
5102 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
5103 I * EltBits);
5104 return true;
5105 }
5106 return false;
5107 };
5108
5109 // Handle UNDEFs.
5110 if (Op.isUndef()) {
5111 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5112 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5113 return CastBitData(UndefSrcElts, SrcEltBits);
5114 }
5115
5116 // Extract scalar constant bits.
5117 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5118 APInt UndefSrcElts = APInt::getZero(1);
5119 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5120 return CastBitData(UndefSrcElts, SrcEltBits);
5121 }
5122 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5123 APInt UndefSrcElts = APInt::getZero(1);
5124 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5125 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5126 return CastBitData(UndefSrcElts, SrcEltBits);
5127 }
5128
5129 // Extract constant bits from build vector.
5130 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5131 BitVector Undefs;
5132 SmallVector<APInt> SrcEltBits;
5133 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5134 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5135 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5136 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5137 if (Undefs[I])
5138 UndefSrcElts.setBit(I);
5139 return CastBitData(UndefSrcElts, SrcEltBits);
5140 }
5141 }
5142
5143 // Extract constant bits from constant pool vector.
5144 if (auto *Cst = getTargetConstantFromNode(Op)) {
5145 Type *CstTy = Cst->getType();
5146 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5147 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5148 return false;
5149
5150 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5151 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5152 if ((SizeInBits % SrcEltSizeInBits) != 0)
5153 return false;
5154
5155 APInt UndefSrcElts(NumSrcElts, 0);
5156 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5157 for (unsigned i = 0; i != NumSrcElts; ++i)
5158 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5159 UndefSrcElts, i))
5160 return false;
5161
5162 return CastBitData(UndefSrcElts, SrcEltBits);
5163 }
5164
5165 // Extract constant bits from a broadcasted constant pool scalar.
5166 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5167 EltSizeInBits <= VT.getScalarSizeInBits()) {
5168 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5169 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5170 return false;
5171
5172 SDValue Ptr = MemIntr->getBasePtr();
5173    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5174 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5175 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5176
5177 APInt UndefSrcElts(NumSrcElts, 0);
5178 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5179 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5180 if (UndefSrcElts[0])
5181 UndefSrcElts.setBits(0, NumSrcElts);
5182 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5183 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5184 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5185 return CastBitData(UndefSrcElts, SrcEltBits);
5186 }
5187 }
5188 }
5189
5190 // Extract constant bits from a subvector broadcast.
5191 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5192 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5193 SDValue Ptr = MemIntr->getBasePtr();
5194 // The source constant may be larger than the subvector broadcast, so
5195 // ensure we extract the correct subvector constants.
5196 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5197 Type *CstTy = Cst->getType();
5198 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5199 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5200 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5201 (SizeInBits % SubVecSizeInBits) != 0)
5202 return false;
5203 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5204 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5205 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5206 APInt UndefSubElts(NumSubElts, 0);
5207 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5208 APInt(CstEltSizeInBits, 0));
5209 for (unsigned i = 0; i != NumSubElts; ++i) {
5210 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5211 UndefSubElts, i))
5212 return false;
5213 for (unsigned j = 1; j != NumSubVecs; ++j)
5214 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5215 }
5216 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5217 UndefSubElts);
5218 return CastBitData(UndefSubElts, SubEltBits);
5219 }
5220 }
5221
5222 // Extract a rematerialized scalar constant insertion.
5223 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5224 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5225 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5226 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5227 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5228
5229 APInt UndefSrcElts(NumSrcElts, 0);
5230 SmallVector<APInt, 64> SrcEltBits;
5231 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5232 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5233 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5234 return CastBitData(UndefSrcElts, SrcEltBits);
5235 }
5236
5237 // Insert constant bits from a base and sub vector sources.
5238 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5239 // If this bitcasts to larger elements we might lose track of undefs, so
5240 // don't allow any to be safe.
5241 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5242 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5243
5244 APInt UndefSrcElts, UndefSubElts;
5245 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5246 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5247 UndefSubElts, EltSubBits,
5248 AllowWholeUndefs && AllowUndefs,
5249 AllowPartialUndefs && AllowUndefs) &&
5250 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5251 UndefSrcElts, EltSrcBits,
5252 AllowWholeUndefs && AllowUndefs,
5253 AllowPartialUndefs && AllowUndefs)) {
5254 unsigned BaseIdx = Op.getConstantOperandVal(2);
5255 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5256 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5257 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5258 return CastBitData(UndefSrcElts, EltSrcBits);
5259 }
5260 }
5261
5262 // Extract constant bits from a subvector's source.
5263 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5264 getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
5265 EltBits, AllowWholeUndefs,
5266 AllowPartialUndefs)) {
5267 EVT SrcVT = Op.getOperand(0).getValueType();
5268 unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
5269 unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
5270 unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
5271 unsigned BaseIdx = BaseOfs / EltSizeInBits;
5272 assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
5273 (VT.getSizeInBits() % EltSizeInBits) == 0 &&
5274 (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
5275
5276 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5277 if ((BaseIdx + NumSubElts) != NumSrcElts)
5278 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5279 if (BaseIdx != 0)
5280 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5281 return true;
5282 }
5283
5284 // Extract constant bits from shuffle node sources.
5285 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5286 // TODO - support shuffle through bitcasts.
5287 if (EltSizeInBits != VT.getScalarSizeInBits())
5288 return false;
5289
5290 ArrayRef<int> Mask = SVN->getMask();
5291 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5292 llvm::any_of(Mask, [](int M) { return M < 0; }))
5293 return false;
5294
5295 APInt UndefElts0, UndefElts1;
5296 SmallVector<APInt, 32> EltBits0, EltBits1;
5297 if (isAnyInRange(Mask, 0, NumElts) &&
5298 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5299 UndefElts0, EltBits0, AllowWholeUndefs,
5300 AllowPartialUndefs))
5301 return false;
5302 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5303 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5304 UndefElts1, EltBits1, AllowWholeUndefs,
5305 AllowPartialUndefs))
5306 return false;
5307
5308 UndefElts = APInt::getZero(NumElts);
5309 for (int i = 0; i != (int)NumElts; ++i) {
5310 int M = Mask[i];
5311 if (M < 0) {
5312 UndefElts.setBit(i);
5313 EltBits.push_back(APInt::getZero(EltSizeInBits));
5314 } else if (M < (int)NumElts) {
5315 if (UndefElts0[M])
5316 UndefElts.setBit(i);
5317 EltBits.push_back(EltBits0[M]);
5318 } else {
5319 if (UndefElts1[M - NumElts])
5320 UndefElts.setBit(i);
5321 EltBits.push_back(EltBits1[M - NumElts]);
5322 }
5323 }
5324 return true;
5325 }
5326
5327 return false;
5328}
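// Worked example (illustrative) for getTargetConstantBitsFromNode: querying a
// v2i64 build vector { 0x0000000100000002, 0 } with EltSizeInBits = 32
// repacks the raw bits lowest-bits-first into { 0x2, 0x1, 0x0, 0x0 }, with
// UndefElts tracking any result elements whose bits were entirely undef.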
5329
5330namespace llvm {
5331namespace X86 {
5332bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5333 APInt UndefElts;
5334 SmallVector<APInt, 16> EltBits;
5335  if (getTargetConstantBitsFromNode(
5336 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5337 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5338 int SplatIndex = -1;
5339 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5340 if (UndefElts[i])
5341 continue;
5342 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5343 SplatIndex = -1;
5344 break;
5345 }
5346 SplatIndex = i;
5347 }
5348 if (0 <= SplatIndex) {
5349 SplatVal = EltBits[SplatIndex];
5350 return true;
5351 }
5352 }
5353
5354 return false;
5355}
5356
5357int getRoundingModeX86(unsigned RM) {
5358 switch (static_cast<::llvm::RoundingMode>(RM)) {
5359 // clang-format off
5360 case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest;
5361 case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward;
5362 case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward;
5363 case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero;
5364 default:
5365 return X86::rmInvalid; // Invalid rounding mode
5366 }
5367}
5368
5369} // namespace X86
5370} // namespace llvm
5371
5372static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5373 unsigned MaskEltSizeInBits,
5374                                        SmallVectorImpl<uint64_t> &RawMask,
5375 APInt &UndefElts) {
5376 // Extract the raw target constant bits.
5377 SmallVector<APInt, 64> EltBits;
5378 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5379 EltBits, /* AllowWholeUndefs */ true,
5380 /* AllowPartialUndefs */ false))
5381 return false;
5382
5383 // Insert the extracted elements into the mask.
5384 for (const APInt &Elt : EltBits)
5385 RawMask.push_back(Elt.getZExtValue());
5386
5387 return true;
5388}
5389
5390 static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5391 bool AllowUndefs) {
5392 APInt UndefElts;
5393 SmallVector<APInt, 64> EltBits;
5394 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5395 /*AllowWholeUndefs*/ AllowUndefs,
5396 /*AllowPartialUndefs*/ false))
5397 return false;
5398
5399 bool IsPow2OrUndef = true;
5400 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5401 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5402 return IsPow2OrUndef;
5403}
5404
5405// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5406 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5407 // TODO: don't always ignore oneuse constraints.
5408 V = peekThroughBitcasts(V);
5409 EVT VT = V.getValueType();
5410
5411 // Match not(xor X, -1) -> X.
5412 if (V.getOpcode() == ISD::XOR &&
5413 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5414 isAllOnesConstant(V.getOperand(1))))
5415 return V.getOperand(0);
5416
5417 // Match not(extract_subvector(not(X))) -> extract_subvector(X).
5418 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5419 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5420 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5421 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5422 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5423 V.getOperand(1));
5424 }
5425 }
5426
5427 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5428 if (V.getOpcode() == X86ISD::PCMPGT &&
5429 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5430 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5431 V.getOperand(0).hasOneUse()) {
5432 APInt UndefElts;
5433 SmallVector<APInt> EltBits;
5434 if (getTargetConstantBitsFromNode(V.getOperand(0),
5435 V.getScalarValueSizeInBits(), UndefElts,
5436 EltBits) &&
5437 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5438 // Don't fold min_signed_value -> (min_signed_value - 1)
5439 bool MinSigned = false;
5440 for (APInt &Elt : EltBits) {
5441 MinSigned |= Elt.isMinSignedValue();
5442 Elt -= 1;
5443 }
5444 if (!MinSigned) {
5445 SDLoc DL(V);
5446 MVT VT = V.getSimpleValueType();
5447 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5448 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5449 }
5450 }
5451 }
5452
5453 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5454 SmallVector<SDValue, 4> CatOps;
5455 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5456 for (SDValue &CatOp : CatOps) {
5457 SDValue NotCat = IsNOT(CatOp, DAG);
5458 if (!NotCat)
5459 return SDValue();
5460 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5461 }
5462 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5463 }
5464
5465 // Match not(or(not(X),not(Y))) -> and(X, Y).
5466 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5467 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5468 // TODO: Handle cases with single NOT operand -> ANDNP
5469 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5470 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5471 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5472 DAG.getBitcast(VT, Op1));
5473 }
5474
5475 return SDValue();
5476}
5477
5478/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5479/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5480/// Note: This ignores saturation, so inputs must be checked first.
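/// For example, assuming VT = v16i8, Unary = false and NumStages = 1, the
/// mask produced is:
///   { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }
/// i.e. the low byte of every 16-bit element of both (bitcast) inputs, which
/// is the PACKSSWB/PACKUSWB result layout for a single 128-bit lane.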
5481 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5482 bool Unary, unsigned NumStages = 1) {
5483 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5484 unsigned NumElts = VT.getVectorNumElements();
5485 unsigned NumLanes = VT.getSizeInBits() / 128;
5486 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5487 unsigned Offset = Unary ? 0 : NumElts;
5488 unsigned Repetitions = 1u << (NumStages - 1);
5489 unsigned Increment = 1u << NumStages;
5490 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5491
5492 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5493 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5494 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5495 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5496 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5497 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5498 }
5499 }
5500}
5501
5502// Split the demanded elts of a PACKSS/PACKUS node between its operands.
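// For example, assuming a v16i8 PACKUSWB result where only elements 0 and 9
// are demanded: element 0 comes from LHS element 0 and element 9 from RHS
// element 1, so DemandedLHS = 0b00000001 and DemandedRHS = 0b00000010.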
5503static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5504 APInt &DemandedLHS, APInt &DemandedRHS) {
5505 int NumLanes = VT.getSizeInBits() / 128;
5506 int NumElts = DemandedElts.getBitWidth();
5507 int NumInnerElts = NumElts / 2;
5508 int NumEltsPerLane = NumElts / NumLanes;
5509 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5510
5511 DemandedLHS = APInt::getZero(NumInnerElts);
5512 DemandedRHS = APInt::getZero(NumInnerElts);
5513
5514 // Map DemandedElts to the packed operands.
5515 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5516 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5517 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5518 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5519 if (DemandedElts[OuterIdx])
5520 DemandedLHS.setBit(InnerIdx);
5521 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5522 DemandedRHS.setBit(InnerIdx);
5523 }
5524 }
5525}
5526
5527// Split the demanded elts of a HADD/HSUB node between its operands.
5528static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5529 APInt &DemandedLHS, APInt &DemandedRHS) {
5530 getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5531 DemandedLHS, DemandedRHS);
5532 DemandedLHS |= DemandedLHS << 1;
5533 DemandedRHS |= DemandedRHS << 1;
5534}
5535
5536/// Calculates the shuffle mask corresponding to the target-specific opcode.
5537/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5538/// operands in \p Ops, and returns true.
5539/// Sets \p IsUnary to true if only one source is used. Note that this will set
5540/// IsUnary for shuffles which use a single input multiple times, and in those
5541/// cases it will adjust the mask to only have indices within that single input.
5542/// It is an error to call this with non-empty Mask/Ops vectors.
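/// For example, assuming N is an X86ISD::UNPCKL of two distinct v4i32
/// operands, Ops receives both operands, IsUnary is false and Mask decodes
/// to { 0, 4, 1, 5 }.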
5543static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5544 SmallVectorImpl<SDValue> &Ops,
5545 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5546 if (!isTargetShuffle(N.getOpcode()))
5547 return false;
5548
5549 MVT VT = N.getSimpleValueType();
5550 unsigned NumElems = VT.getVectorNumElements();
5551 unsigned MaskEltSize = VT.getScalarSizeInBits();
5552 SmallVector<uint64_t, 32> RawMask;
5553 APInt RawUndefs;
5554 uint64_t ImmN;
5555
5556 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5557 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5558
5559 IsUnary = false;
5560 bool IsFakeUnary = false;
5561 switch (N.getOpcode()) {
5562 case X86ISD::BLENDI:
5563 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5564 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5565 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5566 DecodeBLENDMask(NumElems, ImmN, Mask);
5567 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5568 break;
5569 case X86ISD::SHUFP:
5570 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5571 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5572 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5573 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5574 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5575 break;
5576 case X86ISD::INSERTPS:
5577 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5578 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5579 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5580 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5581 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5582 break;
5583 case X86ISD::EXTRQI:
5584 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5585 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5586 isa<ConstantSDNode>(N.getOperand(2))) {
5587 int BitLen = N.getConstantOperandVal(1);
5588 int BitIdx = N.getConstantOperandVal(2);
5589 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5590 IsUnary = true;
5591 }
5592 break;
5593 case X86ISD::INSERTQI:
5594 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5595 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5596 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5597 isa<ConstantSDNode>(N.getOperand(3))) {
5598 int BitLen = N.getConstantOperandVal(2);
5599 int BitIdx = N.getConstantOperandVal(3);
5600 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5601 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5602 }
5603 break;
5604 case X86ISD::UNPCKH:
5605 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5606 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5607 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5608 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5609 break;
5610 case X86ISD::UNPCKL:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5614 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5615 break;
5616 case X86ISD::MOVHLPS:
5617 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5618 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5619 DecodeMOVHLPSMask(NumElems, Mask);
5620 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5621 break;
5622 case X86ISD::MOVLHPS:
5623 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5624 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5625 DecodeMOVLHPSMask(NumElems, Mask);
5626 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5627 break;
5628 case X86ISD::VALIGN:
5629 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5630 "Only 32-bit and 64-bit elements are supported!");
5631 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5632 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5633 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5634 DecodeVALIGNMask(NumElems, ImmN, Mask);
5635 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5636 Ops.push_back(N.getOperand(1));
5637 Ops.push_back(N.getOperand(0));
5638 break;
5639 case X86ISD::PALIGNR:
5640 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5641 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5642 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5643 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5644 DecodePALIGNRMask(NumElems, ImmN, Mask);
5645 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5646 Ops.push_back(N.getOperand(1));
5647 Ops.push_back(N.getOperand(0));
5648 break;
5649 case X86ISD::VSHLDQ:
5650 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5651 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5652 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5653 DecodePSLLDQMask(NumElems, ImmN, Mask);
5654 IsUnary = true;
5655 break;
5656 case X86ISD::VSRLDQ:
5657 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5658 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5659 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5660 DecodePSRLDQMask(NumElems, ImmN, Mask);
5661 IsUnary = true;
5662 break;
5663 case X86ISD::PSHUFD:
5664 case X86ISD::VPERMILPI:
5665 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5666 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5667 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5668 IsUnary = true;
5669 break;
5670 case X86ISD::PSHUFHW:
5671 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5672 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5673 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5674 IsUnary = true;
5675 break;
5676 case X86ISD::PSHUFLW:
5677 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5678 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5679 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5680 IsUnary = true;
5681 break;
5682 case X86ISD::VZEXT_MOVL:
5683 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5684 DecodeZeroMoveLowMask(NumElems, Mask);
5685 IsUnary = true;
5686 break;
5687 case X86ISD::VBROADCAST:
5688 // We only decode broadcasts of same-sized vectors, peeking through to
5689 // extracted subvectors is likely to cause hasOneUse issues with
5690 // SimplifyDemandedBits etc.
5691 if (N.getOperand(0).getValueType() == VT) {
5692 DecodeVectorBroadcast(NumElems, Mask);
5693 IsUnary = true;
5694 break;
5695 }
5696 return false;
5697 case X86ISD::VPERMILPV: {
5698 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5699 IsUnary = true;
5700 SDValue MaskNode = N.getOperand(1);
5701 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5702 RawUndefs)) {
5703 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5704 break;
5705 }
5706 return false;
5707 }
5708 case X86ISD::PSHUFB: {
5709 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5710 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5711 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5712 IsUnary = true;
5713 SDValue MaskNode = N.getOperand(1);
5714 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5715 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5716 break;
5717 }
5718 return false;
5719 }
5720 case X86ISD::VPERMI:
5721 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5722 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5723 DecodeVPERMMask(NumElems, ImmN, Mask);
5724 IsUnary = true;
5725 break;
5726 case X86ISD::MOVSS:
5727 case X86ISD::MOVSD:
5728 case X86ISD::MOVSH:
5729 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5730 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5731 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5732 break;
5733 case X86ISD::VPERM2X128:
5734 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5735 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5736 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5737 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5738 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5739 break;
5740 case X86ISD::SHUF128:
5741 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5742 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5743 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5744 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5745 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5746 break;
5747 case X86ISD::MOVSLDUP:
5748 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5749 DecodeMOVSLDUPMask(NumElems, Mask);
5750 IsUnary = true;
5751 break;
5752 case X86ISD::MOVSHDUP:
5753 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5754 DecodeMOVSHDUPMask(NumElems, Mask);
5755 IsUnary = true;
5756 break;
5757 case X86ISD::MOVDDUP:
5758 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5759 DecodeMOVDDUPMask(NumElems, Mask);
5760 IsUnary = true;
5761 break;
5762 case X86ISD::VPERMIL2: {
5763 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5764 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5765 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5766 SDValue MaskNode = N.getOperand(2);
5767 SDValue CtrlNode = N.getOperand(3);
5768 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5769 unsigned CtrlImm = CtrlOp->getZExtValue();
5770 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5771 RawUndefs)) {
5772 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5773 Mask);
5774 break;
5775 }
5776 }
5777 return false;
5778 }
5779 case X86ISD::VPPERM: {
5780 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5781 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5782 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5783 SDValue MaskNode = N.getOperand(2);
5784 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5785 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5786 break;
5787 }
5788 return false;
5789 }
5790 case X86ISD::VPERMV: {
5791 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5792 IsUnary = true;
5793 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5794 Ops.push_back(N.getOperand(1));
5795 SDValue MaskNode = N.getOperand(0);
5796 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5797 RawUndefs)) {
5798 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5799 break;
5800 }
5801 return false;
5802 }
5803 case X86ISD::VPERMV3: {
5804 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5805 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5806 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5807 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5808 Ops.push_back(N.getOperand(0));
5809 Ops.push_back(N.getOperand(2));
5810 SDValue MaskNode = N.getOperand(1);
5811 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5812 RawUndefs)) {
5813 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5814 break;
5815 }
5816 return false;
5817 }
5818 default:
5819 llvm_unreachable("unknown target shuffle node");
5820 }
5821
5822 // Empty mask indicates the decode failed.
5823 if (Mask.empty())
5824 return false;
5825
5826 // Check if we're getting a shuffle mask with zero'd elements.
5827 if (!AllowSentinelZero && isAnyZero(Mask))
5828 return false;
5829
5830 // If we have a fake unary shuffle, the shuffle mask is spread across two
5831 // inputs that are actually the same node. Re-map the mask to always point
5832 // into the first input.
5833 if (IsFakeUnary)
5834 for (int &M : Mask)
5835 if (M >= (int)Mask.size())
5836 M -= Mask.size();
5837
5838 // If we didn't already add operands in the opcode-specific code, default to
5839 // adding 1 or 2 operands starting at 0.
5840 if (Ops.empty()) {
5841 Ops.push_back(N.getOperand(0));
5842 if (!IsUnary || IsFakeUnary)
5843 Ops.push_back(N.getOperand(1));
5844 }
5845
5846 return true;
5847}
5848
5849 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5850static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5851 SmallVectorImpl<SDValue> &Ops,
5852 SmallVectorImpl<int> &Mask) {
5853 bool IsUnary;
5854 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5855}
5856
5857/// Compute whether each element of a shuffle is zeroable.
5858///
5859/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5860/// Either it is an undef element in the shuffle mask, the element of the input
5861/// referenced is undef, or the element of the input referenced is known to be
5862/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5863/// as many lanes with this technique as possible to simplify the remaining
5864/// shuffle.
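/// For example, assuming Mask = { 0, -1, 2, 7 } with V2 an all-zeros
/// build_vector: lane 1 is set in KnownUndef because of the undef mask entry,
/// and lane 3 is set in KnownZero because index 7 selects from the zero
/// vector V2.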
5865 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5866 SDValue V1, SDValue V2,
5867 APInt &KnownUndef, APInt &KnownZero) {
5868 int Size = Mask.size();
5869 KnownUndef = KnownZero = APInt::getZero(Size);
5870
5871 V1 = peekThroughBitcasts(V1);
5872 V2 = peekThroughBitcasts(V2);
5873
5874 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5875 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5876
5877 int VectorSizeInBits = V1.getValueSizeInBits();
5878 int ScalarSizeInBits = VectorSizeInBits / Size;
5879 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5880
5881 for (int i = 0; i < Size; ++i) {
5882 int M = Mask[i];
5883 // Handle the easy cases.
5884 if (M < 0) {
5885 KnownUndef.setBit(i);
5886 continue;
5887 }
5888 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5889 KnownZero.setBit(i);
5890 continue;
5891 }
5892
5893 // Determine shuffle input and normalize the mask.
5894 SDValue V = M < Size ? V1 : V2;
5895 M %= Size;
5896
5897 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5898 if (V.getOpcode() != ISD::BUILD_VECTOR)
5899 continue;
5900
5901 // If the BUILD_VECTOR has fewer elements than the bitcasted portion of
5902 // the (larger) source element must be UNDEF/ZERO.
5903 if ((Size % V.getNumOperands()) == 0) {
5904 int Scale = Size / V->getNumOperands();
5905 SDValue Op = V.getOperand(M / Scale);
5906 if (Op.isUndef())
5907 KnownUndef.setBit(i);
5908 if (X86::isZeroNode(Op))
5909 KnownZero.setBit(i);
5910 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5911 APInt Val = Cst->getAPIntValue();
5912 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5913 if (Val == 0)
5914 KnownZero.setBit(i);
5915 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5916 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5917 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5918 if (Val == 0)
5919 KnownZero.setBit(i);
5920 }
5921 continue;
5922 }
5923
5924 // If the BUILD_VECTOR has more elements than all the (smaller) source
5925 // elements must be UNDEF or ZERO.
5926 if ((V.getNumOperands() % Size) == 0) {
5927 int Scale = V->getNumOperands() / Size;
5928 bool AllUndef = true;
5929 bool AllZero = true;
5930 for (int j = 0; j < Scale; ++j) {
5931 SDValue Op = V.getOperand((M * Scale) + j);
5932 AllUndef &= Op.isUndef();
5933 AllZero &= X86::isZeroNode(Op);
5934 }
5935 if (AllUndef)
5936 KnownUndef.setBit(i);
5937 if (AllZero)
5938 KnownZero.setBit(i);
5939 continue;
5940 }
5941 }
5942}
5943
5944/// Decode a target shuffle mask and inputs and see if any values are
5945/// known to be undef or zero from their inputs.
5946/// Returns true if the target shuffle mask was decoded.
5947/// FIXME: Merge this with computeZeroableShuffleElements?
5948 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5949 SmallVectorImpl<SDValue> &Ops,
5950 APInt &KnownUndef, APInt &KnownZero) {
5951 bool IsUnary;
5952 if (!isTargetShuffle(N.getOpcode()))
5953 return false;
5954
5955 MVT VT = N.getSimpleValueType();
5956 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5957 return false;
5958
5959 int Size = Mask.size();
5960 SDValue V1 = Ops[0];
5961 SDValue V2 = IsUnary ? V1 : Ops[1];
5962 KnownUndef = KnownZero = APInt::getZero(Size);
5963
5964 V1 = peekThroughBitcasts(V1);
5965 V2 = peekThroughBitcasts(V2);
5966
5967 assert((VT.getSizeInBits() % Size) == 0 &&
5968 "Illegal split of shuffle value type");
5969 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5970
5971 // Extract known constant input data.
5972 APInt UndefSrcElts[2];
5973 SmallVector<APInt, 32> SrcEltBits[2];
5974 bool IsSrcConstant[2] = {
5975 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5976 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5977 /*AllowPartialUndefs*/ false),
5978 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5979 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5980 /*AllowPartialUndefs*/ false)};
5981
5982 for (int i = 0; i < Size; ++i) {
5983 int M = Mask[i];
5984
5985 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5986 if (M < 0) {
5987 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5988 if (SM_SentinelUndef == M)
5989 KnownUndef.setBit(i);
5990 if (SM_SentinelZero == M)
5991 KnownZero.setBit(i);
5992 continue;
5993 }
5994
5995 // Determine shuffle input and normalize the mask.
5996 unsigned SrcIdx = M / Size;
5997 SDValue V = M < Size ? V1 : V2;
5998 M %= Size;
5999
6000 // We are referencing an UNDEF input.
6001 if (V.isUndef()) {
6002 KnownUndef.setBit(i);
6003 continue;
6004 }
6005
6006 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6007 // TODO: We currently only set UNDEF for integer types - floats use the same
6008 // registers as vectors and many of the scalar folded loads rely on the
6009 // SCALAR_TO_VECTOR pattern.
6010 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6011 (Size % V.getValueType().getVectorNumElements()) == 0) {
6012 int Scale = Size / V.getValueType().getVectorNumElements();
6013 int Idx = M / Scale;
6014 if (Idx != 0 && !VT.isFloatingPoint())
6015 KnownUndef.setBit(i);
6016 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6017 KnownZero.setBit(i);
6018 continue;
6019 }
6020
6021 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
6022 // base vectors.
6023 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
6024 SDValue Vec = V.getOperand(0);
6025 int NumVecElts = Vec.getValueType().getVectorNumElements();
6026 if (Vec.isUndef() && Size == NumVecElts) {
6027 int Idx = V.getConstantOperandVal(2);
6028 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
6029 if (M < Idx || (Idx + NumSubElts) <= M)
6030 KnownUndef.setBit(i);
6031 }
6032 continue;
6033 }
6034
6035 // Attempt to extract from the source's constant bits.
6036 if (IsSrcConstant[SrcIdx]) {
6037 if (UndefSrcElts[SrcIdx][M])
6038 KnownUndef.setBit(i);
6039 else if (SrcEltBits[SrcIdx][M] == 0)
6040 KnownZero.setBit(i);
6041 }
6042 }
6043
6044 assert(VT.getVectorNumElements() == (unsigned)Size &&
6045 "Different mask size from vector size!");
6046 return true;
6047}
6048
6049// Replace target shuffle mask elements with known undef/zero sentinels.
6050 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
6051 const APInt &KnownUndef,
6052 const APInt &KnownZero,
6053 bool ResolveKnownZeros = true) {
6054 unsigned NumElts = Mask.size();
6055 assert(KnownUndef.getBitWidth() == NumElts &&
6056 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
6057
6058 for (unsigned i = 0; i != NumElts; ++i) {
6059 if (KnownUndef[i])
6060 Mask[i] = SM_SentinelUndef;
6061 else if (ResolveKnownZeros && KnownZero[i])
6062 Mask[i] = SM_SentinelZero;
6063 }
6064}
6065
6066// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
6067 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
6068 APInt &KnownUndef,
6069 APInt &KnownZero) {
6070 unsigned NumElts = Mask.size();
6071 KnownUndef = KnownZero = APInt::getZero(NumElts);
6072
6073 for (unsigned i = 0; i != NumElts; ++i) {
6074 int M = Mask[i];
6075 if (SM_SentinelUndef == M)
6076 KnownUndef.setBit(i);
6077 if (SM_SentinelZero == M)
6078 KnownZero.setBit(i);
6079 }
6080}
6081
6082// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
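// For example, assuming a VSELECT whose condition is the constant v4i32
// vector <-1, 0, -1, 0>: lanes with a non-zero condition select from the
// first value operand and lanes with a zero condition from the second, so
// the resulting mask is { 0, 5, 2, 7 }.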
6083 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
6084 SDValue Cond, bool IsBLENDV = false) {
6085 EVT CondVT = Cond.getValueType();
6086 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
6087 unsigned NumElts = CondVT.getVectorNumElements();
6088
6089 APInt UndefElts;
6090 SmallVector<APInt, 32> EltBits;
6091 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
6092 /*AllowWholeUndefs*/ true,
6093 /*AllowPartialUndefs*/ false))
6094 return false;
6095
6096 Mask.resize(NumElts, SM_SentinelUndef);
6097
6098 for (int i = 0; i != (int)NumElts; ++i) {
6099 Mask[i] = i;
6100 // Arbitrarily choose from the 2nd operand if the select condition element
6101 // is undef.
6102 // TODO: Can we do better by matching patterns such as even/odd?
6103 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
6104 (IsBLENDV && EltBits[i].isNonNegative()))
6105 Mask[i] += NumElts;
6106 }
6107
6108 return true;
6109}
6110
6111// Forward declaration (for getFauxShuffleMask recursive check).
6112static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6113 SmallVectorImpl<SDValue> &Inputs,
6114 SmallVectorImpl<int> &Mask,
6115 const SelectionDAG &DAG, unsigned Depth,
6116 bool ResolveKnownElts);
6117
6118// Attempt to decode ops that could be represented as a shuffle mask.
6119// The decoded shuffle mask may contain a different number of elements to the
6120// destination value type.
6121// TODO: Merge into getTargetShuffleInputs()
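// For example, assuming N is an X86ISD::VSHLI of a v2i64 value by 8 bits:
// the shift is a whole number of bytes, so it decodes to the byte shuffle
//   { Z, 0, 1, 2, 3, 4, 5, 6, Z, 8, 9, 10, 11, 12, 13, 14 }
// where Z is SM_SentinelZero, i.e. each 64-bit element shifted left by one
// byte.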
6122static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6123 SmallVectorImpl<int> &Mask,
6124 SmallVectorImpl<SDValue> &Ops,
6125 const SelectionDAG &DAG, unsigned Depth,
6126 bool ResolveKnownElts) {
6127 Mask.clear();
6128 Ops.clear();
6129
6130 MVT VT = N.getSimpleValueType();
6131 unsigned NumElts = VT.getVectorNumElements();
6132 unsigned NumSizeInBits = VT.getSizeInBits();
6133 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6134 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6135 return false;
6136 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6137 unsigned NumSizeInBytes = NumSizeInBits / 8;
6138 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6139
6140 unsigned Opcode = N.getOpcode();
6141 switch (Opcode) {
6142 case ISD::VECTOR_SHUFFLE: {
6143 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6144 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6145 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6146 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6147 Ops.push_back(N.getOperand(0));
6148 Ops.push_back(N.getOperand(1));
6149 return true;
6150 }
6151 return false;
6152 }
6153 case ISD::AND:
6154 case X86ISD::ANDNP: {
6155 // Attempt to decode as a per-byte mask.
6156 APInt UndefElts;
6157 SmallVector<APInt, 32> EltBits;
6158 SDValue N0 = N.getOperand(0);
6159 SDValue N1 = N.getOperand(1);
6160 bool IsAndN = (X86ISD::ANDNP == Opcode);
6161 uint64_t ZeroMask = IsAndN ? 255 : 0;
6162 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6163 /*AllowWholeUndefs*/ false,
6164 /*AllowPartialUndefs*/ false))
6165 return false;
6166 // We can't assume an undef src element gives an undef dst - the other src
6167 // might be zero.
6168 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6169 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6170 const APInt &ByteBits = EltBits[i];
6171 if (ByteBits != 0 && ByteBits != 255)
6172 return false;
6173 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6174 }
6175 Ops.push_back(IsAndN ? N1 : N0);
6176 return true;
6177 }
6178 case ISD::OR: {
6179 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6180 // is a valid shuffle index.
6181 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6182 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6183 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6184 return false;
6185
6186 SmallVector<int, 64> SrcMask0, SrcMask1;
6187 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6188 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6189 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6190 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6191 Depth + 1, true) ||
6192 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6193 Depth + 1, true))
6194 return false;
6195
6196 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6197 SmallVector<int, 64> Mask0, Mask1;
6198 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6199 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6200 for (int i = 0; i != (int)MaskSize; ++i) {
6201 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6202 // loops converting between OR and BLEND shuffles due to
6203 // canWidenShuffleElements merging away undef elements, meaning we
6204 // fail to recognise the OR as the undef element isn't known zero.
6205 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6206 Mask.push_back(SM_SentinelZero);
6207 else if (Mask1[i] == SM_SentinelZero)
6208 Mask.push_back(i);
6209 else if (Mask0[i] == SM_SentinelZero)
6210 Mask.push_back(i + MaskSize);
6211 else
6212 return false;
6213 }
6214 Ops.push_back(N.getOperand(0));
6215 Ops.push_back(N.getOperand(1));
6216 return true;
6217 }
6218 case ISD::CONCAT_VECTORS: {
6219 // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
6220 unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
6221 if (NumBitsPerElt == 64) {
6222 for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
6223 for (unsigned M = 0; M != NumSubElts; ++M)
6224 Mask.push_back((I * NumElts) + M);
6225 Ops.push_back(N.getOperand(I));
6226 }
6227 return true;
6228 }
6229 return false;
6230 }
6231 case ISD::INSERT_SUBVECTOR: {
6232 SDValue Src = N.getOperand(0);
6233 SDValue Sub = N.getOperand(1);
6234 EVT SubVT = Sub.getValueType();
6235 unsigned NumSubElts = SubVT.getVectorNumElements();
6236 uint64_t InsertIdx = N.getConstantOperandVal(2);
6237 // Subvector isn't demanded - just return the base vector.
6238 if (DemandedElts.extractBits(NumSubElts, InsertIdx) == 0) {
6239 Mask.resize(NumElts);
6240 std::iota(Mask.begin(), Mask.end(), 0);
6241 Ops.push_back(Src);
6242 return true;
6243 }
6244 // Handle CONCAT(SUB0, SUB1).
6245 // Limit to vXi64/splat cases to make the most of cross lane shuffles.
6246 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6247 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6248 Src.getOperand(0).isUndef() &&
6249 Src.getOperand(1).getValueType() == SubVT &&
6250 Src.getConstantOperandVal(2) == 0 &&
6251 (NumBitsPerElt == 64 || Src.getOperand(1) == Sub) &&
6252 SDNode::areOnlyUsersOf({N.getNode(), Src.getNode()}, Sub.getNode())) {
6253 Mask.resize(NumElts);
6254 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6255 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6256 Ops.push_back(Src.getOperand(1));
6257 Ops.push_back(Sub);
6258 return true;
6259 }
6260 if (!N->isOnlyUserOf(Sub.getNode()))
6261 return false;
6262
6263 SmallVector<int, 64> SubMask;
6264 SmallVector<SDValue, 2> SubInputs;
6265 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6266 EVT SubSrcVT = SubSrc.getValueType();
6267 if (!SubSrcVT.isVector())
6268 return false;
6269
6270 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6271 if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6272 SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6273 uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1);
6274 SDValue SubSrcSrc = SubSrc.getOperand(0);
6275 unsigned NumSubSrcSrcElts =
6276 SubSrcSrc.getValueType().getVectorNumElements();
6277 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6278 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6279 "Subvector valuetype mismatch");
6280 InsertIdx *= (MaxElts / NumElts);
6281 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6282 NumSubElts *= (MaxElts / NumElts);
6283 bool SrcIsUndef = Src.isUndef();
6284 for (int i = 0; i != (int)MaxElts; ++i)
6285 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6286 for (int i = 0; i != (int)NumSubElts; ++i)
6287 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6288 if (!SrcIsUndef)
6289 Ops.push_back(Src);
6290 Ops.push_back(SubSrcSrc);
6291 return true;
6292 }
6293
6294 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6295 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6296 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6297 Depth + 1, ResolveKnownElts))
6298 return false;
6299
6300 // Subvector shuffle inputs must not be larger than the subvector.
6301 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6302 return SubVT.getFixedSizeInBits() <
6303 SubInput.getValueSizeInBits().getFixedValue();
6304 }))
6305 return false;
6306
6307 if (SubMask.size() != NumSubElts) {
6308 assert(((SubMask.size() % NumSubElts) == 0 ||
6309 (NumSubElts % SubMask.size()) == 0) &&
6310 "Illegal submask scale");
6311 if ((NumSubElts % SubMask.size()) == 0) {
6312 int Scale = NumSubElts / SubMask.size();
6313 SmallVector<int, 64> ScaledSubMask;
6314 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6315 SubMask = ScaledSubMask;
6316 } else {
6317 int Scale = SubMask.size() / NumSubElts;
6318 NumSubElts = SubMask.size();
6319 NumElts *= Scale;
6320 InsertIdx *= Scale;
6321 }
6322 }
6323 Ops.push_back(Src);
6324 Ops.append(SubInputs.begin(), SubInputs.end());
6325 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6326 Mask.append(NumElts, SM_SentinelZero);
6327 else
6328 for (int i = 0; i != (int)NumElts; ++i)
6329 Mask.push_back(i);
6330 for (int i = 0; i != (int)NumSubElts; ++i) {
6331 int M = SubMask[i];
6332 if (0 <= M) {
6333 int InputIdx = M / NumSubElts;
6334 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6335 }
6336 Mask[i + InsertIdx] = M;
6337 }
6338 return true;
6339 }
6340 case X86ISD::PINSRB:
6341 case X86ISD::PINSRW:
6342 case ISD::SCALAR_TO_VECTOR:
6343 case ISD::INSERT_VECTOR_ELT: {
6344 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6345 // vector, for matching src/dst vector types.
6346 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6347
6348 unsigned DstIdx = 0;
6349 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6350 // Check we have an in-range constant insertion index.
6351 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6352 N.getConstantOperandAPInt(2).uge(NumElts))
6353 return false;
6354 DstIdx = N.getConstantOperandVal(2);
6355
6356 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6357 if (X86::isZeroNode(Scl)) {
6358 Ops.push_back(N.getOperand(0));
6359 for (unsigned i = 0; i != NumElts; ++i)
6360 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6361 return true;
6362 }
6363 }
6364
6365 // Peek through trunc/aext/zext/bitcast.
6366 // TODO: aext shouldn't require SM_SentinelZero padding.
6367 // TODO: handle shift of scalars.
6368 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6369 while (Scl.getOpcode() == ISD::TRUNCATE ||
6370 Scl.getOpcode() == ISD::ANY_EXTEND ||
6371 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6372 (Scl.getOpcode() == ISD::BITCAST &&
6373 Scl.getScalarValueSizeInBits() ==
6374 Scl.getOperand(0).getScalarValueSizeInBits())) {
6375 Scl = Scl.getOperand(0);
6376 MinBitsPerElt =
6377 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6378 }
6379 if ((MinBitsPerElt % 8) != 0)
6380 return false;
6381
6382 // Attempt to find the source vector the scalar was extracted from.
6383 SDValue SrcExtract;
6384 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6385 Scl.getOpcode() == X86ISD::PEXTRW ||
6386 Scl.getOpcode() == X86ISD::PEXTRB) &&
6387 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6388 SrcExtract = Scl;
6389 }
6390 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6391 return false;
6392
6393 SDValue SrcVec = SrcExtract.getOperand(0);
6394 EVT SrcVT = SrcVec.getValueType();
6395 if (!SrcVT.getScalarType().isByteSized())
6396 return false;
6397 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6398 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6399 unsigned DstByte = DstIdx * NumBytesPerElt;
6400 MinBitsPerElt =
6401 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6402
6403 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6404 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6405 Ops.push_back(SrcVec);
6406 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6407 } else {
6408 Ops.push_back(SrcVec);
6409 Ops.push_back(N.getOperand(0));
6410 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6411 Mask.push_back(NumSizeInBytes + i);
6412 }
6413
6414 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6415 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6416 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6417 Mask[DstByte + i] = SrcByte + i;
6418 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6419 Mask[DstByte + i] = SM_SentinelZero;
6420 return true;
6421 }
6422 case X86ISD::PACKSS:
6423 case X86ISD::PACKUS: {
6424 SDValue N0 = N.getOperand(0);
6425 SDValue N1 = N.getOperand(1);
6426 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6427 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6428 "Unexpected input value type");
6429
6430 APInt EltsLHS, EltsRHS;
6431 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6432
6433 // If we know input saturation won't happen (or we don't care for particular
6434 // lanes), we can treat this as a truncation shuffle.
6435 bool Offset0 = false, Offset1 = false;
6436 if (Opcode == X86ISD::PACKSS) {
6437 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6438 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6439 (!(N1.isUndef() || EltsRHS.isZero()) &&
6440 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6441 return false;
6442 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6443 // PACKSS then it was likely being used for sign-extension for a
6444 // truncation, so just peek through and adjust the mask accordingly.
6445 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6446 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6447 Offset0 = true;
6448 N0 = N0.getOperand(0);
6449 }
6450 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6451 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6452 Offset1 = true;
6453 N1 = N1.getOperand(0);
6454 }
6455 } else {
6456 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6457 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6458 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6459 (!(N1.isUndef() || EltsRHS.isZero()) &&
6460 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6461 return false;
6462 }
6463
6464 bool IsUnary = (N0 == N1);
6465
6466 Ops.push_back(N0);
6467 if (!IsUnary)
6468 Ops.push_back(N1);
6469
6470 createPackShuffleMask(VT, Mask, IsUnary);
6471
6472 if (Offset0 || Offset1) {
6473 for (int &M : Mask)
6474 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6475 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6476 ++M;
6477 }
6478 return true;
6479 }
6480 case ISD::VSELECT:
6481 case X86ISD::BLENDV: {
6482 SDValue Cond = N.getOperand(0);
6483 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6484 Ops.push_back(N.getOperand(1));
6485 Ops.push_back(N.getOperand(2));
6486 return true;
6487 }
6488 return false;
6489 }
6490 case X86ISD::VTRUNC: {
6491 SDValue Src = N.getOperand(0);
6492 EVT SrcVT = Src.getValueType();
6493 if (SrcVT.getSizeInBits() != NumSizeInBits)
6494 return false;
6495 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6496 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6497 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6498 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6499 for (unsigned i = 0; i != NumSrcElts; ++i)
6500 Mask.push_back(i * Scale);
6501 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6502 Ops.push_back(Src);
6503 return true;
6504 }
6505 case ISD::SHL:
6506 case ISD::SRL: {
6507 APInt UndefElts;
6508 SmallVector<APInt, 32> EltBits;
6509 if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt,
6510 UndefElts, EltBits,
6511 /*AllowWholeUndefs*/ true,
6512 /*AllowPartialUndefs*/ false))
6513 return false;
6514
6515 // We can only decode 'whole byte' bit shifts as shuffles.
6516 for (unsigned I = 0; I != NumElts; ++I)
6517 if (DemandedElts[I] && !UndefElts[I] &&
6518 (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
6519 return false;
6520
6521 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6522 Ops.push_back(N.getOperand(0));
6523
6524 for (unsigned I = 0; I != NumElts; ++I) {
6525 if (!DemandedElts[I] || UndefElts[I])
6526 continue;
6527 unsigned ByteShift = EltBits[I].getZExtValue() / 8;
6528 unsigned Lo = I * NumBytesPerElt;
6529 unsigned Hi = Lo + NumBytesPerElt;
6530 // Clear mask to all zeros and insert the shifted byte indices.
6531 std::fill(Mask.begin() + Lo, Mask.begin() + Hi, SM_SentinelZero);
6532 if (ISD::SHL == Opcode)
6533 std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
6534 else
6535 std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
6536 Lo + ByteShift);
6537 }
6538 return true;
6539 }
6540 case X86ISD::VSHLI:
6541 case X86ISD::VSRLI: {
6542 uint64_t ShiftVal = N.getConstantOperandVal(1);
6543 // Out of range bit shifts are guaranteed to be zero.
6544 if (NumBitsPerElt <= ShiftVal) {
6545 Mask.append(NumElts, SM_SentinelZero);
6546 return true;
6547 }
6548
6549 // We can only decode 'whole byte' bit shifts as shuffles.
6550 if ((ShiftVal % 8) != 0)
6551 break;
6552
6553 uint64_t ByteShift = ShiftVal / 8;
6554 Ops.push_back(N.getOperand(0));
6555
6556 // Clear mask to all zeros and insert the shifted byte indices.
6557 Mask.append(NumSizeInBytes, SM_SentinelZero);
6558
6559 if (X86ISD::VSHLI == Opcode) {
6560 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6561 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6562 Mask[i + j] = i + j - ByteShift;
6563 } else {
6564 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6565 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6566 Mask[i + j - ByteShift] = i + j;
6567 }
6568 return true;
6569 }
6570 case X86ISD::VROTLI:
6571 case X86ISD::VROTRI: {
6572 // We can only decode 'whole byte' bit rotates as shuffles.
6573 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6574 if ((RotateVal % 8) != 0)
6575 return false;
6576 Ops.push_back(N.getOperand(0));
6577 int Offset = RotateVal / 8;
6578 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6579 for (int i = 0; i != (int)NumElts; ++i) {
6580 int BaseIdx = i * NumBytesPerElt;
6581 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6582 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6583 }
6584 }
6585 return true;
6586 }
6587 case X86ISD::VBROADCAST: {
6588 SDValue Src = N.getOperand(0);
6589 if (!Src.getSimpleValueType().isVector()) {
6590 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6591 !isNullConstant(Src.getOperand(1)) ||
6592 Src.getOperand(0).getValueType().getScalarType() !=
6593 VT.getScalarType())
6594 return false;
6595 Src = Src.getOperand(0);
6596 }
6597 Ops.push_back(Src);
6598 Mask.append(NumElts, 0);
6599 return true;
6600 }
6601 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6602 SDValue Src = N.getOperand(0);
6603 EVT SrcVT = Src.getValueType();
6604 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6605
6606 // Extended source must be a simple vector.
6607 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6608 (NumBitsPerSrcElt % 8) != 0)
6609 return false;
6610
6611 // We can only handle all-signbits extensions.
6612 APInt DemandedSrcElts =
6613 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6614 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6615 return false;
6616
6617 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6618 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6619 for (unsigned I = 0; I != NumElts; ++I)
6620 Mask.append(Scale, I);
6621 Ops.push_back(Src);
6622 return true;
6623 }
6624 case ISD::ZERO_EXTEND:
6625 case ISD::ANY_EXTEND:
6626 case ISD::ZERO_EXTEND_VECTOR_INREG:
6627 case ISD::ANY_EXTEND_VECTOR_INREG: {
6628 SDValue Src = N.getOperand(0);
6629 EVT SrcVT = Src.getValueType();
6630
6631 // Extended source must be a simple vector.
6632 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6633 (SrcVT.getScalarSizeInBits() % 8) != 0)
6634 return false;
6635
6636 bool IsAnyExtend =
6637 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6638 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6639 IsAnyExtend, Mask);
6640 Ops.push_back(Src);
6641 return true;
6642 }
6643 }
6644
6645 return false;
6646}
6647
6648/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
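/// For example, assuming Inputs = { A, A } with the width-4 mask
/// { 0, 4, 1, 5 }: the repeated input is dropped and the mask is remapped to
/// { 0, 0, 1, 1 } over the single remaining input A.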
6649 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6650 SmallVectorImpl<int> &Mask) {
6651 int MaskWidth = Mask.size();
6652 SmallVector<SDValue, 16> UsedInputs;
6653 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6654 int lo = UsedInputs.size() * MaskWidth;
6655 int hi = lo + MaskWidth;
6656
6657 // Strip UNDEF input usage.
6658 if (Inputs[i].isUndef())
6659 for (int &M : Mask)
6660 if ((lo <= M) && (M < hi))
6661 M = SM_SentinelUndef;
6662
6663 // Check for unused inputs.
6664 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6665 for (int &M : Mask)
6666 if (lo <= M)
6667 M -= MaskWidth;
6668 continue;
6669 }
6670
6671 // Check for repeated inputs.
6672 bool IsRepeat = false;
6673 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6674 if (UsedInputs[j] != Inputs[i])
6675 continue;
6676 for (int &M : Mask)
6677 if (lo <= M)
6678 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6679 IsRepeat = true;
6680 break;
6681 }
6682 if (IsRepeat)
6683 continue;
6684
6685 UsedInputs.push_back(Inputs[i]);
6686 }
6687 Inputs = UsedInputs;
6688}
6689
6690/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6691/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6692/// Returns true if the target shuffle mask was decoded.
6693static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6694 SmallVectorImpl<SDValue> &Inputs,
6695 SmallVectorImpl<int> &Mask,
6696 APInt &KnownUndef, APInt &KnownZero,
6697 const SelectionDAG &DAG, unsigned Depth,
6698 bool ResolveKnownElts) {
6699 if (Depth >= SelectionDAG::MaxRecursionDepth)
6700 return false; // Limit search depth.
6701
6702 EVT VT = Op.getValueType();
6703 if (!VT.isSimple() || !VT.isVector())
6704 return false;
6705
6706 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6707 if (ResolveKnownElts)
6708 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6709 return true;
6710 }
6711 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6712 ResolveKnownElts)) {
6713 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6714 return true;
6715 }
6716 return false;
6717}
6718
6719static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6720 SmallVectorImpl<SDValue> &Inputs,
6721 SmallVectorImpl<int> &Mask,
6722 const SelectionDAG &DAG, unsigned Depth,
6723 bool ResolveKnownElts) {
6724 APInt KnownUndef, KnownZero;
6725 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6726 KnownZero, DAG, Depth, ResolveKnownElts);
6727}
6728
6729 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6730 SmallVectorImpl<int> &Mask,
6731 const SelectionDAG &DAG, unsigned Depth = 0,
6732 bool ResolveKnownElts = true) {
6733 EVT VT = Op.getValueType();
6734 if (!VT.isSimple() || !VT.isVector())
6735 return false;
6736
6737 unsigned NumElts = Op.getValueType().getVectorNumElements();
6738 APInt DemandedElts = APInt::getAllOnes(NumElts);
6739 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6740 ResolveKnownElts);
6741}
6742
6743// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6744static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6745 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6746 SelectionDAG &DAG) {
6747 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6748 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6749 "Unknown broadcast load type");
6750
6751 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6752 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6753 return SDValue();
6754 
6755 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6756 TypeSize::getFixed(Offset), DL);
6757 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6758 SDValue Ops[] = {Mem->getChain(), Ptr};
6759 SDValue BcstLd = DAG.getMemIntrinsicNode(
6760 Opcode, DL, Tys, Ops, MemVT,
6761 DAG.getMachineFunction().getMachineMemOperand(
6762 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6763 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6764 return BcstLd;
6765}
6766
6767/// Returns the scalar element that will make up the i'th
6768/// element of the result of the vector shuffle.
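/// For example, assuming Op is a v4i32 shuffle of build_vector <a, b, c, d>
/// and undef with mask { 3, 3, -1, 0 }: Index = 0 recurses into the
/// build_vector and returns d, while Index = 2 yields an UNDEF scalar.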
6769static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6770 SelectionDAG &DAG, unsigned Depth) {
6771 if (Depth >= SelectionDAG::MaxRecursionDepth)
6772 return SDValue(); // Limit search depth.
6773
6774 EVT VT = Op.getValueType();
6775 unsigned Opcode = Op.getOpcode();
6776 unsigned NumElems = VT.getVectorNumElements();
6777
6778 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6779 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6780 int Elt = SV->getMaskElt(Index);
6781
6782 if (Elt < 0)
6783 return DAG.getUNDEF(VT.getVectorElementType());
6784
6785 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6786 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6787 }
6788
6789 // Recurse into target specific vector shuffles to find scalars.
6790 if (isTargetShuffle(Opcode)) {
6791 MVT ShufVT = VT.getSimpleVT();
6792 MVT ShufSVT = ShufVT.getVectorElementType();
6793 int NumElems = (int)ShufVT.getVectorNumElements();
6794 SmallVector<int, 16> ShuffleMask;
6795 SmallVector<SDValue, 16> ShuffleOps;
6796 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6797 return SDValue();
6798
6799 int Elt = ShuffleMask[Index];
6800 if (Elt == SM_SentinelZero)
6801 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6802 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6803 if (Elt == SM_SentinelUndef)
6804 return DAG.getUNDEF(ShufSVT);
6805
6806 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6807 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6808 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6809 }
6810
6811 // Recurse into insert_subvector base/sub vector to find scalars.
6812 if (Opcode == ISD::INSERT_SUBVECTOR) {
6813 SDValue Vec = Op.getOperand(0);
6814 SDValue Sub = Op.getOperand(1);
6815 uint64_t SubIdx = Op.getConstantOperandVal(2);
6816 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6817
6818 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6819 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6820 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6821 }
6822
6823 // Recurse into concat_vectors sub vector to find scalars.
6824 if (Opcode == ISD::CONCAT_VECTORS) {
6825 EVT SubVT = Op.getOperand(0).getValueType();
6826 unsigned NumSubElts = SubVT.getVectorNumElements();
6827 uint64_t SubIdx = Index / NumSubElts;
6828 uint64_t SubElt = Index % NumSubElts;
6829 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6830 }
6831
6832 // Recurse into extract_subvector src vector to find scalars.
6833 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6834 SDValue Src = Op.getOperand(0);
6835 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6836 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6837 }
6838
6839 // We only peek through bitcasts of the same vector width.
6840 if (Opcode == ISD::BITCAST) {
6841 SDValue Src = Op.getOperand(0);
6842 EVT SrcVT = Src.getValueType();
6843 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6844 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6845 return SDValue();
6846 }
6847
6848 // Actual nodes that may contain scalar elements
6849
6850 // For insert_vector_elt - either return the index matching scalar or recurse
6851 // into the base vector.
6852 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6853 isa<ConstantSDNode>(Op.getOperand(2))) {
6854 if (Op.getConstantOperandAPInt(2) == Index)
6855 return Op.getOperand(1);
6856 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6857 }
6858
6859 if (Opcode == ISD::SCALAR_TO_VECTOR)
6860 return (Index == 0) ? Op.getOperand(0)
6861 : DAG.getUNDEF(VT.getVectorElementType());
6862
6863 if (Opcode == ISD::BUILD_VECTOR)
6864 return Op.getOperand(Index);
6865
6866 return SDValue();
6867}
6868
6869// Use PINSRB/PINSRW/PINSRD to create a build vector.
6870 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6871 const APInt &NonZeroMask,
6872 unsigned NumNonZero, unsigned NumZero,
6873 SelectionDAG &DAG,
6874 const X86Subtarget &Subtarget) {
6875 MVT VT = Op.getSimpleValueType();
6876 unsigned NumElts = VT.getVectorNumElements();
6877 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6878 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6879 "Illegal vector insertion");
6880
6881 SDValue V;
6882 bool First = true;
6883
6884 for (unsigned i = 0; i < NumElts; ++i) {
6885 bool IsNonZero = NonZeroMask[i];
6886 if (!IsNonZero)
6887 continue;
6888
6889 // If the build vector contains zeros, or our first insertion is not at the
6890 // first index, then insert into a zero vector to break any register
6891 // dependency; otherwise use SCALAR_TO_VECTOR.
6892 if (First) {
6893 First = false;
6894 if (NumZero || 0 != i)
6895 V = getZeroVector(VT, Subtarget, DAG, DL);
6896 else {
6897 assert(0 == i && "Expected insertion into zero-index");
6898 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6899 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6900 V = DAG.getBitcast(VT, V);
6901 continue;
6902 }
6903 }
6904 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6905 DAG.getVectorIdxConstant(i, DL));
6906 }
6907
6908 return V;
6909}
6910
6911/// Custom lower build_vector of v16i8.
6912 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6913 const APInt &NonZeroMask,
6914 unsigned NumNonZero, unsigned NumZero,
6915 SelectionDAG &DAG,
6916 const X86Subtarget &Subtarget) {
6917 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6918 return SDValue();
6919
6920 // SSE4.1 - use PINSRB to insert each byte directly.
6921 if (Subtarget.hasSSE41())
6922 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6923 DAG, Subtarget);
6924
6925 SDValue V;
6926
6927   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6928   // If both of the lowest 16-bit halves are non-zero, then convert to MOVD.
6929 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6930 !NonZeroMask.extractBits(2, 2).isZero()) {
6931 for (unsigned I = 0; I != 4; ++I) {
6932 if (!NonZeroMask[I])
6933 continue;
6934 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6935 if (I != 0)
6936 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6937 DAG.getConstant(I * 8, DL, MVT::i8));
6938 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6939 }
6940 assert(V && "Failed to fold v16i8 vector to zero");
6941 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6942 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6943 V = DAG.getBitcast(MVT::v8i16, V);
6944 }
6945 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6946 bool ThisIsNonZero = NonZeroMask[i];
6947 bool NextIsNonZero = NonZeroMask[i + 1];
6948 if (!ThisIsNonZero && !NextIsNonZero)
6949 continue;
6950
6951 SDValue Elt;
6952 if (ThisIsNonZero) {
6953 if (NumZero || NextIsNonZero)
6954 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6955 else
6956 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6957 }
6958
6959 if (NextIsNonZero) {
6960 SDValue NextElt = Op.getOperand(i + 1);
6961 if (i == 0 && NumZero)
6962 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6963 else
6964 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6965 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6966 DAG.getConstant(8, DL, MVT::i8));
6967 if (ThisIsNonZero)
6968 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6969 else
6970 Elt = NextElt;
6971 }
6972
6973 // If our first insertion is not the first index or zeros are needed, then
6974 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6975 // elements undefined).
6976 if (!V) {
6977 if (i != 0 || NumZero)
6978 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6979 else {
6980 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6981 V = DAG.getBitcast(MVT::v8i16, V);
6982 continue;
6983 }
6984 }
6985 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6986 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6987 DAG.getVectorIdxConstant(i / 2, DL));
6988 }
6989
6990 return DAG.getBitcast(MVT::v16i8, V);
6991}
6992
6993/// Custom lower build_vector of v8i16.
6994 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6995                                      const APInt &NonZeroMask,
6996 unsigned NumNonZero, unsigned NumZero,
6997 SelectionDAG &DAG,
6998 const X86Subtarget &Subtarget) {
6999 if (NumNonZero > 4 && !Subtarget.hasSSE41())
7000 return SDValue();
7001
7002   // Use PINSRW to insert each 16-bit element directly.
7003 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
7004 Subtarget);
7005}
7006
7007/// Custom lower build_vector of v4i32 or v4f32.
7008 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
7009                                      SelectionDAG &DAG,
7010 const X86Subtarget &Subtarget) {
7011 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
7012 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
7013 // Because we're creating a less complicated build vector here, we may enable
7014 // further folding of the MOVDDUP via shuffle transforms.
7015 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
7016 Op.getOperand(0) == Op.getOperand(2) &&
7017 Op.getOperand(1) == Op.getOperand(3) &&
7018 Op.getOperand(0) != Op.getOperand(1)) {
7019 MVT VT = Op.getSimpleValueType();
7020 MVT EltVT = VT.getVectorElementType();
7021 // Create a new build vector with the first 2 elements followed by undef
7022 // padding, bitcast to v2f64, duplicate, and bitcast back.
7023 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
7024 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
7025 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
7026 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
7027 return DAG.getBitcast(VT, Dup);
7028 }
7029
7030 // Find all zeroable elements.
7031 std::bitset<4> Zeroable, Undefs;
7032 for (int i = 0; i < 4; ++i) {
7033 SDValue Elt = Op.getOperand(i);
7034 Undefs[i] = Elt.isUndef();
7035 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
7036 }
7037 assert(Zeroable.size() - Zeroable.count() > 1 &&
7038 "We expect at least two non-zero elements!");
7039
7040 // We only know how to deal with build_vector nodes where elements are either
7041 // zeroable or extract_vector_elt with constant index.
7042 SDValue FirstNonZero;
7043 unsigned FirstNonZeroIdx;
7044 for (unsigned i = 0; i < 4; ++i) {
7045 if (Zeroable[i])
7046 continue;
7047 SDValue Elt = Op.getOperand(i);
7048     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7049         !isa<ConstantSDNode>(Elt.getOperand(1)))
7050       return SDValue();
7051 // Make sure that this node is extracting from a 128-bit vector.
7052 MVT VT = Elt.getOperand(0).getSimpleValueType();
7053 if (!VT.is128BitVector())
7054 return SDValue();
7055 if (!FirstNonZero.getNode()) {
7056 FirstNonZero = Elt;
7057 FirstNonZeroIdx = i;
7058 }
7059 }
7060
7061 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
7062 SDValue V1 = FirstNonZero.getOperand(0);
7063 MVT VT = V1.getSimpleValueType();
7064
7065 // See if this build_vector can be lowered as a blend with zero.
7066 SDValue Elt;
7067 unsigned EltMaskIdx, EltIdx;
7068 int Mask[4];
7069 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
7070 if (Zeroable[EltIdx]) {
7071 // The zero vector will be on the right hand side.
7072 Mask[EltIdx] = EltIdx+4;
7073 continue;
7074 }
7075
7076 Elt = Op->getOperand(EltIdx);
7077     // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
7078 EltMaskIdx = Elt.getConstantOperandVal(1);
7079 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
7080 break;
7081 Mask[EltIdx] = EltIdx;
7082 }
7083
7084 if (EltIdx == 4) {
7085 // Let the shuffle legalizer deal with blend operations.
7086 SDValue VZeroOrUndef = (Zeroable == Undefs)
7087 ? DAG.getUNDEF(VT)
7088 : getZeroVector(VT, Subtarget, DAG, DL);
7089 if (V1.getSimpleValueType() != VT)
7090 V1 = DAG.getBitcast(VT, V1);
7091 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
7092 }
7093
7094   // See if we can lower this build_vector to an INSERTPS.
7095 if (!Subtarget.hasSSE41())
7096 return SDValue();
7097
7098 SDValue V2 = Elt.getOperand(0);
7099 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
7100 V1 = SDValue();
7101
7102 bool CanFold = true;
7103 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
7104 if (Zeroable[i])
7105 continue;
7106
7107 SDValue Current = Op->getOperand(i);
7108 SDValue SrcVector = Current->getOperand(0);
7109 if (!V1.getNode())
7110 V1 = SrcVector;
7111 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
7112 }
7113
7114 if (!CanFold)
7115 return SDValue();
7116
7117 assert(V1.getNode() && "Expected at least two non-zero elements!");
7118 if (V1.getSimpleValueType() != MVT::v4f32)
7119 V1 = DAG.getBitcast(MVT::v4f32, V1);
7120 if (V2.getSimpleValueType() != MVT::v4f32)
7121 V2 = DAG.getBitcast(MVT::v4f32, V2);
7122
7123 // Ok, we can emit an INSERTPS instruction.
7124 unsigned ZMask = Zeroable.to_ulong();
7125
7126 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
7127 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
7128 SDValue Result =
7129 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
7130 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
7131 return DAG.getBitcast(VT, Result);
7132}
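// For reference, the InsertPSMask built above follows the usual INSERTPS
// immediate layout (summarized here for readability):
//   bits [7:6] - source element selected from V2   (EltMaskIdx)
//   bits [5:4] - destination element written in V1 (EltIdx)
//   bits [3:0] - destination elements forced to zero (Zeroable mask)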
7133
7134/// Return a vector logical shift node.
7135static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
7136 SelectionDAG &DAG, const TargetLowering &TLI,
7137 const SDLoc &dl) {
7138 assert(VT.is128BitVector() && "Unknown type for VShift");
7139 MVT ShVT = MVT::v16i8;
7140 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
7141 SrcOp = DAG.getBitcast(ShVT, SrcOp);
7142 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
7143 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
7144 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
7145}
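// Example (a sketch): getVShift(/*isLeft=*/true, MVT::v2i64, X, 64, ...) yields
//   bitcast v2i64 (VSHLDQ (bitcast v16i8 X), 8)
// i.e. a whole-register byte shift by NumBits / 8 = 8 bytes (PSLLDQ).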
7146
7147 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
7148                                       SelectionDAG &DAG) {
7149
7150   // Check if the scalar load can be widened into a vector load, and if the
7151   // address is "base + cst", see if the cst can be "absorbed" into the
7152   // shuffle mask.
7153   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7154     SDValue Ptr = LD->getBasePtr();
7155 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7156 return SDValue();
7157 EVT PVT = LD->getValueType(0);
7158 if (PVT != MVT::i32 && PVT != MVT::f32)
7159 return SDValue();
7160
7161 int FI = -1;
7162 int64_t Offset = 0;
7163     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7164       FI = FINode->getIndex();
7165 Offset = 0;
7166 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7167 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7168 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7169 Offset = Ptr.getConstantOperandVal(1);
7170 Ptr = Ptr.getOperand(0);
7171 } else {
7172 return SDValue();
7173 }
7174
7175 // FIXME: 256-bit vector instructions don't require a strict alignment,
7176 // improve this code to support it better.
7177 Align RequiredAlign(VT.getSizeInBits() / 8);
7178 SDValue Chain = LD->getChain();
7179 // Make sure the stack object alignment is at least 16 or 32.
7180     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7181     MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7182 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7183 if (MFI.isFixedObjectIndex(FI)) {
7184         // Can't change the alignment. FIXME: It's possible to compute the
7185         // exact stack offset and reference FI + adjusted offset instead,
7186         // if someone *really* cares about this; that's the way to implement it.
7187 return SDValue();
7188 } else {
7189 MFI.setObjectAlignment(FI, RequiredAlign);
7190 }
7191 }
7192
7193     // (Offset % 16 or 32) must be a multiple of 4. The address is then
7194 // Ptr + (Offset & ~15).
7195 if (Offset < 0)
7196 return SDValue();
7197 if ((Offset % RequiredAlign.value()) & 3)
7198 return SDValue();
7199 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7200 if (StartOffset) {
7201 SDLoc DL(Ptr);
7202 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7203 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7204 }
7205
7206 int EltNo = (Offset - StartOffset) >> 2;
7207 unsigned NumElems = VT.getVectorNumElements();
7208
7209 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7210 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7211 LD->getPointerInfo().getWithOffset(StartOffset));
7212
7213 SmallVector<int, 8> Mask(NumElems, EltNo);
7214
7215 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7216 }
7217
7218 return SDValue();
7219}
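// Example (a sketch, assuming the stack object can be given 16-byte alignment):
// widening a scalar f32 load from FI+8 into a v4f32 splat loads the whole
// 16-byte slot at FI+0 as v4f32 and splats element (8 - 0) >> 2 == 2 with the
// shuffle mask <2, 2, 2, 2>.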
7220
7221 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7222static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7223 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7224 auto *BaseLd = cast<LoadSDNode>(Elt);
7225 if (!BaseLd->isSimple())
7226 return false;
7227 Ld = BaseLd;
7228 ByteOffset = 0;
7229 return true;
7230 }
7231
7232 switch (Elt.getOpcode()) {
7233 case ISD::BITCAST:
7234 case ISD::TRUNCATE:
7235   case ISD::SCALAR_TO_VECTOR:
7236     return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7237 case ISD::SRL:
7238 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7239 uint64_t Amt = AmtC->getZExtValue();
7240 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7241 ByteOffset += Amt / 8;
7242 return true;
7243 }
7244 }
7245 break;
7246   case ISD::EXTRACT_VECTOR_ELT:
7247     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7248 SDValue Src = Elt.getOperand(0);
7249 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7250 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7251 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7252 findEltLoadSrc(Src, Ld, ByteOffset)) {
7253 uint64_t Idx = IdxC->getZExtValue();
7254 ByteOffset += Idx * (SrcSizeInBits / 8);
7255 return true;
7256 }
7257 }
7258 break;
7259 }
7260
7261 return false;
7262}
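// Example (a sketch): for Elt = (srl (i32 load %p), 16) this returns the load
// in Ld with ByteOffset = 2, i.e. the element covers bytes [2,4) of the loaded
// value; an extract_vector_elt with a constant index is folded the same way,
// scaled by the source element size.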
7263
7264/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7265/// elements can be replaced by a single large load which has the same value as
7266/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7267///
7268/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7269 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7270                                         const SDLoc &DL, SelectionDAG &DAG,
7271 const X86Subtarget &Subtarget,
7272 bool IsAfterLegalize) {
7273 if ((VT.getScalarSizeInBits() % 8) != 0)
7274 return SDValue();
7275
7276 unsigned NumElems = Elts.size();
7277
7278 int LastLoadedElt = -1;
7279 APInt LoadMask = APInt::getZero(NumElems);
7280 APInt ZeroMask = APInt::getZero(NumElems);
7281 APInt UndefMask = APInt::getZero(NumElems);
7282
7283 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7284 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7285
7286 // For each element in the initializer, see if we've found a load, zero or an
7287 // undef.
7288 for (unsigned i = 0; i < NumElems; ++i) {
7289 SDValue Elt = peekThroughBitcasts(Elts[i]);
7290 if (!Elt.getNode())
7291 return SDValue();
7292 if (Elt.isUndef()) {
7293 UndefMask.setBit(i);
7294 continue;
7295 }
7296     if (X86::isZeroNode(Elt)) {
7297       ZeroMask.setBit(i);
7298 continue;
7299 }
7300
7301 // Each loaded element must be the correct fractional portion of the
7302 // requested vector load.
7303 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7304 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7305 return SDValue();
7306
7307 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7308 return SDValue();
7309 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7310 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7311 return SDValue();
7312
7313 LoadMask.setBit(i);
7314 LastLoadedElt = i;
7315 }
7316 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7317 NumElems &&
7318 "Incomplete element masks");
7319
7320 // Handle Special Cases - all undef or undef/zero.
7321 if (UndefMask.popcount() == NumElems)
7322 return DAG.getUNDEF(VT);
7323 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7324 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7325 : DAG.getConstantFP(0.0, DL, VT);
7326
7327 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7328 int FirstLoadedElt = LoadMask.countr_zero();
7329 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7330 EVT EltBaseVT = EltBase.getValueType();
7331 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7332 "Register/Memory size mismatch");
7333 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7334 assert(LDBase && "Did not find base load for merging consecutive loads");
7335 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7336 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7337 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7338 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7339 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7340
7341 // TODO: Support offsetting the base load.
7342 if (ByteOffsets[FirstLoadedElt] != 0)
7343 return SDValue();
7344
7345 // Check to see if the element's load is consecutive to the base load
7346 // or offset from a previous (already checked) load.
7347 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7348 LoadSDNode *Ld = Loads[EltIdx];
7349 int64_t ByteOffset = ByteOffsets[EltIdx];
7350 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7351 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7352 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7353 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7354 }
7355 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7356 EltIdx - FirstLoadedElt);
7357 };
7358
7359   // Consecutive loads can contain UNDEFs but not ZERO elements.
7360   // Consecutive loads with UNDEF and ZERO elements require an additional
7361   // shuffle stage to clear the ZERO elements.
7362 bool IsConsecutiveLoad = true;
7363 bool IsConsecutiveLoadWithZeros = true;
7364 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7365 if (LoadMask[i]) {
7366 if (!CheckConsecutiveLoad(LDBase, i)) {
7367 IsConsecutiveLoad = false;
7368 IsConsecutiveLoadWithZeros = false;
7369 break;
7370 }
7371 } else if (ZeroMask[i]) {
7372 IsConsecutiveLoad = false;
7373 }
7374 }
7375
7376 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7377 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7378 assert(LDBase->isSimple() &&
7379 "Cannot merge volatile or atomic loads.");
7380 SDValue NewLd =
7381 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7382 LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
7383 for (auto *LD : Loads)
7384 if (LD)
7385 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7386 return NewLd;
7387 };
7388
7389 // Check if the base load is entirely dereferenceable.
7390 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7391 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7392
7393 // LOAD - all consecutive load/undefs (must start/end with a load or be
7394 // entirely dereferenceable). If we have found an entire vector of loads and
7395 // undefs, then return a large load of the entire vector width starting at the
7396 // base pointer. If the vector contains zeros, then attempt to shuffle those
7397 // elements.
7398 if (FirstLoadedElt == 0 &&
7399 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7400 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7401 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7402 return SDValue();
7403
7404 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7405 // will lower to regular temporal loads and use the cache.
7406 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7407 VT.is256BitVector() && !Subtarget.hasInt256())
7408 return SDValue();
7409
7410 if (NumElems == 1)
7411 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7412
7413 if (!ZeroMask)
7414 return CreateLoad(VT, LDBase);
7415
7416 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7417 // vector and a zero vector to clear out the zero elements.
7418 if (!IsAfterLegalize && VT.isVector()) {
7419 unsigned NumMaskElts = VT.getVectorNumElements();
7420 if ((NumMaskElts % NumElems) == 0) {
7421 unsigned Scale = NumMaskElts / NumElems;
7422 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7423 for (unsigned i = 0; i < NumElems; ++i) {
7424 if (UndefMask[i])
7425 continue;
7426 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7427 for (unsigned j = 0; j != Scale; ++j)
7428 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7429 }
7430 SDValue V = CreateLoad(VT, LDBase);
7431 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7432 : DAG.getConstantFP(0.0, DL, VT);
7433 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7434 }
7435 }
7436 }
7437
7438 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7439 if (VT.is256BitVector() || VT.is512BitVector()) {
7440 unsigned HalfNumElems = NumElems / 2;
7441 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7442 EVT HalfVT =
7443 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7444 SDValue HalfLD =
7445 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7446 DAG, Subtarget, IsAfterLegalize);
7447 if (HalfLD)
7448 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7449 HalfLD, DAG.getVectorIdxConstant(0, DL));
7450 }
7451 }
7452
7453   // VZEXT_LOAD - consecutive 16/32/64-bit load/undefs followed by zeros/undefs.
7454 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7455 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7456 LoadSizeInBits == 64) &&
7457 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7458 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7459 : MVT::getIntegerVT(LoadSizeInBits);
7460 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7461 // Allow v4f32 on SSE1 only targets.
7462 // FIXME: Add more isel patterns so we can just use VT directly.
7463 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7464 VecVT = MVT::v4f32;
7465 if (TLI.isTypeLegal(VecVT)) {
7466 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7467 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7468 SDValue ResNode = DAG.getMemIntrinsicNode(
7469 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7470           LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
7471       for (auto *LD : Loads)
7472 if (LD)
7473 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7474 return DAG.getBitcast(VT, ResNode);
7475 }
7476 }
7477
7478 // BROADCAST - match the smallest possible repetition pattern, load that
7479 // scalar/subvector element and then broadcast to the entire vector.
7480 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7481 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7482 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7483 unsigned RepeatSize = SubElems * BaseSizeInBits;
7484 unsigned ScalarSize = std::min(RepeatSize, 64u);
7485 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7486 continue;
7487
7488       // Don't attempt a 1:N subvector broadcast - it should be caught by
7489       // combineConcatVectorOps, otherwise it will cause infinite loops.
7490 if (RepeatSize > ScalarSize && SubElems == 1)
7491 continue;
7492
7493 bool Match = true;
7494 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7495 for (unsigned i = 0; i != NumElems && Match; ++i) {
7496 if (!LoadMask[i])
7497 continue;
7498 SDValue Elt = peekThroughBitcasts(Elts[i]);
7499 if (RepeatedLoads[i % SubElems].isUndef())
7500 RepeatedLoads[i % SubElems] = Elt;
7501 else
7502 Match &= (RepeatedLoads[i % SubElems] == Elt);
7503 }
7504
7505 // We must have loads at both ends of the repetition.
7506 Match &= !RepeatedLoads.front().isUndef();
7507 Match &= !RepeatedLoads.back().isUndef();
7508 if (!Match)
7509 continue;
7510
7511 EVT RepeatVT =
7512 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7513 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7514 : EVT::getFloatingPointVT(ScalarSize);
7515 if (RepeatSize > ScalarSize)
7516 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7517 RepeatSize / ScalarSize);
7518 EVT BroadcastVT =
7519 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7520 VT.getSizeInBits() / ScalarSize);
7521 if (TLI.isTypeLegal(BroadcastVT)) {
7522 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7523 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7524 SDValue Broadcast = RepeatLoad;
7525 if (RepeatSize > ScalarSize) {
7526 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7527 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7528 } else {
7529 if (!Subtarget.hasAVX2() &&
7530                 !X86::mayFoldLoadIntoBroadcastFromMem(
7531                     RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7532 Subtarget,
7533 /*AssumeSingleUse=*/true))
7534 return SDValue();
7535 Broadcast =
7536 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7537 }
7538 return DAG.getBitcast(VT, Broadcast);
7539 }
7540 }
7541 }
7542 }
7543
7544 return SDValue();
7545}
7546
7547 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7548 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7549 // are consecutive, non-overlapping, and in the right order.
7550 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7551                                          SelectionDAG &DAG,
7552 const X86Subtarget &Subtarget,
7553 bool IsAfterLegalize) {
7554   SmallVector<SDValue, 64> Elts;
7555   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7556 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7557 Elts.push_back(Elt);
7558 continue;
7559 }
7560 return SDValue();
7561 }
7562 assert(Elts.size() == VT.getVectorNumElements());
7563 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7564 IsAfterLegalize);
7565}
7566
7567 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7568                                    const APInt &Undefs, LLVMContext &C) {
7569 unsigned ScalarSize = VT.getScalarSizeInBits();
7570 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7571
7572 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7573 if (VT.isFloatingPoint()) {
7574 if (ScalarSize == 16)
7575 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7576 if (ScalarSize == 32)
7577 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7578 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7579 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7580 }
7581 return Constant::getIntegerValue(Ty, Val);
7582 };
7583
7584 SmallVector<Constant *, 32> ConstantVec;
7585 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7586 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7587 : getConstantScalar(Bits[I]));
7588
7589 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7590}
7591
7592static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7593 unsigned SplatBitSize, LLVMContext &C) {
7594 unsigned ScalarSize = VT.getScalarSizeInBits();
7595
7596 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7597 if (VT.isFloatingPoint()) {
7598 if (ScalarSize == 16)
7599 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7600 if (ScalarSize == 32)
7601 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7602 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7603 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7604 }
7605 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7606 };
7607
7608 if (ScalarSize == SplatBitSize)
7609 return getConstantScalar(SplatValue);
7610
7611 unsigned NumElm = SplatBitSize / ScalarSize;
7612 SmallVector<Constant *, 32> ConstantVec;
7613 for (unsigned I = 0; I != NumElm; ++I) {
7614 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7615 ConstantVec.push_back(getConstantScalar(Val));
7616 }
7617 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7618}
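// Example (a sketch): for VT = v8i32 with SplatBitSize = 64 and ScalarSize = 32
// this returns a <2 x i32> ConstantVector holding the low and high halves of
// SplatValue; the caller can then broadcast that 64-bit constant-pool entry
// across the full vector width.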
7619
7620 static bool isFoldableUseOfShuffle(SDNode *N) {
7621   for (auto *U : N->users()) {
7622 unsigned Opc = U->getOpcode();
7623 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7624 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7625 return false;
7626 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7627 return false;
7628 if (isTargetShuffle(Opc))
7629 return true;
7630 if (Opc == ISD::BITCAST) // Ignore bitcasts
7631 return isFoldableUseOfShuffle(U);
7632 if (N->hasOneUse()) {
7633       // TODO: There may be some general way to know if an SDNode can
7634       // be folded. For now we only know whether an MI is foldable.
7635 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7636 return false;
7637 return true;
7638 }
7639 }
7640 return false;
7641}
7642
7643// If the node has a single use by a VSELECT then AVX512 targets may be able to
7644// fold as a predicated instruction.
7645static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget) {
7646 unsigned SizeInBits = V.getValueSizeInBits();
7647 if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
7648 (SizeInBits >= 128 && Subtarget.hasVLX())) {
7649 if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
7650 V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
7651 return true;
7652 }
7653 }
7654 return false;
7655}
7656
7657/// Attempt to use the vbroadcast instruction to generate a splat value
7658/// from a splat BUILD_VECTOR which uses:
7659/// a. A single scalar load, or a constant.
7660/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7661///
7662/// The VBROADCAST node is returned when a pattern is found,
7663/// or SDValue() otherwise.
7664 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7665                                            const SDLoc &dl,
7666 const X86Subtarget &Subtarget,
7667 SelectionDAG &DAG) {
7668 // VBROADCAST requires AVX.
7669 // TODO: Splats could be generated for non-AVX CPUs using SSE
7670 // instructions, but there's less potential gain for only 128-bit vectors.
7671 if (!Subtarget.hasAVX())
7672 return SDValue();
7673
7674 MVT VT = BVOp->getSimpleValueType(0);
7675 unsigned NumElts = VT.getVectorNumElements();
7676 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7677 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7678 "Unsupported vector type for broadcast.");
7679
7680 // See if the build vector is a repeating sequence of scalars (inc. splat).
7681 SDValue Ld;
7682 BitVector UndefElements;
7683 SmallVector<SDValue, 16> Sequence;
7684 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7685 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7686 if (Sequence.size() == 1)
7687 Ld = Sequence[0];
7688 }
7689
7690 // Attempt to use VBROADCASTM
7691 // From this pattern:
7692 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7693 // b. t1 = (build_vector t0 t0)
7694 //
7695 // Create (VBROADCASTM v2i1 X)
7696 if (!Sequence.empty() && Subtarget.hasCDI()) {
7697 // If not a splat, are the upper sequence values zeroable?
7698 unsigned SeqLen = Sequence.size();
7699 bool UpperZeroOrUndef =
7700 SeqLen == 1 ||
7701 llvm::all_of(ArrayRef(Sequence).drop_front(),
7702 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7703 SDValue Op0 = Sequence[0];
7704 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7705 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7706 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7707 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7708 ? Op0.getOperand(0)
7709 : Op0.getOperand(0).getOperand(0);
7710 MVT MaskVT = BOperand.getSimpleValueType();
7711 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7712 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7713 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7714 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7715 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7716 unsigned Scale = 512 / VT.getSizeInBits();
7717 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7718 }
7719 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7720 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7721 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7722 return DAG.getBitcast(VT, Bcst);
7723 }
7724 }
7725 }
7726
7727 unsigned NumUndefElts = UndefElements.count();
7728 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7729 APInt SplatValue, Undef;
7730 unsigned SplatBitSize;
7731 bool HasUndef;
7732 // Check if this is a repeated constant pattern suitable for broadcasting.
7733 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7734 SplatBitSize > VT.getScalarSizeInBits() &&
7735 SplatBitSize < VT.getSizeInBits()) {
7736 // Avoid replacing with broadcast when it's a use of a shuffle
7737 // instruction to preserve the present custom lowering of shuffles.
7738 if (isFoldableUseOfShuffle(BVOp))
7739 return SDValue();
7740 // replace BUILD_VECTOR with broadcast of the repeated constants.
7741 LLVMContext *Ctx = DAG.getContext();
7742 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7743 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7744 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7745 // Load the constant scalar/subvector and broadcast it.
7746 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7747 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7748 SDValue CP = DAG.getConstantPool(C, PVT);
7749 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7750
7751 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7752 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7753 SDValue Ops[] = {DAG.getEntryNode(), CP};
7754 MachinePointerInfo MPI =
7755             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7756         SDValue Brdcst =
7757             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7758 MPI, Alignment, MachineMemOperand::MOLoad);
7759 return DAG.getBitcast(VT, Brdcst);
7760 }
7761 if (SplatBitSize > 64) {
7762 // Load the vector of constants and broadcast it.
7763 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7764 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7765 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7766 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7767 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7768 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7769 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7770 MachinePointerInfo MPI =
7771             MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7772         return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7773                                     Ops, VVT, MPI, Alignment,
7774                                     MachineMemOperand::MOLoad);
7775 }
7776 }
7777
7778 // If we are moving a scalar into a vector (Ld must be set and all elements
7779 // but 1 are undef) and that operation is not obviously supported by
7780 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7781 // That's better than general shuffling and may eliminate a load to GPR and
7782 // move from scalar to vector register.
7783 if (!Ld || NumElts - NumUndefElts != 1)
7784 return SDValue();
7785 unsigned ScalarSize = Ld.getValueSizeInBits();
7786 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7787 return SDValue();
7788 }
7789
7790 bool ConstSplatVal =
7791 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7792 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7793
7794 // TODO: Handle broadcasts of non-constant sequences.
7795
7796 // Make sure that all of the users of a non-constant load are from the
7797 // BUILD_VECTOR node.
7798 // FIXME: Is the use count needed for non-constant, non-load case?
7799 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7800 return SDValue();
7801
7802 unsigned ScalarSize = Ld.getValueSizeInBits();
7803 bool IsGE256 = (VT.getSizeInBits() >= 256);
7804
7805 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7806 // instruction to save 8 or more bytes of constant pool data.
7807 // TODO: If multiple splats are generated to load the same constant,
7808 // it may be detrimental to overall size. There needs to be a way to detect
7809 // that condition to know if this is truly a size win.
7810 bool OptForSize = DAG.shouldOptForSize();
7811
7812 // Handle broadcasting a single constant scalar from the constant pool
7813 // into a vector.
7814 // On Sandybridge (no AVX2), it is still better to load a constant vector
7815 // from the constant pool and not to broadcast it from a scalar.
7816 // But override that restriction when optimizing for size.
7817 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7818 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7819 EVT CVT = Ld.getValueType();
7820 assert(!CVT.isVector() && "Must not broadcast a vector type");
7821
7822 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7823 // For size optimization, also splat v2f64 and v2i64, and for size opt
7824 // with AVX2, also splat i8 and i16.
7825 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7826 if (ScalarSize == 32 ||
7827 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7828 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7829 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7830 const Constant *C = nullptr;
7831       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7832         C = CI->getConstantIntValue();
7833       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7834         C = CF->getConstantFPValue();
7835
7836 assert(C && "Invalid constant type");
7837
7838       SDValue CP =
7839           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7840 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7841
7842 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7843 SDValue Ops[] = {DAG.getEntryNode(), CP};
7844       MachinePointerInfo MPI =
7845           MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7846 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7847 MPI, Alignment, MachineMemOperand::MOLoad);
7848 }
7849 }
7850
7851 // Handle AVX2 in-register broadcasts.
7852 if (!IsLoad && Subtarget.hasInt256() &&
7853 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7854 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7855
7856 // The scalar source must be a normal load.
7857 if (!IsLoad)
7858 return SDValue();
7859
7860 // Make sure the non-chain result is only used by this build vector.
7861 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7862 return SDValue();
7863
7864 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7865 (Subtarget.hasVLX() && ScalarSize == 64)) {
7866 auto *LN = cast<LoadSDNode>(Ld);
7867 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7868 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7869 SDValue BCast =
7870         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7871                                 LN->getMemoryVT(), LN->getMemOperand());
7872 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7873 return BCast;
7874 }
7875
7876   // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7877   // match f64, since there is no vbroadcastsd xmm instruction.
7878 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7879 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7880 auto *LN = cast<LoadSDNode>(Ld);
7881 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7882 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7883 SDValue BCast =
7884         DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7885                                 LN->getMemoryVT(), LN->getMemOperand());
7886 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7887 return BCast;
7888 }
7889
7890 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7891 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7892
7893 // Unsupported broadcast.
7894 return SDValue();
7895}
7896
7897/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7898/// underlying vector and index.
7899///
7900/// Modifies \p ExtractedFromVec to the real vector and returns the real
7901/// index.
7902static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7903 SDValue ExtIdx) {
7904 int Idx = ExtIdx->getAsZExtVal();
7905 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7906 return Idx;
7907
7908 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7909 // lowered this:
7910 // (extract_vector_elt (v8f32 %1), Constant<6>)
7911 // to:
7912 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7913 // (extract_subvector (v8f32 %0), Constant<4>),
7914 // undef)
7915 // Constant<0>)
7916 // In this case the vector is the extract_subvector expression and the index
7917 // is 2, as specified by the shuffle.
7918 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7919 SDValue ShuffleVec = SVOp->getOperand(0);
7920 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7921 assert(ShuffleVecVT.getVectorElementType() ==
7922 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7923
7924 int ShuffleIdx = SVOp->getMaskElt(Idx);
7925 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7926 ExtractedFromVec = ShuffleVec;
7927 return ShuffleIdx;
7928 }
7929 return Idx;
7930}
7931
7932 static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7933                                       SelectionDAG &DAG) {
7934 MVT VT = Op.getSimpleValueType();
7935
7936 // Skip if insert_vec_elt is not supported.
7937 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7938   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7939     return SDValue();
7940
7941 unsigned NumElems = Op.getNumOperands();
7942 SDValue VecIn1;
7943 SDValue VecIn2;
7944 SmallVector<unsigned, 4> InsertIndices;
7945 SmallVector<int, 8> Mask(NumElems, -1);
7946
7947 for (unsigned i = 0; i != NumElems; ++i) {
7948 unsigned Opc = Op.getOperand(i).getOpcode();
7949
7950 if (Opc == ISD::UNDEF)
7951 continue;
7952
7953     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7954       // Quit if more than 1 element needs inserting.
7955 if (InsertIndices.size() > 1)
7956 return SDValue();
7957
7958 InsertIndices.push_back(i);
7959 continue;
7960 }
7961
7962 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7963 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7964
7965 // Quit if non-constant index.
7966 if (!isa<ConstantSDNode>(ExtIdx))
7967 return SDValue();
7968 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7969
7970 // Quit if extracted from vector of different type.
7971 if (ExtractedFromVec.getValueType() != VT)
7972 return SDValue();
7973
7974 if (!VecIn1.getNode())
7975 VecIn1 = ExtractedFromVec;
7976 else if (VecIn1 != ExtractedFromVec) {
7977 if (!VecIn2.getNode())
7978 VecIn2 = ExtractedFromVec;
7979 else if (VecIn2 != ExtractedFromVec)
7980 // Quit if more than 2 vectors to shuffle
7981 return SDValue();
7982 }
7983
7984 if (ExtractedFromVec == VecIn1)
7985 Mask[i] = Idx;
7986 else if (ExtractedFromVec == VecIn2)
7987 Mask[i] = Idx + NumElems;
7988 }
7989
7990 if (!VecIn1.getNode())
7991 return SDValue();
7992
7993 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7994 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7995
7996 for (unsigned Idx : InsertIndices)
7997 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7998 DAG.getVectorIdxConstant(Idx, DL));
7999
8000 return NV;
8001}
8002
8003// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
8004 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
8005                                        const X86Subtarget &Subtarget) {
8006 MVT VT = Op.getSimpleValueType();
8007 MVT IVT =
8008 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
8009   SmallVector<SDValue, 32> NewOps;
8010   for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
8011 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
8012 Op.getOperand(I)));
8013 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
8014 return DAG.getBitcast(VT, Res);
8015}
8016
8017// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
8018 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
8019                                      SelectionDAG &DAG,
8020 const X86Subtarget &Subtarget) {
8021
8022 MVT VT = Op.getSimpleValueType();
8023 assert((VT.getVectorElementType() == MVT::i1) &&
8024 "Unexpected type in LowerBUILD_VECTORvXi1!");
8025 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
8026 ISD::isBuildVectorAllOnes(Op.getNode()))
8027 return Op;
8028
8029 uint64_t Immediate = 0;
8030 SmallVector<unsigned, 16> NonConstIdx;
8031 bool IsSplat = true;
8032 bool HasConstElts = false;
8033 int SplatIdx = -1;
8034 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
8035 SDValue In = Op.getOperand(idx);
8036 if (In.isUndef())
8037 continue;
8038 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
8039 Immediate |= (InC->getZExtValue() & 0x1) << idx;
8040 HasConstElts = true;
8041 } else {
8042 NonConstIdx.push_back(idx);
8043 }
8044 if (SplatIdx < 0)
8045 SplatIdx = idx;
8046 else if (In != Op.getOperand(SplatIdx))
8047 IsSplat = false;
8048 }
8049
8050   // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
8051 if (IsSplat) {
8052 // The build_vector allows the scalar element to be larger than the vector
8053 // element type. We need to mask it to use as a condition unless we know
8054 // the upper bits are zero.
8055 // FIXME: Use computeKnownBits instead of checking specific opcode?
8056 SDValue Cond = Op.getOperand(SplatIdx);
8057 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
8058 if (Cond.getOpcode() != ISD::SETCC)
8059 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
8060 DAG.getConstant(1, dl, MVT::i8));
8061
8062 // Perform the select in the scalar domain so we can use cmov.
8063 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8064 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
8065 DAG.getAllOnesConstant(dl, MVT::i32),
8066 DAG.getConstant(0, dl, MVT::i32));
8067 Select = DAG.getBitcast(MVT::v32i1, Select);
8068 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
8069 } else {
8070 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8071 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
8072 DAG.getAllOnesConstant(dl, ImmVT),
8073 DAG.getConstant(0, dl, ImmVT));
8074 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8075 Select = DAG.getBitcast(VecVT, Select);
8076 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
8077 DAG.getVectorIdxConstant(0, dl));
8078 }
8079 }
8080
8081 // insert elements one by one
8082 SDValue DstVec;
8083 if (HasConstElts) {
8084 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
8085 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
8086 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
8087 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
8088 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
8089 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
8090 } else {
8091 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
8092 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
8093 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
8094 DstVec = DAG.getBitcast(VecVT, Imm);
8095 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
8096 DAG.getVectorIdxConstant(0, dl));
8097 }
8098 } else
8099 DstVec = DAG.getUNDEF(VT);
8100
8101 for (unsigned InsertIdx : NonConstIdx) {
8102 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
8103 Op.getOperand(InsertIdx),
8104 DAG.getVectorIdxConstant(InsertIdx, dl));
8105 }
8106 return DstVec;
8107}
8108
8109[[maybe_unused]] static bool isHorizOp(unsigned Opcode) {
8110 switch (Opcode) {
8111 case X86ISD::PACKSS:
8112 case X86ISD::PACKUS:
8113 case X86ISD::FHADD:
8114 case X86ISD::FHSUB:
8115 case X86ISD::HADD:
8116 case X86ISD::HSUB:
8117 return true;
8118 }
8119 return false;
8120}
8121
8122/// This is a helper function of LowerToHorizontalOp().
8123/// This function checks that the build_vector \p N in input implements a
8124/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
8125/// may not match the layout of an x86 256-bit horizontal instruction.
8126/// In other words, if this returns true, then some extraction/insertion will
8127/// be required to produce a valid horizontal instruction.
8128///
8129/// Parameter \p Opcode defines the kind of horizontal operation to match.
8130/// For example, if \p Opcode is equal to ISD::ADD, then this function
8131/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
8132/// is equal to ISD::SUB, then this function checks if this is a horizontal
8133/// arithmetic sub.
8134///
8135/// This function only analyzes elements of \p N whose indices are
8136/// in range [BaseIdx, LastIdx).
8137///
8138/// TODO: This function was originally used to match both real and fake partial
8139/// horizontal operations, but the index-matching logic is incorrect for that.
8140/// See the corrected implementation in isHopBuildVector(). Can we reduce this
8141/// code because it is only used for partial h-op matching now?
8142static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
8143 const SDLoc &DL, SelectionDAG &DAG,
8144 unsigned BaseIdx, unsigned LastIdx,
8145 SDValue &V0, SDValue &V1) {
8146 EVT VT = N->getValueType(0);
8147 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
8148 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
8149 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
8150 "Invalid Vector in input!");
8151
8152 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
8153 bool CanFold = true;
8154 unsigned ExpectedVExtractIdx = BaseIdx;
8155 unsigned NumElts = LastIdx - BaseIdx;
8156 V0 = DAG.getUNDEF(VT);
8157 V1 = DAG.getUNDEF(VT);
8158
8159 // Check if N implements a horizontal binop.
8160 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
8161 SDValue Op = N->getOperand(i + BaseIdx);
8162
8163 // Skip UNDEFs.
8164 if (Op->isUndef()) {
8165 // Update the expected vector extract index.
8166 if (i * 2 == NumElts)
8167 ExpectedVExtractIdx = BaseIdx;
8168 ExpectedVExtractIdx += 2;
8169 continue;
8170 }
8171
8172 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8173
8174 if (!CanFold)
8175 break;
8176
8177 SDValue Op0 = Op.getOperand(0);
8178 SDValue Op1 = Op.getOperand(1);
8179
8180 // Try to match the following pattern:
8181 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8182     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8183                Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8184                Op0.getOperand(0) == Op1.getOperand(0) &&
8185                isa<ConstantSDNode>(Op0.getOperand(1)) &&
8186                isa<ConstantSDNode>(Op1.getOperand(1)));
8187
8188 break;
8189
8190 unsigned I0 = Op0.getConstantOperandVal(1);
8191 unsigned I1 = Op1.getConstantOperandVal(1);
8192
8193 if (i * 2 < NumElts) {
8194 if (V0.isUndef()) {
8195 V0 = Op0.getOperand(0);
8196 if (V0.getValueType() != VT)
8197 return false;
8198 }
8199 } else {
8200 if (V1.isUndef()) {
8201 V1 = Op0.getOperand(0);
8202 if (V1.getValueType() != VT)
8203 return false;
8204 }
8205 if (i * 2 == NumElts)
8206 ExpectedVExtractIdx = BaseIdx;
8207 }
8208
8209 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8210 if (I0 == ExpectedVExtractIdx)
8211 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8212 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8213 // Try to match the following dag sequence:
8214 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8215 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8216 } else
8217 CanFold = false;
8218
8219 ExpectedVExtractIdx += 2;
8220 }
8221
8222 return CanFold;
8223}
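// Example (a sketch, for Opcode == ISD::ADD, BaseIdx = 0, LastIdx = 4):
//   element 0 = (add (extract_vector_elt A, 0), (extract_vector_elt A, 1))
//   element 1 = (add (extract_vector_elt A, 2), (extract_vector_elt A, 3))
// binds V0 = A; elements 2 and 3 must follow the same pattern (binding V1) for
// the range to be treated as a partial 128-bit horizontal add.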
8224
8225/// Emit a sequence of two 128-bit horizontal add/sub followed by
8226/// a concat_vector.
8227///
8228/// This is a helper function of LowerToHorizontalOp().
8229/// This function expects two 256-bit vectors called V0 and V1.
8230/// At first, each vector is split into two separate 128-bit vectors.
8231/// Then, the resulting 128-bit vectors are used to implement two
8232/// horizontal binary operations.
8233///
8234/// The kind of horizontal binary operation is defined by \p X86Opcode.
8235///
8236/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
8237/// the two new horizontal binop.
8238/// When Mode is set, the first horizontal binop dag node would take as input
8239/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8240/// horizontal binop dag node would take as input the lower 128-bit of V1
8241/// and the upper 128-bit of V1.
8242/// Example:
8243/// HADD V0_LO, V0_HI
8244/// HADD V1_LO, V1_HI
8245///
8246/// Otherwise, the first horizontal binop dag node takes as input the lower
8247/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8248/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8249/// Example:
8250/// HADD V0_LO, V1_LO
8251/// HADD V0_HI, V1_HI
8252///
8253/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8254/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8255/// the upper 128-bits of the result.
8256static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8257 const SDLoc &DL, SelectionDAG &DAG,
8258 unsigned X86Opcode, bool Mode,
8259 bool isUndefLO, bool isUndefHI) {
8260 MVT VT = V0.getSimpleValueType();
8261 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8262 "Invalid nodes in input!");
8263
8264 unsigned NumElts = VT.getVectorNumElements();
8265 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8266 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8267 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8268 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8269 MVT NewVT = V0_LO.getSimpleValueType();
8270
8271 SDValue LO = DAG.getUNDEF(NewVT);
8272 SDValue HI = DAG.getUNDEF(NewVT);
8273
8274 if (Mode) {
8275 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8276 if (!isUndefLO && !V0->isUndef())
8277 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8278 if (!isUndefHI && !V1->isUndef())
8279 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8280 } else {
8281 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8282 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8283 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8284
8285 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8286 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8287 }
8288
8289 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8290}
8291
8292/// Returns true iff \p BV builds a vector with the result equivalent to
8293/// the result of ADDSUB/SUBADD operation.
8294/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8295/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8296/// \p Opnd0 and \p Opnd1.
8297 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8298                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
8299 SDValue &Opnd0, SDValue &Opnd1,
8300 unsigned &NumExtracts, bool &IsSubAdd,
8301 bool &HasAllowContract) {
8302 using namespace SDPatternMatch;
8303
8304 MVT VT = BV->getSimpleValueType(0);
8305 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8306 return false;
8307
8308 unsigned NumElts = VT.getVectorNumElements();
8309 SDValue InVec0 = DAG.getUNDEF(VT);
8310 SDValue InVec1 = DAG.getUNDEF(VT);
8311
8312 NumExtracts = 0;
8313 HasAllowContract = NumElts != 0;
8314
8315 // Odd-numbered elements in the input build vector are obtained from
8316 // adding/subtracting two integer/float elements.
8317 // Even-numbered elements in the input build vector are obtained from
8318 // subtracting/adding two integer/float elements.
8319 unsigned Opc[2] = {0, 0};
8320 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8321 SDValue Op = BV->getOperand(i);
8322
8323 // Skip 'undef' values.
8324 unsigned Opcode = Op.getOpcode();
8325 if (Opcode == ISD::UNDEF)
8326 continue;
8327
8328 // Early exit if we found an unexpected opcode.
8329 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8330 return false;
8331
8332 SDValue Op0 = Op.getOperand(0);
8333 SDValue Op1 = Op.getOperand(1);
8334
8335 // Try to match the following pattern:
8336 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8337 // Early exit if we cannot match that sequence.
8338 if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
8339 !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
8340 return false;
8341
8342     // We found a valid add/sub node; make sure it's the same opcode as previous
8343     // elements for this parity.
8344 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8345 return false;
8346 Opc[i % 2] = Opcode;
8347
8348 // Update InVec0 and InVec1.
8349 if (InVec0.isUndef())
8350 InVec0 = Op0.getOperand(0);
8351 if (InVec1.isUndef())
8352 InVec1 = Op1.getOperand(0);
8353
8354     // Make sure that operands in input to each add/sub node always
8355     // come from the same pair of vectors.
8356 if (InVec0 != Op0.getOperand(0)) {
8357 if (Opcode == ISD::FSUB)
8358 return false;
8359
8360 // FADD is commutable. Try to commute the operands
8361 // and then test again.
8362 std::swap(Op0, Op1);
8363 if (InVec0 != Op0.getOperand(0))
8364 return false;
8365 }
8366
8367 if (InVec1 != Op1.getOperand(0))
8368 return false;
8369
8370 // Increment the number of extractions done.
8371 ++NumExtracts;
8372 HasAllowContract &= Op->getFlags().hasAllowContract();
8373 }
8374
8375 // Ensure we have found an opcode for both parities and that they are
8376 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8377 // inputs are undef.
8378 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8379 InVec0.isUndef() || InVec1.isUndef())
8380 return false;
8381
8382 IsSubAdd = Opc[0] == ISD::FADD;
8383
8384 Opnd0 = InVec0;
8385 Opnd1 = InVec1;
8386 return true;
8387}
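// Example (a sketch): a v2f64 build_vector of
//   element 0 = (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0))
//   element 1 = (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1))
// is recognized with Opnd0 = A, Opnd1 = B and IsSubAdd = false, matching
// X86ISD::ADDSUB semantics (subtract in even lanes, add in odd lanes).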
8388
8389 /// Returns true if it is possible to fold MUL and an idiom that has already been
8390/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8391/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8392/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8393///
8394/// Prior to calling this function it should be known that there is some
8395/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8396/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8397/// before replacement of such SDNode with ADDSUB operation. Thus the number
8398/// of \p Opnd0 uses is expected to be equal to 2.
8399/// For example, this function may be called for the following IR:
8400/// %AB = fmul fast <2 x double> %A, %B
8401/// %Sub = fsub fast <2 x double> %AB, %C
8402/// %Add = fadd fast <2 x double> %AB, %C
8403/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8404/// <2 x i32> <i32 0, i32 3>
8405/// There is a def for %Addsub here, which potentially can be replaced by
8406/// X86ISD::ADDSUB operation:
8407/// %Addsub = X86ISD::ADDSUB %AB, %C
8408/// and such ADDSUB can further be replaced with FMADDSUB:
8409/// %Addsub = FMADDSUB %A, %B, %C.
8410///
8411/// The main reason why this method is called before the replacement of the
8412/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8413/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8414/// FMADDSUB is.
8415static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8416 SelectionDAG &DAG, SDValue &Opnd0,
8417 SDValue &Opnd1, SDValue &Opnd2,
8418 unsigned ExpectedUses,
8419 bool AllowSubAddOrAddSubContract) {
8420 if (Opnd0.getOpcode() != ISD::FMUL ||
8421 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8422 return false;
8423
8424 // FIXME: These checks must match the similar ones in
8425 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8426 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8427 // or MUL + ADDSUB to FMADDSUB.
8428 const TargetOptions &Options = DAG.getTarget().Options;
8429 bool AllowFusion =
8430 Options.AllowFPOpFusion == FPOpFusion::Fast ||
8431 (AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
8432 if (!AllowFusion)
8433 return false;
8434
8435 Opnd2 = Opnd1;
8436 Opnd1 = Opnd0.getOperand(1);
8437 Opnd0 = Opnd0.getOperand(0);
8438
8439 return true;
8440}
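// In short, a sketch of the rewrite performed above: given an ADDSUB/SUBADD
// idiom whose first operand is a suitably fused FMUL,
//   ADDSUB (fmul x, y), z  ==>  FMADDSUB x, y, z
// so on a successful return Opnd0 = x, Opnd1 = y and Opnd2 = z (the old Opnd1).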
8441
8442/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8443/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8444/// X86ISD::FMSUBADD node.
8445 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8446 const SDLoc &DL,
8447 const X86Subtarget &Subtarget,
8448 SelectionDAG &DAG) {
8449 SDValue Opnd0, Opnd1;
8450 unsigned NumExtracts;
8451 bool IsSubAdd;
8452 bool HasAllowContract;
8453 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
8454 HasAllowContract))
8455 return SDValue();
8456
8457 MVT VT = BV->getSimpleValueType(0);
8458
8459 // Try to generate X86ISD::FMADDSUB node here.
8460 SDValue Opnd2;
8461 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts,
8462 HasAllowContract)) {
8463 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8464 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8465 }
8466
8467 // We only support ADDSUB.
8468 if (IsSubAdd)
8469 return SDValue();
8470
8471 // There are no known X86 targets with 512-bit ADDSUB instructions!
8472 // Convert to blend(fsub,fadd).
8473 if (VT.is512BitVector()) {
8474 SmallVector<int> Mask;
8475 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8476 Mask.push_back(I);
8477 Mask.push_back(I + E + 1);
8478 }
8479 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8480 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8481 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8482 }
8483
8484 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8485}
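// Example of the 512-bit fallback above (a sketch): for v16f32 the ADDSUB
// idiom is emitted as
//   Sub = fsub Opnd0, Opnd1
//   Add = fadd Opnd0, Opnd1
//   shuffle(Sub, Add, {0,17, 2,19, 4,21, 6,23, 8,25, 10,27, 12,29, 14,31})
// i.e. even result lanes take the subtraction and odd lanes take the addition.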
8486
8487 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8488 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8489 // Initialize outputs to known values.
8490 MVT VT = BV->getSimpleValueType(0);
8491 HOpcode = ISD::DELETED_NODE;
8492 V0 = DAG.getUNDEF(VT);
8493 V1 = DAG.getUNDEF(VT);
8494
8495 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8496 // half of the result is calculated independently from the 128-bit halves of
8497 // the inputs, so that makes the index-checking logic below more complicated.
8498 unsigned NumElts = VT.getVectorNumElements();
8499 unsigned GenericOpcode = ISD::DELETED_NODE;
8500 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8501 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8502 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8503 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8504 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8505 // Ignore undef elements.
8506 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8507 if (Op.isUndef())
8508 continue;
8509
8510 // If there's an opcode mismatch, we're done.
8511 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8512 return false;
8513
8514 // Initialize horizontal opcode.
8515 if (HOpcode == ISD::DELETED_NODE) {
8516 GenericOpcode = Op.getOpcode();
8517 switch (GenericOpcode) {
8518 // clang-format off
8519 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8520 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8521 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8522 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8523 default: return false;
8524 // clang-format on
8525 }
8526 }
8527
8528 SDValue Op0 = Op.getOperand(0);
8529 SDValue Op1 = Op.getOperand(1);
8530 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8531 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8532 Op0.getOperand(0) != Op1.getOperand(0) ||
8533 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8534 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8535 return false;
8536
8537 // The source vector is chosen based on which 64-bit half of the
8538 // destination vector is being calculated.
8539 if (j < NumEltsIn64Bits) {
8540 if (V0.isUndef())
8541 V0 = Op0.getOperand(0);
8542 } else {
8543 if (V1.isUndef())
8544 V1 = Op0.getOperand(0);
8545 }
8546
8547 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8548 if (SourceVec != Op0.getOperand(0))
8549 return false;
8550
8551 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8552 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8553 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8554 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8555 (j % NumEltsIn64Bits) * 2;
8556 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8557 continue;
8558
8559 // If this is not a commutative op, this does not match.
8560 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8561 return false;
8562
8563 // Addition is commutative, so try swapping the extract indexes.
8564 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8565 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8566 continue;
8567
8568 // Extract indexes do not match horizontal requirement.
8569 return false;
8570 }
8571 }
8572 // We matched. Opcode and operands are returned by reference as arguments.
8573 return true;
8574}
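// A hypothetical v4i32 example accepted by the matcher above:
//   e0 = add (extractelt %A, 0), (extractelt %A, 1)
//   e1 = add (extractelt %A, 2), (extractelt %A, 3)
//   e2 = add (extractelt %B, 0), (extractelt %B, 1)
//   e3 = add (extractelt %B, 2), (extractelt %B, 3)
//   (build_vector e0, e1, e2, e3)
// yields HOpcode = X86ISD::HADD, V0 = %A and V1 = %B.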
8575
8576 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8577 const SDLoc &DL, SelectionDAG &DAG,
8578 unsigned HOpcode, SDValue V0, SDValue V1) {
8579 // If either input vector is not the same size as the build vector,
8580 // extract/insert the low bits to the correct size.
8581 // This is free (examples: zmm --> xmm, xmm --> ymm).
8582 MVT VT = BV->getSimpleValueType(0);
8583 unsigned Width = VT.getSizeInBits();
8584 if (V0.getValueSizeInBits() > Width)
8585 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8586 else if (V0.getValueSizeInBits() < Width)
8587 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8588
8589 if (V1.getValueSizeInBits() > Width)
8590 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8591 else if (V1.getValueSizeInBits() < Width)
8592 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8593
8594 unsigned NumElts = VT.getVectorNumElements();
8595 APInt DemandedElts = APInt::getAllOnes(NumElts);
8596 for (unsigned i = 0; i != NumElts; ++i)
8597 if (BV->getOperand(i).isUndef())
8598 DemandedElts.clearBit(i);
8599
8600 // If we don't need the upper xmm, then perform as a xmm hop.
8601 unsigned HalfNumElts = NumElts / 2;
8602 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8603 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8604 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8605 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8606 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8607 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8608 }
8609
8610 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8611}
8612
8613/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8614 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8615 const X86Subtarget &Subtarget,
8616 SelectionDAG &DAG) {
8617 // We need at least 2 non-undef elements to make this worthwhile by default.
8618 unsigned NumNonUndefs =
8619 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8620 if (NumNonUndefs < 2)
8621 return SDValue();
8622
8623 // There are 4 sets of horizontal math operations distinguished by type:
8624 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8625 // subtarget feature. Try to match those "native" patterns first.
8626 MVT VT = BV->getSimpleValueType(0);
8627 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8628 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8629 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8630 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8631 unsigned HOpcode;
8632 SDValue V0, V1;
8633 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8634 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8635 }
8636
8637 // Try harder to match 256-bit ops by using extract/concat.
8638 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8639 return SDValue();
8640
8641 // Count the number of UNDEF operands in the input build_vector.
8642 unsigned NumElts = VT.getVectorNumElements();
8643 unsigned Half = NumElts / 2;
8644 unsigned NumUndefsLO = 0;
8645 unsigned NumUndefsHI = 0;
8646 for (unsigned i = 0, e = Half; i != e; ++i)
8647 if (BV->getOperand(i)->isUndef())
8648 NumUndefsLO++;
8649
8650 for (unsigned i = Half, e = NumElts; i != e; ++i)
8651 if (BV->getOperand(i)->isUndef())
8652 NumUndefsHI++;
8653
8654 SDValue InVec0, InVec1;
8655 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8656 SDValue InVec2, InVec3;
8657 unsigned X86Opcode;
8658 bool CanFold = true;
8659
8660 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8661 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8662 InVec3) &&
8663 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8664 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8665 X86Opcode = X86ISD::HADD;
8666 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8667 InVec1) &&
8668 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8669 InVec3) &&
8670 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8671 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8672 X86Opcode = X86ISD::HSUB;
8673 else
8674 CanFold = false;
8675
8676 if (CanFold) {
8677 // Do not try to expand this build_vector into a pair of horizontal
8678 // add/sub if we can emit a pair of scalar add/sub.
8679 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8680 return SDValue();
8681
8682 // Convert this build_vector into a pair of horizontal binops followed by
8683 // a concat vector. We must adjust the outputs from the partial horizontal
8684 // matching calls above to account for undefined vector halves.
8685 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8686 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8687 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8688 bool isUndefLO = NumUndefsLO == Half;
8689 bool isUndefHI = NumUndefsHI == Half;
8690 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8691 isUndefHI);
8692 }
8693 }
8694
8695 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8696 VT == MVT::v16i16) {
8697 unsigned X86Opcode;
8698 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8699 InVec1))
8700 X86Opcode = X86ISD::HADD;
8701 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8702 InVec1))
8703 X86Opcode = X86ISD::HSUB;
8704 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8705 InVec1))
8706 X86Opcode = X86ISD::FHADD;
8707 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8708 InVec1))
8709 X86Opcode = X86ISD::FHSUB;
8710 else
8711 return SDValue();
8712
8713 // Don't try to expand this build_vector into a pair of horizontal add/sub
8714 // if we can simply emit a pair of scalar add/sub.
8715 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8716 return SDValue();
8717
8718 // Convert this build_vector into two horizontal add/sub followed by
8719 // a concat vector.
8720 bool isUndefLO = NumUndefsLO == Half;
8721 bool isUndefHI = NumUndefsHI == Half;
8722 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8723 isUndefLO, isUndefHI);
8724 }
8725
8726 return SDValue();
8727}
8728
8729static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8730 SelectionDAG &DAG);
8731
8732/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8733 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
8734 /// just apply the bit operation to the vectors.
8735 /// NOTE: It's not in our interest to start making a general purpose vectorizer
8736/// from this, but enough scalar bit operations are created from the later
8737/// legalization + scalarization stages to need basic support.
8738 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8739 const X86Subtarget &Subtarget,
8740 SelectionDAG &DAG) {
8741 MVT VT = Op->getSimpleValueType(0);
8742 unsigned NumElems = VT.getVectorNumElements();
8743 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8744
8745 // Check that all elements have the same opcode.
8746 // TODO: Should we allow UNDEFS and if so how many?
8747 unsigned Opcode = Op->getOperand(0).getOpcode();
8748 for (unsigned i = 1; i < NumElems; ++i)
8749 if (Opcode != Op->getOperand(i).getOpcode())
8750 return SDValue();
8751
8752 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8753 bool IsShift = false;
8754 switch (Opcode) {
8755 default:
8756 return SDValue();
8757 case ISD::SHL:
8758 case ISD::SRL:
8759 case ISD::SRA:
8760 IsShift = true;
8761 break;
8762 case ISD::AND:
8763 case ISD::XOR:
8764 case ISD::OR:
8765 // Don't do this if the buildvector is a splat - we'd replace one
8766 // constant with an entire vector.
8767 if (Op->getSplatValue())
8768 return SDValue();
8769 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8770 return SDValue();
8771 break;
8772 }
8773
8774 SmallVector<SDValue, 4> LHSElts, RHSElts;
8775 for (SDValue Elt : Op->ops()) {
8776 SDValue LHS = Elt.getOperand(0);
8777 SDValue RHS = Elt.getOperand(1);
8778
8779 // We expect the canonicalized RHS operand to be the constant.
8780 if (!isa<ConstantSDNode>(RHS) && !isa<ConstantFPSDNode>(RHS))
8781 return SDValue();
8782
8783 // Extend shift amounts.
8784 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8785 if (!IsShift)
8786 return SDValue();
8787 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8788 }
8789
8790 LHSElts.push_back(LHS);
8791 RHSElts.push_back(RHS);
8792 }
8793
8794 // Limit to shifts by uniform immediates.
8795 // TODO: Only accept vXi8/vXi64 special cases?
8796 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8797 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8798 return SDValue();
8799
8800 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8801 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8802 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8803
8804 if (!IsShift)
8805 return Res;
8806
8807 // Immediately lower the shift to ensure the constant build vector doesn't
8808 // get converted to a constant pool before the shift is lowered.
8809 return LowerShift(Res, Subtarget, DAG);
8810}
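// Illustrative (hypothetical) v4i32 case handled above:
//   build_vector (shl %a, 3), (shl %b, 3), (shl %c, 3), (shl %d, 3)
// becomes
//   shl (build_vector %a, %b, %c, %d), (build_vector 3, 3, 3, 3)
// with the shift lowered immediately so the constant RHS is not turned into a
// constant-pool load first.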
8811
8812static bool isShuffleFoldableLoad(SDValue);
8813
8814/// Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats
8815/// representing a blend.
8816 static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL,
8817 X86Subtarget const &Subtarget,
8818 SelectionDAG &DAG) {
8819 MVT VT = BVOp->getSimpleValueType(0u);
8820
8821 if (VT != MVT::v4f64)
8822 return SDValue();
8823
8824 // Collect unique operands.
8825 auto UniqueOps = SmallSet<SDValue, 16u>();
8826 for (SDValue Op : BVOp->ops()) {
8827 if (isIntOrFPConstant(Op) || Op.isUndef())
8828 return SDValue();
8829 UniqueOps.insert(Op);
8830 }
8831
8832 // Candidate BUILD_VECTOR must have 2 unique operands.
8833 if (UniqueOps.size() != 2u)
8834 return SDValue();
8835
8836 SDValue Op0 = BVOp->getOperand(0u);
8837 UniqueOps.erase(Op0);
8838 SDValue Op1 = *UniqueOps.begin();
8839
8840 if (Subtarget.hasAVX2() || isShuffleFoldableLoad(Op0) ||
8841 isShuffleFoldableLoad(Op1)) {
8842 // Create shuffle mask.
8843 auto const NumElems = VT.getVectorNumElements();
8844 SmallVector<int, 16u> Mask(NumElems);
8845 for (auto I = 0u; I < NumElems; ++I) {
8846 SDValue Op = BVOp->getOperand(I);
8847 Mask[I] = Op == Op0 ? I : I + NumElems;
8848 }
8849 // Create shuffle of splats.
8850 SDValue NewOp0 = DAG.getSplatBuildVector(VT, DL, Op0);
8851 SDValue NewOp1 = DAG.getSplatBuildVector(VT, DL, Op1);
8852 return DAG.getVectorShuffle(VT, DL, NewOp0, NewOp1, Mask);
8853 }
8854
8855 return SDValue();
8856}
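// For example (a sketch with hypothetical operands X and Y):
//   (build_vector X, Y, Y, X) : v4f64
// becomes
//   shuffle (splat X), (splat Y), <0, 5, 6, 3>
// which can then be lowered as a blend of the two splats.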
8857
8858/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8859/// functionality to do this, so it's all zeros, all ones, or some derivation
8860/// that is cheap to calculate.
8861 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8862 SelectionDAG &DAG,
8863 const X86Subtarget &Subtarget) {
8864 MVT VT = Op.getSimpleValueType();
8865
8866 // Vectors containing all zeros can be matched by pxor and xorps.
8867 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8868 return Op;
8869
8870 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8871 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8872 // vpcmpeqd on 256-bit vectors.
8873 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8874 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8875 return Op;
8876
8877 return getOnesVector(VT, DAG, DL);
8878 }
8879
8880 return SDValue();
8881}
8882
8883/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8884/// from a vector of source values and a vector of extraction indices.
8885/// The vectors might be manipulated to match the type of the permute op.
8886static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8887 const SDLoc &DL, SelectionDAG &DAG,
8888 const X86Subtarget &Subtarget) {
8889 MVT ShuffleVT = VT;
8890 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8891 unsigned NumElts = VT.getVectorNumElements();
8892 unsigned SizeInBits = VT.getSizeInBits();
8893
8894 // Adjust IndicesVec to match VT size.
8895 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8896 "Illegal variable permute mask size");
8897 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8898 // Narrow/widen the indices vector to the correct size.
8899 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8900 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8901 NumElts * VT.getScalarSizeInBits());
8902 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8903 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8904 SDLoc(IndicesVec), SizeInBits);
8905 // Zero-extend the index elements within the vector.
8906 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8907 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8908 IndicesVT, IndicesVec);
8909 }
8910 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8911
8912 // Handle a SrcVec that doesn't match the VT size.
8913 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8914 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8915 // Handle larger SrcVec by treating it as a larger permute.
8916 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8917 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8918 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8919 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8920 Subtarget, DAG, SDLoc(IndicesVec));
8921 SDValue NewSrcVec =
8922 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8923 if (NewSrcVec)
8924 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8925 return SDValue();
8926 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8927 // Widen smaller SrcVec to match VT.
8928 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8929 } else
8930 return SDValue();
8931 }
8932
8933 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8934 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8935 EVT SrcVT = Idx.getValueType();
8936 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8937 uint64_t IndexScale = 0;
8938 uint64_t IndexOffset = 0;
8939
8940 // If we're scaling a smaller permute op, then we need to repeat the
8941 // indices, scaling and offsetting them as well.
8942 // e.g. v4i32 -> v16i8 (Scale = 4)
8943 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8944 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8945 for (uint64_t i = 0; i != Scale; ++i) {
8946 IndexScale |= Scale << (i * NumDstBits);
8947 IndexOffset |= i << (i * NumDstBits);
8948 }
8949
8950 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8951 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8952 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8953 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8954 return Idx;
8955 };
8956
8957 unsigned Opcode = 0;
8958 switch (VT.SimpleTy) {
8959 default:
8960 break;
8961 case MVT::v16i8:
8962 if (Subtarget.hasSSSE3())
8963 Opcode = X86ISD::PSHUFB;
8964 break;
8965 case MVT::v8i16:
8966 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8967 Opcode = X86ISD::VPERMV;
8968 else if (Subtarget.hasSSSE3()) {
8969 Opcode = X86ISD::PSHUFB;
8970 ShuffleVT = MVT::v16i8;
8971 }
8972 break;
8973 case MVT::v4f32:
8974 case MVT::v4i32:
8975 if (Subtarget.hasAVX()) {
8976 Opcode = X86ISD::VPERMILPV;
8977 ShuffleVT = MVT::v4f32;
8978 } else if (Subtarget.hasSSSE3()) {
8979 Opcode = X86ISD::PSHUFB;
8980 ShuffleVT = MVT::v16i8;
8981 }
8982 break;
8983 case MVT::v2f64:
8984 case MVT::v2i64:
8985 if (Subtarget.hasAVX()) {
8986 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8987 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8988 Opcode = X86ISD::VPERMILPV;
8989 ShuffleVT = MVT::v2f64;
8990 } else if (Subtarget.hasSSE41()) {
8991 // SSE41 can compare v2i64 - select between indices 0 and 1.
8992 return DAG.getSelectCC(
8993 DL, IndicesVec,
8994 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8995 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8996 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8997 ISD::CondCode::SETEQ);
8998 }
8999 break;
9000 case MVT::v32i8:
9001 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
9002 Opcode = X86ISD::VPERMV;
9003 else if (Subtarget.hasXOP()) {
9004 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
9005 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
9006 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
9007 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
9008 return DAG.getNode(
9009 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
9010 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
9011 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
9012 } else if (Subtarget.hasAVX()) {
9013 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
9014 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
9015 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
9016 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
9017 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
9018 ArrayRef<SDValue> Ops) {
9019 // Permute Lo and Hi and then select based on index range.
9020 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
9021 // care about bit[7] as it's just an index vector.
9022 SDValue Idx = Ops[2];
9023 EVT VT = Idx.getValueType();
9024 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
9025 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
9026 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
9027 ISD::CondCode::SETGT);
9028 };
9029 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
9030 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
9031 PSHUFBBuilder);
9032 }
9033 break;
9034 case MVT::v16i16:
9035 if (Subtarget.hasVLX() && Subtarget.hasBWI())
9036 Opcode = X86ISD::VPERMV;
9037 else if (Subtarget.hasAVX()) {
9038 // Scale to v32i8 and perform as v32i8.
9039 IndicesVec = ScaleIndices(IndicesVec, 2);
9040 return DAG.getBitcast(
9041 VT, createVariablePermute(
9042 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
9043 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
9044 }
9045 break;
9046 case MVT::v8f32:
9047 case MVT::v8i32:
9048 if (Subtarget.hasAVX2())
9049 Opcode = X86ISD::VPERMV;
9050 else if (Subtarget.hasAVX()) {
9051 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
9052 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9053 {0, 1, 2, 3, 0, 1, 2, 3});
9054 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
9055 {4, 5, 6, 7, 4, 5, 6, 7});
9056 if (Subtarget.hasXOP())
9057 return DAG.getBitcast(
9058 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
9059 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9060 // Permute Lo and Hi and then select based on index range.
9061 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
9062 SDValue Res = DAG.getSelectCC(
9063 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
9064 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
9065 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
9066 ISD::CondCode::SETGT);
9067 return DAG.getBitcast(VT, Res);
9068 }
9069 break;
9070 case MVT::v4i64:
9071 case MVT::v4f64:
9072 if (Subtarget.hasAVX512()) {
9073 if (!Subtarget.hasVLX()) {
9074 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
9075 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
9076 SDLoc(SrcVec));
9077 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
9078 DAG, SDLoc(IndicesVec));
9079 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
9080 DAG, Subtarget);
9081 return extract256BitVector(Res, 0, DAG, DL);
9082 }
9083 Opcode = X86ISD::VPERMV;
9084 } else if (Subtarget.hasAVX()) {
9085 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
9086 SDValue LoLo =
9087 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
9088 SDValue HiHi =
9089 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
9090 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
9091 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
9092 if (Subtarget.hasXOP())
9093 return DAG.getBitcast(
9094 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
9095 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
9096 // Permute Lo and Hi and then select based on index range.
9097 // This works as VPERMILPD only uses index bit[1] to permute elements.
9098 SDValue Res = DAG.getSelectCC(
9099 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
9100 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
9101 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
9102 ISD::CondCode::SETGT);
9103 return DAG.getBitcast(VT, Res);
9104 }
9105 break;
9106 case MVT::v64i8:
9107 if (Subtarget.hasVBMI())
9108 Opcode = X86ISD::VPERMV;
9109 break;
9110 case MVT::v32i16:
9111 if (Subtarget.hasBWI())
9112 Opcode = X86ISD::VPERMV;
9113 break;
9114 case MVT::v16f32:
9115 case MVT::v16i32:
9116 case MVT::v8f64:
9117 case MVT::v8i64:
9118 if (Subtarget.hasAVX512())
9119 Opcode = X86ISD::VPERMV;
9120 break;
9121 }
9122 if (!Opcode)
9123 return SDValue();
9124
9125 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
9126 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
9127 "Illegal variable permute shuffle type");
9128
9129 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
9130 if (Scale > 1)
9131 IndicesVec = ScaleIndices(IndicesVec, Scale);
9132
9133 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
9134 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
9135
9136 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
9137 SDValue Res = Opcode == X86ISD::VPERMV
9138 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
9139 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
9140 return DAG.getBitcast(VT, Res);
9141}
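// Example of the ScaleIndices step above (a sketch): for a v4i32 permute done
// as PSHUFB (ShuffleVT = v16i8, Scale = 4), each 32-bit index i is rewritten
// into the four byte indices 4*i+0 .. 4*i+3 by computing, per 32-bit lane,
//   Idx' = Idx * 0x04040404 + 0x03020100
// before the bitcast of the indices to v16i8.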
9142
9143// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
9144// reasoned to be a permutation of a vector by indices in a non-constant vector.
9145// (build_vector (extract_elt V, (extract_elt I, 0)),
9146// (extract_elt V, (extract_elt I, 1)),
9147// ...
9148// ->
9149// (vpermv I, V)
9150//
9151// TODO: Handle undefs
9152// TODO: Utilize pshufb and zero mask blending to support more efficient
9153// construction of vectors with constant-0 elements.
9154static SDValue
9155 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
9156 SelectionDAG &DAG,
9157 const X86Subtarget &Subtarget) {
9158 SDValue SrcVec, IndicesVec;
9159
9160 auto PeekThroughFreeze = [](SDValue N) {
9161 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
9162 return N->getOperand(0);
9163 return N;
9164 };
9165 // Check for a match of the permute source vector and permute index elements.
9166 // This is done by checking that the i-th build_vector operand is of the form:
9167 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
9168 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
9169 SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
9170 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9171 return SDValue();
9172
9173 // If this is the first extract encountered in V, set the source vector,
9174 // otherwise verify the extract is from the previously defined source
9175 // vector.
9176 if (!SrcVec)
9177 SrcVec = Op.getOperand(0);
9178 else if (SrcVec != Op.getOperand(0))
9179 return SDValue();
9180 SDValue ExtractedIndex = Op->getOperand(1);
9181 // Peek through extends.
9182 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
9183 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
9184 ExtractedIndex = ExtractedIndex.getOperand(0);
9185 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9186 return SDValue();
9187
9188 // If this is the first extract from the index vector candidate, set the
9189 // indices vector, otherwise verify the extract is from the previously
9190 // defined indices vector.
9191 if (!IndicesVec)
9192 IndicesVec = ExtractedIndex.getOperand(0);
9193 else if (IndicesVec != ExtractedIndex.getOperand(0))
9194 return SDValue();
9195
9196 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
9197 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
9198 return SDValue();
9199 }
9200
9201 MVT VT = V.getSimpleValueType();
9202 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
9203}
9204
9205SDValue
9206X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
9207 SDLoc dl(Op);
9208
9209 MVT VT = Op.getSimpleValueType();
9210 MVT EltVT = VT.getVectorElementType();
9211 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9212 unsigned NumElems = Op.getNumOperands();
9213
9214 // Generate vectors for predicate vectors.
9215 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9216 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9217
9218 if (VT.getVectorElementType() == MVT::bf16 &&
9219 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9220 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9221
9222 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9223 return VectorCst;
9224
9225 unsigned EVTBits = EltVT.getSizeInBits();
9226 APInt UndefMask = APInt::getZero(NumElems);
9227 APInt FrozenUndefMask = APInt::getZero(NumElems);
9228 APInt ZeroMask = APInt::getZero(NumElems);
9229 APInt NonZeroMask = APInt::getZero(NumElems);
9230 bool IsAllConstants = true;
9231 bool OneUseFrozenUndefs = true;
9232 SmallSet<SDValue, 8> Values;
9233 unsigned NumConstants = NumElems;
9234 for (unsigned i = 0; i < NumElems; ++i) {
9235 SDValue Elt = Op.getOperand(i);
9236 if (Elt.isUndef()) {
9237 UndefMask.setBit(i);
9238 continue;
9239 }
9240 if (ISD::isFreezeUndef(Elt.getNode())) {
9241 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9242 FrozenUndefMask.setBit(i);
9243 continue;
9244 }
9245 Values.insert(Elt);
9246 if (!isIntOrFPConstant(Elt)) {
9247 IsAllConstants = false;
9248 NumConstants--;
9249 }
9250 if (X86::isZeroNode(Elt)) {
9251 ZeroMask.setBit(i);
9252 } else {
9253 NonZeroMask.setBit(i);
9254 }
9255 }
9256
9257 // All undef vector. Return an UNDEF.
9258 if (UndefMask.isAllOnes())
9259 return DAG.getUNDEF(VT);
9260
9261 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9262 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9263 return DAG.getFreeze(DAG.getUNDEF(VT));
9264
9265 // All undef/freeze(undef)/zero vector. Return a zero vector.
9266 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9267 return getZeroVector(VT, Subtarget, DAG, dl);
9268
9269 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9270 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9271 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9272 // and blend the FREEZE-UNDEF operands back in.
9273 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9274 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9275 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9276 SmallVector<int, 16> BlendMask(NumElems, -1);
9277 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9278 for (unsigned i = 0; i < NumElems; ++i) {
9279 if (UndefMask[i]) {
9280 BlendMask[i] = -1;
9281 continue;
9282 }
9283 BlendMask[i] = i;
9284 if (!FrozenUndefMask[i])
9285 Elts[i] = Op.getOperand(i);
9286 else
9287 BlendMask[i] += NumElems;
9288 }
9289 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9290 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9291 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9292 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9293 }
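// For instance (hypothetical), a v4i32 build_vector (a, freeze(undef), b,
// freeze(undef)) becomes
//   shuffle (build_vector a, undef, b, undef),
//           (splat freeze(undef)), <0, 5, 2, 7>
// so both frozen-undef lanes come from a single splatted freeze(undef).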
9294
9295 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9296
9297 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9298 // be better off lowering to a smaller build vector and padding with
9299 // undef/zero.
9300 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9301 !isFoldableUseOfShuffle(BV)) {
9302 unsigned UpperElems = NumElems / 2;
9303 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9304 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9305 if (NumUpperUndefsOrZeros >= UpperElems) {
9306 if (VT.is512BitVector() &&
9307 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9308 UpperElems = NumElems - (NumElems / 4);
9309 // If freeze(undef) is in any upper elements, force to zero.
9310 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9311 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9312 SDValue NewBV =
9313 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9314 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9315 }
9316 }
9317
9318 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9319 return AddSub;
9320 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9321 return HorizontalOp;
9322 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9323 return Broadcast;
9324 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9325 return BitOp;
9326 if (SDValue Blend = lowerBuildVectorAsBlend(BV, dl, Subtarget, DAG))
9327 return Blend;
9328
9329 unsigned NumZero = ZeroMask.popcount();
9330 unsigned NumNonZero = NonZeroMask.popcount();
9331
9332 // If we are inserting one variable into a vector of non-zero constants, try
9333 // to avoid loading each constant element as a scalar. Load the constants as a
9334 // vector and then insert the variable scalar element. If insertion is not
9335 // supported, fall back to a shuffle to get the scalar blended with the
9336 // constants. Insertion into a zero vector is handled as a special-case
9337 // somewhere below here.
9338 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9339 FrozenUndefMask.isZero() &&
9340 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9341 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9342 // Create an all-constant vector. The variable element in the old
9343 // build vector is replaced by undef in the constant vector. Save the
9344 // variable scalar element and its index for use in the insertelement.
9345 LLVMContext &Context = *DAG.getContext();
9346 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9347 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9348 SDValue VarElt;
9349 SDValue InsIndex;
9350 for (unsigned i = 0; i != NumElems; ++i) {
9351 SDValue Elt = Op.getOperand(i);
9352 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9353 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9354 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9355 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9356 else if (!Elt.isUndef()) {
9357 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9358 "Expected one variable element in this vector");
9359 VarElt = Elt;
9360 InsIndex = DAG.getVectorIdxConstant(i, dl);
9361 }
9362 }
9363 Constant *CV = ConstantVector::get(ConstVecOps);
9364 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9365
9366 // The constants we just created may not be legal (eg, floating point). We
9367 // must lower the vector right here because we can not guarantee that we'll
9368 // legalize it before loading it. This is also why we could not just create
9369 // a new build vector here. If the build vector contains illegal constants,
9370 // it could get split back up into a series of insert elements.
9371 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9372 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9373 MachineFunction &MF = DAG.getMachineFunction();
9374 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
9375 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9376 unsigned InsertC = InsIndex->getAsZExtVal();
9377 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9378 if (InsertC < NumEltsInLow128Bits)
9379 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9380
9381 // There's no good way to insert into the high elements of a >128-bit
9382 // vector, so use shuffles to avoid an extract/insert sequence.
9383 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9384 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9385 SmallVector<int, 8> ShuffleMask;
9386 unsigned NumElts = VT.getVectorNumElements();
9387 for (unsigned i = 0; i != NumElts; ++i)
9388 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9389 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9390 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9391 }
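// As a sketch of the path above: build_vector (1.0, %x, 3.0, 4.0) : v4f32 is
// lowered as a constant-pool load of <1.0, undef, 3.0, 4.0> followed by an
// insert of %x at index 1 (or a shuffle when the index lies above the low
// 128 bits of a wider vector).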
9392
9393 // Special case for single non-zero, non-undef, element.
9394 if (NumNonZero == 1) {
9395 unsigned Idx = NonZeroMask.countr_zero();
9396 SDValue Item = Op.getOperand(Idx);
9397
9398 // If we have a constant or non-constant insertion into the low element of
9399 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9400 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9401 // depending on what the source datatype is.
9402 if (Idx == 0) {
9403 if (NumZero == 0)
9404 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9405
9406 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9407 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9408 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9409 assert((VT.is128BitVector() || VT.is256BitVector() ||
9410 VT.is512BitVector()) &&
9411 "Expected an SSE value type!");
9412 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9413 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9414 // zero vector.
9415 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9416 }
9417
9418 // We can't directly insert an i8 or i16 into a vector, so zero extend
9419 // it to i32 first.
9420 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9421 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9422 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9423 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9424 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9425 return DAG.getBitcast(VT, Item);
9426 }
9427 }
9428
9429 // Is it a vector logical left shift?
9430 if (NumElems == 2 && Idx == 1 &&
9431 X86::isZeroNode(Op.getOperand(0)) &&
9432 !X86::isZeroNode(Op.getOperand(1))) {
9433 unsigned NumBits = VT.getSizeInBits();
9434 return getVShift(true, VT,
9435 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9436 VT, Op.getOperand(1)),
9437 NumBits/2, DAG, *this, dl);
9438 }
9439
9440 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9441 return SDValue();
9442
9443 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9444 // is a non-constant being inserted into an element other than the low one,
9445 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9446 // movd/movss) to move this into the low element, then shuffle it into
9447 // place.
9448 if (EVTBits == 32) {
9449 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9450 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9451 }
9452 }
9453
9454 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9455 if (Values.size() == 1) {
9456 if (EVTBits == 32) {
9457 // Instead of a shuffle like this:
9458 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9459 // Check if it's possible to issue this instead.
9460 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9461 unsigned Idx = NonZeroMask.countr_zero();
9462 SDValue Item = Op.getOperand(Idx);
9463 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9464 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9465 }
9466 return SDValue();
9467 }
9468
9469 // A vector full of immediates; various special cases are already
9470 // handled, so this is best done with a single constant-pool load.
9471 if (IsAllConstants)
9472 return SDValue();
9473
9474 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9475 return V;
9476
9477 // See if we can use a vector load to get all of the elements.
9478 {
9479 SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
9480 if (SDValue LD =
9481 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9482 return LD;
9483 }
9484
9485 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9486 // build_vector and broadcast it.
9487 // TODO: We could probably generalize this more.
9488 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9489 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9490 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9491 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9492 // Make sure all the even/odd operands match.
9493 for (unsigned i = 2; i != NumElems; ++i)
9494 if (Ops[i % 2] != Op.getOperand(i))
9495 return false;
9496 return true;
9497 };
9498 if (CanSplat(Op, NumElems, Ops)) {
9499 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9500 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9501 // Create a new build vector and cast to v2i64/v2f64.
9502 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9503 DAG.getBuildVector(NarrowVT, dl, Ops));
9504 // Broadcast from v2i64/v2f64 and cast to final VT.
9505 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9506 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9507 NewBV));
9508 }
9509 }
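// Example of the pair-splat path above (a sketch): a v8f32 build_vector
// (x, y, x, y, x, y, x, y) is rebuilt as the v4f32 vector (x, y, undef, undef),
// bitcast to v2f64, broadcast to v4f64 with X86ISD::VBROADCAST, and bitcast
// back to v8f32.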
9510
9511 // For AVX-length vectors, build the individual 128-bit pieces and use
9512 // shuffles to put them in place.
9513 if (VT.getSizeInBits() > 128) {
9514 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9515
9516 // Build both the lower and upper subvector.
9517 SDValue Lower =
9518 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9519 SDValue Upper = DAG.getBuildVector(
9520 HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
9521
9522 // Recreate the wider vector with the lower and upper part.
9523 return concatSubVectors(Lower, Upper, DAG, dl);
9524 }
9525
9526 // Let legalizer expand 2-wide build_vectors.
9527 if (EVTBits == 64) {
9528 if (NumNonZero == 1) {
9529 // One half is zero or undef.
9530 unsigned Idx = NonZeroMask.countr_zero();
9531 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9532 Op.getOperand(Idx));
9533 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9534 }
9535 return SDValue();
9536 }
9537
9538 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9539 if (EVTBits == 8 && NumElems == 16)
9540 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9541 NumZero, DAG, Subtarget))
9542 return V;
9543
9544 if (EltVT == MVT::i16 && NumElems == 8)
9545 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9546 NumZero, DAG, Subtarget))
9547 return V;
9548
9549 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9550 if (EVTBits == 32 && NumElems == 4)
9551 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9552 return V;
9553
9554 // If element VT is == 32 bits, turn it into a number of shuffles.
9555 if (NumElems == 4 && NumZero > 0) {
9556 SmallVector<SDValue, 8> Ops(NumElems);
9557 for (unsigned i = 0; i < 4; ++i) {
9558 bool isZero = !NonZeroMask[i];
9559 if (isZero)
9560 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9561 else
9562 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9563 }
9564
9565 for (unsigned i = 0; i < 2; ++i) {
9566 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9567 default: llvm_unreachable("Unexpected NonZero count");
9568 case 0:
9569 Ops[i] = Ops[i*2]; // Must be a zero vector.
9570 break;
9571 case 1:
9572 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9573 break;
9574 case 2:
9575 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9576 break;
9577 case 3:
9578 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9579 break;
9580 }
9581 }
9582
9583 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9584 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9585 int MaskVec[] = {
9586 Reverse1 ? 1 : 0,
9587 Reverse1 ? 0 : 1,
9588 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9589 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9590 };
9591 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9592 }
9593
9594 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9595
9596 // Check for a build vector from mostly shuffle plus few inserting.
9597 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9598 return Sh;
9599
9600 // For SSE 4.1, use insertps to put the high elements into the low element.
9601 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9602 SDValue Result;
9603 if (!Op.getOperand(0).isUndef())
9604 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9605 else
9606 Result = DAG.getUNDEF(VT);
9607
9608 for (unsigned i = 1; i < NumElems; ++i) {
9609 if (Op.getOperand(i).isUndef()) continue;
9610 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9611 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9612 }
9613 return Result;
9614 }
9615
9616 // Otherwise, expand into a number of unpckl*, start by extending each of
9617 // our (non-undef) elements to the full vector width with the element in the
9618 // bottom slot of the vector (which generates no code for SSE).
9619 SmallVector<SDValue, 8> Ops(NumElems);
9620 for (unsigned i = 0; i < NumElems; ++i) {
9621 if (!Op.getOperand(i).isUndef())
9622 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9623 else
9624 Ops[i] = DAG.getUNDEF(VT);
9625 }
9626
9627 // Next, we iteratively mix elements, e.g. for v4f32:
9628 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9629 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9630 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9631 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9632 // Generate scaled UNPCKL shuffle mask.
9633 SmallVector<int, 16> Mask;
9634 for(unsigned i = 0; i != Scale; ++i)
9635 Mask.push_back(i);
9636 for (unsigned i = 0; i != Scale; ++i)
9637 Mask.push_back(NumElems+i);
9638 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9639
9640 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9641 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9642 }
9643 return Ops[0];
9644}
9645
9646// 256-bit AVX can use the vinsertf128 instruction
9647// to create 256-bit vectors from two other 128-bit ones.
9648// TODO: Detect subvector broadcast here instead of DAG combine?
9649 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
9650 SelectionDAG &DAG,
9651 const X86Subtarget &Subtarget) {
9652 MVT ResVT = Op.getSimpleValueType();
9653 assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
9654 "Value type must be 256-/512-bit wide");
9655
9656 unsigned NumOperands = Op.getNumOperands();
9657 unsigned NumFreezeUndef = 0;
9658 unsigned NumZero = 0;
9659 unsigned NumNonZero = 0;
9660 unsigned NonZeros = 0;
9661 SmallSet<SDValue, 4> Undefs;
9662 for (unsigned i = 0; i != NumOperands; ++i) {
9663 SDValue SubVec = Op.getOperand(i);
9664 if (SubVec.isUndef())
9665 continue;
9666 if (ISD::isFreezeUndef(SubVec.getNode())) {
9667 // If the freeze(undef) has multiple uses then we must fold to zero.
9668 if (SubVec.hasOneUse()) {
9669 ++NumFreezeUndef;
9670 } else {
9671 ++NumZero;
9672 Undefs.insert(SubVec);
9673 }
9674 }
9675 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9676 ++NumZero;
9677 else {
9678 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9679 NonZeros |= 1 << i;
9680 ++NumNonZero;
9681 }
9682 }
9683
9684 // If we have more than 2 non-zeros, build each half separately.
9685 if (NumNonZero > 2) {
9686 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9687 ArrayRef<SDUse> Ops = Op->ops();
9688 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9689 Ops.slice(0, NumOperands/2));
9690 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9691 Ops.slice(NumOperands/2));
9692 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9693 }
9694
9695 // Otherwise, build it up through insert_subvectors.
9696 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9697 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9698 : DAG.getUNDEF(ResVT));
9699
9700 // Replace Undef operands with ZeroVector.
9701 for (SDValue U : Undefs)
9702 DAG.ReplaceAllUsesWith(
9703 U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
9704
9705 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9706 unsigned NumSubElems = SubVT.getVectorNumElements();
9707 for (unsigned i = 0; i != NumOperands; ++i) {
9708 if ((NonZeros & (1 << i)) == 0)
9709 continue;
9710
9711 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9712 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9713 }
9714
9715 return Vec;
9716}
9717
9718// Returns true if the given node is a type promotion (by concatenating i1
9719// zeros) of the result of a node that already zeros all upper bits of
9720// k-register.
9721// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9722 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
9723 const X86Subtarget &Subtarget,
9724 SelectionDAG & DAG) {
9725 MVT ResVT = Op.getSimpleValueType();
9726 unsigned NumOperands = Op.getNumOperands();
9727 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9728 "Unexpected number of operands in CONCAT_VECTORS");
9729
9730 uint64_t Zeros = 0;
9731 uint64_t NonZeros = 0;
9732 for (unsigned i = 0; i != NumOperands; ++i) {
9733 SDValue SubVec = Op.getOperand(i);
9734 if (SubVec.isUndef())
9735 continue;
9736 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9737 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9738 Zeros |= (uint64_t)1 << i;
9739 else
9740 NonZeros |= (uint64_t)1 << i;
9741 }
9742
9743 unsigned NumElems = ResVT.getVectorNumElements();
9744
9745 // If we are inserting a non-zero vector and there are zeros in the LSBs and undef
9746 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9747 // insert_subvector will give us two kshifts.
9748 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9749 Log2_64(NonZeros) != NumOperands - 1) {
9750 unsigned Idx = Log2_64(NonZeros);
9751 SDValue SubVec = Op.getOperand(Idx);
9752 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9753 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9754 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9755 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9756 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9757 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9758 DAG.getVectorIdxConstant(0, dl));
9759 }
9760
9761 // If there are zero or one non-zeros we can handle this very simply.
9762 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9763 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9764 if (!NonZeros)
9765 return Vec;
9766 unsigned Idx = Log2_64(NonZeros);
9767 SDValue SubVec = Op.getOperand(Idx);
9768 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9769 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9770 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9771 }
9772
9773 if (NumOperands > 2) {
9774 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9775 ArrayRef<SDUse> Ops = Op->ops();
9776 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9777 Ops.slice(0, NumOperands / 2));
9778 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9779 Ops.slice(NumOperands / 2));
9780 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9781 }
9782
9783 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9784
9785 if (ResVT.getVectorNumElements() >= 16)
9786 return Op; // The operation is legal with KUNPCK
9787
9788 SDValue Vec =
9789 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9790 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9791 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9792 DAG.getVectorIdxConstant(NumElems / 2, dl));
9793}
9794
9795 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9796 const X86Subtarget &Subtarget,
9797 SelectionDAG &DAG) {
9798 SDLoc DL(Op);
9799 MVT VT = Op.getSimpleValueType();
9800 if (VT.getVectorElementType() == MVT::i1)
9801 return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
9802
9803 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9804 // from two other 128-bit ones.
9805 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9806 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9807 (VT.is512BitVector() &&
9808 (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
9809 return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
9810}
9811
9812//===----------------------------------------------------------------------===//
9813// Vector shuffle lowering
9814//
9815// This is an experimental code path for lowering vector shuffles on x86. It is
9816// designed to handle arbitrary vector shuffles and blends, gracefully
9817// degrading performance as necessary. It works hard to recognize idiomatic
9818// shuffles and lower them to optimal instruction patterns without leaving
9819// a framework that allows reasonably efficient handling of all vector shuffle
9820// patterns.
9821//===----------------------------------------------------------------------===//
9822
9823/// Checks whether the vector elements referenced by two shuffle masks are
9824/// equivalent.
9825static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9826 int Idx, int ExpectedIdx) {
9827 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9828 ExpectedIdx < MaskSize && "Out of range element index");
9829 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9830 return false;
9831
9832 EVT VT = Op.getValueType();
9833 EVT ExpectedVT = ExpectedOp.getValueType();
9834
9835 // Sources must be vectors and match the mask's element count.
9836 if (!VT.isVector() || !ExpectedVT.isVector() ||
9837 (int)VT.getVectorNumElements() != MaskSize ||
9838 (int)ExpectedVT.getVectorNumElements() != MaskSize)
9839 return false;
9840
9841 // Exact match.
9842 if (Idx == ExpectedIdx && Op == ExpectedOp)
9843 return true;
9844
9845 switch (Op.getOpcode()) {
9846 case ISD::BUILD_VECTOR:
9847 // If the values are build vectors, we can look through them to find
9848 // equivalent inputs that make the shuffles equivalent.
9849 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9850 case ISD::BITCAST: {
9851 SDValue Src = peekThroughBitcasts(Op);
9852 EVT SrcVT = Src.getValueType();
9853 if (Op == ExpectedOp && SrcVT.isVector()) {
9854 if ((SrcVT.getScalarSizeInBits() % VT.getScalarSizeInBits()) == 0) {
9855 unsigned Scale = SrcVT.getScalarSizeInBits() / VT.getScalarSizeInBits();
9856 return (Idx % Scale) == (ExpectedIdx % Scale) &&
9857 IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9858 Idx / Scale, ExpectedIdx / Scale);
9859 }
9860 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0) {
9861 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
9862 for (unsigned I = 0; I != Scale; ++I)
9863 if (!IsElementEquivalent(SrcVT.getVectorNumElements(), Src, Src,
9864 (Idx * Scale) + I,
9865 (ExpectedIdx * Scale) + I))
9866 return false;
9867 return true;
9868 }
9869 }
9870 break;
9871 }
9872 case ISD::VECTOR_SHUFFLE: {
9873 auto *SVN = cast<ShuffleVectorSDNode>(Op);
9874 return Op == ExpectedOp &&
9875 SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
9876 }
9877 case X86ISD::VBROADCAST:
9878 case X86ISD::VBROADCAST_LOAD:
9879 return Op == ExpectedOp;
9880 case X86ISD::SUBV_BROADCAST_LOAD:
9881 if (Op == ExpectedOp) {
9882 auto *MemOp = cast<MemSDNode>(Op);
9883 unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
9884 return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
9885 }
9886 break;
9887 case X86ISD::VPERMI: {
 9888    if (Op == ExpectedOp) {
 9889      SmallVector<int, 8> Mask;
9890 DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask);
9891 SDValue Src = Op.getOperand(0);
9892 return IsElementEquivalent(MaskSize, Src, Src, Mask[Idx],
9893 Mask[ExpectedIdx]);
9894 }
9895 break;
9896 }
9897 case X86ISD::HADD:
9898 case X86ISD::HSUB:
9899 case X86ISD::FHADD:
9900 case X86ISD::FHSUB:
9901 case X86ISD::PACKSS:
9902 case X86ISD::PACKUS:
9903 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9904 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9905 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9906 int NumElts = VT.getVectorNumElements();
9907 int NumLanes = VT.getSizeInBits() / 128;
9908 int NumEltsPerLane = NumElts / NumLanes;
9909 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9910 bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9911 bool SameElt =
9912 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9913 return SameLane && SameElt;
9914 }
9915 break;
9916 }
9917
9918 return false;
9919}
9920
9921/// Tiny helper function to identify a no-op mask.
9922///
9923/// This is a somewhat boring predicate function. It checks whether the mask
9924/// array input, which is assumed to be a single-input shuffle mask of the kind
9925/// used by the X86 shuffle instructions (not a fully general
9926/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9927/// in-place shuffle are 'no-op's.
 9928 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
 9929   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9930 assert(Mask[i] >= -1 && "Out of bound mask element!");
9931 if (Mask[i] >= 0 && Mask[i] != i)
9932 return false;
9933 }
9934 return true;
9935}
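// Worked example: for a 4-element mask, {0, -1, 2, 3} is a no-op (the undef
// slot may keep its input element), while {1, 0, 2, 3} is not, since element 0
// would have to move.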
9936
9937/// Test whether there are elements crossing LaneSizeInBits lanes in this
9938/// shuffle mask.
9939///
9940/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9941/// and we routinely test for these.
9942static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9943 unsigned ScalarSizeInBits,
9944 ArrayRef<int> Mask) {
9945 assert(LaneSizeInBits && ScalarSizeInBits &&
9946 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9947 "Illegal shuffle lane size");
9948 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9949 int Size = Mask.size();
9950 for (int i = 0; i < Size; ++i)
9951 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9952 return true;
9953 return false;
9954}
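// Worked example: for v8f32 (four 32-bit elements per 128-bit lane), the mask
// {1, 0, 3, 2, 5, 4, 7, 6} stays in-lane, but a mask that puts source element
// 4 into slot 0 crosses from lane 1 into lane 0 and so returns true.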
9955
9956/// Test whether there are elements crossing 128-bit lanes in this
9957/// shuffle mask.
 9958 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
 9959   return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9960}
9961
9962/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9963/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9964/// better support 'repeated mask + lane permute' style shuffles.
9965static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9966 unsigned ScalarSizeInBits,
9967 ArrayRef<int> Mask) {
9968 assert(LaneSizeInBits && ScalarSizeInBits &&
9969 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9970 "Illegal shuffle lane size");
9971 int NumElts = Mask.size();
9972 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9973 int NumLanes = NumElts / NumEltsPerLane;
9974 if (NumLanes > 1) {
9975 for (int i = 0; i != NumLanes; ++i) {
9976 int SrcLane = -1;
9977 for (int j = 0; j != NumEltsPerLane; ++j) {
9978 int M = Mask[(i * NumEltsPerLane) + j];
9979 if (M < 0)
9980 continue;
9981 int Lane = (M % NumElts) / NumEltsPerLane;
9982 if (SrcLane >= 0 && SrcLane != Lane)
9983 return true;
9984 SrcLane = Lane;
9985 }
9986 }
9987 }
9988 return false;
9989}
9990
9991/// Test whether a shuffle mask is equivalent within each sub-lane.
9992///
9993/// This checks a shuffle mask to see if it is performing the same
9994/// lane-relative shuffle in each sub-lane. This trivially implies
9995/// that it is also not lane-crossing. It may however involve a blend from the
9996/// same lane of a second vector.
9997///
9998/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9999/// non-trivial to compute in the face of undef lanes. The representation is
10000/// suitable for use with existing 128-bit shuffles as entries from the second
10001/// vector have been remapped to [LaneSize, 2*LaneSize).
10002static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
10003 ArrayRef<int> Mask,
10004 SmallVectorImpl<int> &RepeatedMask) {
10005 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
10006 RepeatedMask.assign(LaneSize, -1);
10007 int Size = Mask.size();
10008 for (int i = 0; i < Size; ++i) {
10009 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
10010 if (Mask[i] < 0)
10011 continue;
10012 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10013 // This entry crosses lanes, so there is no way to model this shuffle.
10014 return false;
10015
10016 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
10017 // Adjust second vector indices to start at LaneSize instead of Size.
10018 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
10019 : Mask[i] % LaneSize + LaneSize;
10020 if (RepeatedMask[i % LaneSize] < 0)
10021 // This is the first non-undef entry in this slot of a 128-bit lane.
10022 RepeatedMask[i % LaneSize] = LocalM;
10023 else if (RepeatedMask[i % LaneSize] != LocalM)
10024 // Found a mismatch with the repeated mask.
10025 return false;
10026 }
10027 return true;
10028}
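// Worked example: for v8i32 with 128-bit lanes, the two-input mask
// {0, 9, 2, 11, 4, 13, 6, 15} repeats per lane and yields the 4-element
// RepeatedMask {0, 5, 2, 7} (second-input entries are remapped into
// [LaneSize, 2*LaneSize) = [4, 8)).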
10029
10030/// Test whether a shuffle mask is equivalent within each 128-bit lane.
10031static bool
10032 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10033                                 SmallVectorImpl<int> &RepeatedMask) {
10034 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10035}
10036
10037static bool
10038 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
10039   SmallVector<int, 32> RepeatedMask;
10040 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
10041}
10042
10043/// Test whether a shuffle mask is equivalent within each 256-bit lane.
10044static bool
10045 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
10046                                 SmallVectorImpl<int> &RepeatedMask) {
10047 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
10048}
10049
10050/// Test whether a target shuffle mask is equivalent within each sub-lane.
10051/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10052static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
10053 unsigned EltSizeInBits,
10054 ArrayRef<int> Mask,
10055 SmallVectorImpl<int> &RepeatedMask) {
10056 int LaneSize = LaneSizeInBits / EltSizeInBits;
10057 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
10058 int Size = Mask.size();
10059 for (int i = 0; i < Size; ++i) {
10060 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
10061 if (Mask[i] == SM_SentinelUndef)
10062 continue;
10063 if (Mask[i] == SM_SentinelZero) {
10064 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
10065 return false;
10066 RepeatedMask[i % LaneSize] = SM_SentinelZero;
10067 continue;
10068 }
10069 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
10070 // This entry crosses lanes, so there is no way to model this shuffle.
10071 return false;
10072
10073 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
10074 // later vector indices to start at multiples of LaneSize instead of Size.
10075 int LaneM = Mask[i] / Size;
10076 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
10077 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
10078 // This is the first non-undef entry in this slot of a 128-bit lane.
10079 RepeatedMask[i % LaneSize] = LocalM;
10080 else if (RepeatedMask[i % LaneSize] != LocalM)
10081 // Found a mismatch with the repeated mask.
10082 return false;
10083 }
10084 return true;
10085}
10086
10087/// Test whether a target shuffle mask is equivalent within each sub-lane.
10088/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
10089static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
10090 ArrayRef<int> Mask,
10091 SmallVectorImpl<int> &RepeatedMask) {
10092 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
10093 Mask, RepeatedMask);
10094}
10095
10096/// Checks whether a shuffle mask is equivalent to an explicit list of
10097/// arguments.
10098///
10099/// This is a fast way to test a shuffle mask against a fixed pattern:
10100///
10101 ///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
10102///
10103/// It returns true if the mask is exactly as wide as the argument list, and
10104/// each element of the mask is either -1 (signifying undef) or the value given
10105/// in the argument.
10106static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
10107 SDValue V1 = SDValue(),
10108 SDValue V2 = SDValue()) {
10109 int Size = Mask.size();
10110 if (Size != (int)ExpectedMask.size())
10111 return false;
10112
10113 for (int i = 0; i < Size; ++i) {
10114 assert(Mask[i] >= -1 && "Out of bound mask element!");
10115 int MaskIdx = Mask[i];
10116 int ExpectedIdx = ExpectedMask[i];
10117 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
10118 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10119 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10120 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10121 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10122 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10123 return false;
10124 }
10125 }
10126 return true;
10127}
10128
10129/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
10130///
10131/// The masks must be exactly the same width.
10132///
10133/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
10134/// value in ExpectedMask is always accepted. Otherwise the indices must match.
10135///
10136/// SM_SentinelZero is accepted as a valid negative index but must match in
10137/// both, or via a known bits test.
10138 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
10139                                       ArrayRef<int> ExpectedMask,
10140 const SelectionDAG &DAG,
10141 SDValue V1 = SDValue(),
10142 SDValue V2 = SDValue()) {
10143 int Size = Mask.size();
10144 if (Size != (int)ExpectedMask.size())
10145 return false;
10146 assert(llvm::all_of(ExpectedMask,
10147 [Size](int M) {
10148 return M == SM_SentinelZero ||
10149 isInRange(M, 0, 2 * Size);
10150 }) &&
10151 "Illegal target shuffle mask");
10152
10153 // Check for out-of-range target shuffle mask indices.
10154 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
10155 return false;
10156
10157 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
10158 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
10159 !V1.getValueType().isVector()))
10160 V1 = SDValue();
10161 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
10162 !V2.getValueType().isVector()))
10163 V2 = SDValue();
10164
10165 APInt ZeroV1 = APInt::getZero(Size);
10166 APInt ZeroV2 = APInt::getZero(Size);
10167
10168 for (int i = 0; i < Size; ++i) {
10169 int MaskIdx = Mask[i];
10170 int ExpectedIdx = ExpectedMask[i];
10171 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
10172 continue;
10173 // If we failed to match an expected SM_SentinelZero then early out.
10174 if (ExpectedIdx < 0)
10175 return false;
10176 if (MaskIdx == SM_SentinelZero) {
10177 // If we need this expected index to be a zero element, then update the
10178 // relevant zero mask and perform the known bits at the end to minimize
10179 // repeated computes.
10180 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10181 if (ExpectedV &&
10182 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
10183 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10184 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
10185 ZeroMask.setBit(BitIdx);
10186 continue;
10187 }
10188 }
10189 if (MaskIdx >= 0) {
10190 SDValue MaskV = MaskIdx < Size ? V1 : V2;
10191 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
10192 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
10193 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
10194 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
10195 continue;
10196 }
10197 return false;
10198 }
10199 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
10200 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
10201}
10202
10203// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
10204// instructions.
10205 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
10206                                   const SelectionDAG &DAG) {
10207 if (VT != MVT::v8i32 && VT != MVT::v8f32)
10208 return false;
10209
10210 SmallVector<int, 8> Unpcklwd;
10211 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
10212 /* Unary = */ false);
10213 SmallVector<int, 8> Unpckhwd;
10214 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
10215 /* Unary = */ false);
10216 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
10217 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
10218 return IsUnpackwdMask;
10219}
10220
10221 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
10222                                       const SelectionDAG &DAG) {
10223 // Create 128-bit vector type based on mask size.
10224 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
10225 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
10226
10227 // We can't assume a canonical shuffle mask, so try the commuted version too.
10228   SmallVector<int, 4> CommutedMask(Mask);
10229   ShuffleVectorSDNode::commuteMask(CommutedMask);
10230
10231 // Match any of unary/binary or low/high.
10232 for (unsigned i = 0; i != 4; ++i) {
10233 SmallVector<int, 16> UnpackMask;
10234 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
10235 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
10236 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
10237 return true;
10238 }
10239 return false;
10240}
10241
10242/// Return true if a shuffle mask chooses elements identically in its top and
10243/// bottom halves. For example, any splat mask has the same top and bottom
10244/// halves. If an element is undefined in only one half of the mask, the halves
10245/// are not considered identical.
10246 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
10247   assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
10248 unsigned HalfSize = Mask.size() / 2;
10249 for (unsigned i = 0; i != HalfSize; ++i) {
10250 if (Mask[i] != Mask[i + HalfSize])
10251 return false;
10252 }
10253 return true;
10254}
10255
10256/// Get a 4-lane 8-bit shuffle immediate for a mask.
10257///
10258/// This helper function produces an 8-bit shuffle immediate corresponding to
10259/// the ubiquitous shuffle encoding scheme used in x86 instructions for
10260/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
10261/// example.
10262///
10263/// NB: We rely heavily on "undef" masks preserving the input lane.
10264static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
10265 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
10266 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10267 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10268 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10269 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10270
10271 // If the mask only uses one non-undef element, then fully 'splat' it to
10272 // improve later broadcast matching.
10273 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10274 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10275
10276 int FirstElt = Mask[FirstIndex];
10277 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10278 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10279
10280 unsigned Imm = 0;
10281 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10282 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10283 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10284 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10285 return Imm;
10286}
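// Worked example: Mask {3, 1, 2, 0} encodes as 0b00'10'01'11 = 0x27, where
// each 2-bit field selects the source element for one lane (lane 0 in the low
// bits). A mask with a single defined element such as {-1, 2, -1, -1} is
// splatted to 0xAA so that it can later be matched as a broadcast.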
10287
10288 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10289                                           SelectionDAG &DAG) {
10290 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10291}
10292
10293// Canonicalize SHUFPD mask to improve chances of further folding.
10294// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
10295static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10296 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10297 "Unexpected SHUFPD mask size");
10298 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10299 "Unexpected SHUFPD mask elements");
10300
10301 // If the mask only uses one non-undef element, then fully 'splat' it to
10302 // improve later broadcast matching.
10303 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10304 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10305 "All undef shuffle mask");
10306
10307 int FirstElt = Mask[FirstIndex];
10308 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10309 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10310 unsigned Imm = 0;
10311 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10312 Imm |= FirstElt << I;
10313 return Imm;
10314 }
10315
10316 // Attempt to keep any undef elements in place to improve chances of the
10317 // shuffle becoming a (commutative) blend.
10318 unsigned Imm = 0;
10319 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10320 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10321
10322 return Imm;
10323}
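// Worked example: the v4f64 mask {1, -1, 0, 1} produces Imm = 0b1011 (the
// undef element keeps its natural lo/hi bit, I & 1), while a splat-style mask
// such as {1, -1, 1, -1} is widened to 0b1111 to help broadcast matching.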
10324
10325 static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10326                                    SelectionDAG &DAG) {
10327 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10328}
10329
10330// The shuffle result is as follows:
10331// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending
10332// order and "0*" denotes a (possibly empty) run of zero elements.
10333// Each element of Zeroable corresponds to a particular element of Mask, as
10334// described in the computeZeroableShuffleElements function.
10335// The function looks for a sub-mask whose nonzero elements are in
10336// increasing order; if such a sub-mask exists, the function returns true.
10337static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10338 ArrayRef<int> Mask, const EVT &VectorType,
10339 bool &IsZeroSideLeft) {
10340 int NextElement = -1;
10341 // Check if the Mask's nonzero elements are in increasing order.
10342 for (int i = 0, e = Mask.size(); i < e; i++) {
10343     // Check that the mask's zeroable elements are built from only zeros.
10344 assert(Mask[i] >= -1 && "Out of bound mask element!");
10345 if (Mask[i] < 0)
10346 return false;
10347 if (Zeroable[i])
10348 continue;
10349 // Find the lowest non zero element
10350 if (NextElement < 0) {
10351 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10352 IsZeroSideLeft = NextElement != 0;
10353 }
10354 // Exit if the mask's non zero elements are not in increasing order.
10355 if (NextElement != Mask[i])
10356 return false;
10357 NextElement++;
10358 }
10359 return true;
10360}
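// Worked example: for v4i32 with Zeroable = {0, 1, 0, 0} (element 1 known
// zero), the mask {0, 7, 1, 2} is accepted: the non-zeroable entries 0, 1, 2
// appear in increasing order starting from V1, which is the pattern VEXPAND
// produces (IsZeroSideLeft = false in this case).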
10361
10362static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
10363                                       ArrayRef<SDValue> Ops, SelectionDAG &DAG,
10364                                       const X86Subtarget &Subtarget,
10365 unsigned Depth = 0);
10366
10367/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10368 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10369                                       ArrayRef<int> Mask, SDValue V1,
10370 SDValue V2, const APInt &Zeroable,
10371 const X86Subtarget &Subtarget,
10372 SelectionDAG &DAG) {
10373 int Size = Mask.size();
10374 int LaneSize = 128 / VT.getScalarSizeInBits();
10375 const int NumBytes = VT.getSizeInBits() / 8;
10376 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10377
10378 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10379 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10380 (Subtarget.hasBWI() && VT.is512BitVector()));
10381
10382 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10383 // Sign bit set in i8 mask means zero element.
10384 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10385
10386 SDValue V;
10387 for (int i = 0; i < NumBytes; ++i) {
10388 int M = Mask[i / NumEltBytes];
10389 if (M < 0) {
10390 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10391 continue;
10392 }
10393 if (Zeroable[i / NumEltBytes]) {
10394 PSHUFBMask[i] = ZeroMask;
10395 continue;
10396 }
10397
10398 // We can only use a single input of V1 or V2.
10399 SDValue SrcV = (M >= Size ? V2 : V1);
10400 if (V && V != SrcV)
10401 return SDValue();
10402 V = SrcV;
10403 M %= Size;
10404
10405 // PSHUFB can't cross lanes, ensure this doesn't happen.
10406 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10407 return SDValue();
10408
10409 M = M % LaneSize;
10410 M = M * NumEltBytes + (i % NumEltBytes);
10411 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10412 }
10413 assert(V && "Failed to find a source input");
10414
10415 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10416 return DAG.getBitcast(
10417 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10418 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10419}
10420
10421static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10422 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10423 const SDLoc &dl);
10424
10425// X86 has dedicated shuffle that can be lowered to VEXPAND
10426 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10427                                     SDValue V2, ArrayRef<int> Mask,
10428 const APInt &Zeroable,
10429 const X86Subtarget &Subtarget,
10430 SelectionDAG &DAG) {
10431 bool IsLeftZeroSide = true;
10432 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10433 IsLeftZeroSide))
10434 return SDValue();
10435 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10436   MVT IntegerType =
10437       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10438 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10439 unsigned NumElts = VT.getVectorNumElements();
10440 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10441 "Unexpected number of vector elements");
10442 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10443 Subtarget, DAG, DL);
10444 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10445 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10446 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10447}
10448
10449static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10450 unsigned &UnpackOpcode, bool IsUnary,
10451 ArrayRef<int> TargetMask, const SDLoc &DL,
10452 SelectionDAG &DAG,
10453 const X86Subtarget &Subtarget) {
10454 int NumElts = VT.getVectorNumElements();
10455
10456 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10457 for (int i = 0; i != NumElts; i += 2) {
10458 int M1 = TargetMask[i + 0];
10459 int M2 = TargetMask[i + 1];
10460 Undef1 &= (SM_SentinelUndef == M1);
10461 Undef2 &= (SM_SentinelUndef == M2);
10462 Zero1 &= isUndefOrZero(M1);
10463 Zero2 &= isUndefOrZero(M2);
10464 }
10465 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10466 "Zeroable shuffle detected");
10467
10468 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10469 SmallVector<int, 64> Unpckl, Unpckh;
10470 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10471 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10472 (IsUnary ? V1 : V2))) {
10473 UnpackOpcode = X86ISD::UNPCKL;
10474 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10475 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10476 return true;
10477 }
10478
10479 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10480 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10481 (IsUnary ? V1 : V2))) {
10482 UnpackOpcode = X86ISD::UNPCKH;
10483 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10484 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10485 return true;
10486 }
10487
10488 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10489 if (IsUnary && (Zero1 || Zero2)) {
10490 // Don't bother if we can blend instead.
10491 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10492 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10493 return false;
10494
10495 bool MatchLo = true, MatchHi = true;
10496 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10497 int M = TargetMask[i];
10498
10499 // Ignore if the input is known to be zero or the index is undef.
10500 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10501 (M == SM_SentinelUndef))
10502 continue;
10503
10504 MatchLo &= (M == Unpckl[i]);
10505 MatchHi &= (M == Unpckh[i]);
10506 }
10507
10508 if (MatchLo || MatchHi) {
10509 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10510 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10511 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10512 return true;
10513 }
10514 }
10515
10516 // If a binary shuffle, commute and try again.
10517   if (!IsUnary) {
10518     ShuffleVectorSDNode::commuteMask(Unpckl);
10519 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10520 UnpackOpcode = X86ISD::UNPCKL;
10521 std::swap(V1, V2);
10522 return true;
10523 }
10524
10525     ShuffleVectorSDNode::commuteMask(Unpckh);
10526     if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10527 UnpackOpcode = X86ISD::UNPCKH;
10528 std::swap(V1, V2);
10529 return true;
10530 }
10531 }
10532
10533 return false;
10534}
10535
10536// X86 has dedicated unpack instructions that can handle specific blend
10537// operations: UNPCKH and UNPCKL.
10538 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10539                                      SDValue V2, ArrayRef<int> Mask,
10540 SelectionDAG &DAG) {
10541 SmallVector<int, 8> Unpckl;
10542 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10543 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10544 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10545
10546 SmallVector<int, 8> Unpckh;
10547 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10548 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10549 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10550
10551 // Commute and try again.
10552   ShuffleVectorSDNode::commuteMask(Unpckl);
10553   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10554 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10555
10556   ShuffleVectorSDNode::commuteMask(Unpckh);
10557   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10558 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10559
10560 return SDValue();
10561}
10562
10563/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10564/// followed by unpack 256-bit.
10565 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10566                                         SDValue V2, ArrayRef<int> Mask,
10567 SelectionDAG &DAG) {
10568 SmallVector<int, 32> Unpckl, Unpckh;
10569 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10570 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10571
10572 unsigned UnpackOpcode;
10573 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10574 UnpackOpcode = X86ISD::UNPCKL;
10575 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10576 UnpackOpcode = X86ISD::UNPCKH;
10577 else
10578 return SDValue();
10579
10580 // This is a "natural" unpack operation (rather than the 128-bit sectored
10581 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10582 // input in order to use the x86 instruction.
10583 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10584 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10585 V1 = DAG.getBitcast(VT, V1);
10586 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10587}
10588
10589// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10590// source into the lower elements and zeroing the upper elements.
10591static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10592 ArrayRef<int> Mask, const APInt &Zeroable,
10593 const X86Subtarget &Subtarget) {
10594 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10595 return false;
10596
10597 unsigned NumElts = Mask.size();
10598 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10599 unsigned MaxScale = 64 / EltSizeInBits;
10600
10601 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10602 unsigned SrcEltBits = EltSizeInBits * Scale;
10603 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10604 continue;
10605 unsigned NumSrcElts = NumElts / Scale;
10606 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10607 continue;
10608 unsigned UpperElts = NumElts - NumSrcElts;
10609 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10610 continue;
10611 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10612 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10613 DstVT = MVT::getIntegerVT(EltSizeInBits);
10614 if ((NumSrcElts * EltSizeInBits) >= 128) {
10615 // ISD::TRUNCATE
10616 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10617 } else {
10618 // X86ISD::VTRUNC
10619 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10620 }
10621 return true;
10622 }
10623
10624 return false;
10625}
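// Worked example: for VT = v16i8 and Mask {0, 2, 4, 6, 8, 10, 12, 14} with the
// upper eight elements zeroable, Scale = 2 gives SrcVT = v8i16; since 8 x i8 is
// only 64 bits, DstVT becomes the 128-bit v16i8 form used with X86ISD::VTRUNC
// (and an i8 destination at this scale requires AVX512BW).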
10626
10627// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10628// element padding to the final DstVT.
10629static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10630 const X86Subtarget &Subtarget,
10631 SelectionDAG &DAG, bool ZeroUppers) {
10632 MVT SrcVT = Src.getSimpleValueType();
10633 MVT DstSVT = DstVT.getScalarType();
10634 unsigned NumDstElts = DstVT.getVectorNumElements();
10635 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10636 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10637
10638 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10639 return SDValue();
10640
10641 // Perform a direct ISD::TRUNCATE if possible.
10642 if (NumSrcElts == NumDstElts)
10643 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10644
10645 if (NumSrcElts > NumDstElts) {
10646 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10647 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10648 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10649 }
10650
10651 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10652 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10653 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10654 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10655 DstVT.getSizeInBits());
10656 }
10657
10658 // Non-VLX targets must truncate from a 512-bit type, so we need to
10659 // widen, truncate and then possibly extract the original subvector.
10660 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10661 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10662 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10663 }
10664
10665 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10666 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10667 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10668 if (DstVT != TruncVT)
10669 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10670 DstVT.getSizeInBits());
10671 return Trunc;
10672}
10673
10674// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10675//
10676// An example is the following:
10677//
10678// t0: ch = EntryToken
10679// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10680// t25: v4i32 = truncate t2
10681// t41: v8i16 = bitcast t25
10682// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10683// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10684// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10685// t18: v2i64 = bitcast t51
10686//
10687// One can just use a single vpmovdw instruction; without avx512vl we need to
10688// use the zmm variant and extract the lower subvector, padding with zeroes.
10689// TODO: Merge with lowerShuffleAsVTRUNC.
10690 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10691                                      SDValue V2, ArrayRef<int> Mask,
10692 const APInt &Zeroable,
10693 const X86Subtarget &Subtarget,
10694 SelectionDAG &DAG) {
10695 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10696 if (!Subtarget.hasAVX512())
10697 return SDValue();
10698
10699 unsigned NumElts = VT.getVectorNumElements();
10700 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10701 unsigned MaxScale = 64 / EltSizeInBits;
10702 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10703 unsigned SrcEltBits = EltSizeInBits * Scale;
10704 unsigned NumSrcElts = NumElts / Scale;
10705 unsigned UpperElts = NumElts - NumSrcElts;
10706 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10707 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10708 continue;
10709
10710 // Attempt to find a matching source truncation, but as a fall back VLX
10711 // cases can use the VPMOV directly.
10712 SDValue Src = peekThroughBitcasts(V1);
10713 if (Src.getOpcode() == ISD::TRUNCATE &&
10714 Src.getScalarValueSizeInBits() == SrcEltBits) {
10715 Src = Src.getOperand(0);
10716 } else if (Subtarget.hasVLX()) {
10717 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10718 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10719 Src = DAG.getBitcast(SrcVT, Src);
10720 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10721 if (Scale == 2 &&
10722 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10723 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10724 return SDValue();
10725 } else
10726 return SDValue();
10727
10728 // VPMOVWB is only available with avx512bw.
10729 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10730 return SDValue();
10731
10732 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10733 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10734 }
10735
10736 return SDValue();
10737}
10738
10739// Attempt to match binary shuffle patterns as a truncate.
10740 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10741                                     SDValue V2, ArrayRef<int> Mask,
10742 const APInt &Zeroable,
10743 const X86Subtarget &Subtarget,
10744 SelectionDAG &DAG) {
10745 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10746 "Unexpected VTRUNC type");
10747 if (!Subtarget.hasAVX512() ||
10748 (VT.is256BitVector() && !Subtarget.useAVX512Regs()))
10749 return SDValue();
10750
10751 unsigned NumElts = VT.getVectorNumElements();
10752 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10753 unsigned MaxScale = 64 / EltSizeInBits;
10754 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10755 // TODO: Support non-BWI VPMOVWB truncations?
10756 unsigned SrcEltBits = EltSizeInBits * Scale;
10757 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10758 continue;
10759
10760 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10761 // Bail if the V2 elements are undef.
10762 unsigned NumHalfSrcElts = NumElts / Scale;
10763 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10764 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10765 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10766 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10767 continue;
10768
10769 // The elements beyond the truncation must be undef/zero.
10770 unsigned UpperElts = NumElts - NumSrcElts;
10771 if (UpperElts > 0 &&
10772 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10773 continue;
10774 bool UndefUppers =
10775 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10776
10777 // As we're using both sources then we need to concat them together
10778 // and truncate from the double-sized src.
10779 MVT ConcatVT = VT.getDoubleNumVectorElementsVT();
10780
10781 // For offset truncations, ensure that the concat is cheap.
10782 SDValue Src =
10783 combineConcatVectorOps(DL, ConcatVT, {V1, V2}, DAG, Subtarget);
10784 if (!Src) {
10785 if (Offset)
10786 continue;
10787 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10788 }
10789
10790 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10791 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10792 Src = DAG.getBitcast(SrcVT, Src);
10793
10794 // Shift the offset'd elements into place for the truncation.
10795 // TODO: Use getTargetVShiftByConstNode.
10796 if (Offset)
10797 Src = DAG.getNode(
10798 X86ISD::VSRLI, DL, SrcVT, Src,
10799 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10800
10801 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10802 }
10803 }
10804
10805 return SDValue();
10806}
10807
10808/// Check whether a compaction lowering can be done by dropping even/odd
10809/// elements and compute how many times even/odd elements must be dropped.
10810///
10811/// This handles shuffles which take every Nth element where N is a power of
10812/// two. Example shuffle masks:
10813///
10814/// (even)
10815/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10816/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10817/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10818/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10819/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10820/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10821///
10822/// (odd)
10823/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10824/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10825///
10826/// Any of these lanes can of course be undef.
10827///
10828/// This routine only supports N <= 3.
10829/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10830/// for larger N.
10831///
10832/// \returns N above, or the number of times even/odd elements must be dropped
10833/// if there is such a number. Otherwise returns zero.
10834static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10835 bool IsSingleInput) {
10836 // The modulus for the shuffle vector entries is based on whether this is
10837 // a single input or not.
10838 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10839 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10840 "We should only be called with masks with a power-of-2 size!");
10841
10842 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10843 int Offset = MatchEven ? 0 : 1;
10844
10845 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10846 // and 2^3 simultaneously. This is because we may have ambiguity with
10847 // partially undef inputs.
10848 bool ViableForN[3] = {true, true, true};
10849
10850 for (int i = 0, e = Mask.size(); i < e; ++i) {
10851 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10852 // want.
10853 if (Mask[i] < 0)
10854 continue;
10855
10856 bool IsAnyViable = false;
10857 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10858 if (ViableForN[j]) {
10859 uint64_t N = j + 1;
10860
10861 // The shuffle mask must be equal to (i * 2^N) % M.
10862 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10863 IsAnyViable = true;
10864 else
10865 ViableForN[j] = false;
10866 }
10867 // Early exit if we exhaust the possible powers of two.
10868 if (!IsAnyViable)
10869 break;
10870 }
10871
10872 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10873 if (ViableForN[j])
10874 return j + 1;
10875
10876 // Return 0 as there is no viable power of two.
10877 return 0;
10878}
10879
10880// X86 has dedicated pack instructions that can handle specific truncation
10881// operations: PACKSS and PACKUS.
10882// Checks for compaction shuffle masks if MaxStages > 1.
10883// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10884static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10885 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10886 const SelectionDAG &DAG,
10887 const X86Subtarget &Subtarget,
10888 unsigned MaxStages = 1) {
10889 unsigned NumElts = VT.getVectorNumElements();
10890 unsigned BitSize = VT.getScalarSizeInBits();
10891 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10892 "Illegal maximum compaction");
10893
10894 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10895 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10896 unsigned NumPackedBits = NumSrcBits - BitSize;
10897 N1 = peekThroughBitcasts(N1);
10898 N2 = peekThroughBitcasts(N2);
10899 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10900 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10901 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10902 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10903 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10904 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10905 return false;
10906 if (Subtarget.hasSSE41() || BitSize == 8) {
10907 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10908 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10909 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10910 V1 = N1;
10911 V2 = N2;
10912 SrcVT = PackVT;
10913 PackOpcode = X86ISD::PACKUS;
10914 return true;
10915 }
10916 }
10917 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10918 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10919 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10920 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10921 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10922 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10923 V1 = N1;
10924 V2 = N2;
10925 SrcVT = PackVT;
10926 PackOpcode = X86ISD::PACKSS;
10927 return true;
10928 }
10929 return false;
10930 };
10931
10932 // Attempt to match against wider and wider compaction patterns.
10933 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10934 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10935 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10936
10937 // Try binary shuffle.
10938 SmallVector<int, 32> BinaryMask;
10939 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10940 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10941 if (MatchPACK(V1, V2, PackVT))
10942 return true;
10943
10944 // Try unary shuffle.
10945 SmallVector<int, 32> UnaryMask;
10946 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10947 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10948 if (MatchPACK(V1, V1, PackVT))
10949 return true;
10950 }
10951
10952 return false;
10953}
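// Worked example: for VT = v8i16 the single-stage binary pack mask is
// {0, 2, 4, 6, 8, 10, 12, 14} (the low i16 of each i32 of V1, then of V2);
// MatchPACK accepts it as PACKUSDW when the upper 16 bits of every i32 source
// element are known zero, or as PACKSSDW when each source element has more
// than 16 sign bits.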
10954
10955 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10956                                     SDValue V2, ArrayRef<int> Mask,
10957 const X86Subtarget &Subtarget,
10958 SelectionDAG &DAG) {
10959 MVT PackVT;
10960 unsigned PackOpcode;
10961 unsigned SizeBits = VT.getSizeInBits();
10962 unsigned EltBits = VT.getScalarSizeInBits();
10963 unsigned MaxStages = Log2_32(64 / EltBits);
10964 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10965 Subtarget, MaxStages))
10966 return SDValue();
10967
10968 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10969 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10970
10971 // Don't lower multi-stage packs on AVX512, truncation is better.
10972 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10973 return SDValue();
10974
10975 // Pack to the largest type possible:
10976 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10977 unsigned MaxPackBits = 16;
10978 if (CurrentEltBits > 16 &&
10979 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10980 MaxPackBits = 32;
10981
10982 // Repeatedly pack down to the target size.
10983 SDValue Res;
10984 for (unsigned i = 0; i != NumStages; ++i) {
10985 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10986 unsigned NumSrcElts = SizeBits / SrcEltBits;
10987 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10988 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10989 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10990 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10991 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10992 DAG.getBitcast(SrcVT, V2));
10993 V1 = V2 = Res;
10994 CurrentEltBits /= 2;
10995 }
10996 assert(Res && Res.getValueType() == VT &&
10997 "Failed to lower compaction shuffle");
10998 return Res;
10999}
11000
11001/// Try to emit a bitmask instruction for a shuffle.
11002///
11003/// This handles cases where we can model a blend exactly as a bitmask due to
11004/// one of the inputs being zeroable.
11005 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
11006                                      SDValue V2, ArrayRef<int> Mask,
11007 const APInt &Zeroable,
11008 const X86Subtarget &Subtarget,
11009 SelectionDAG &DAG) {
11010 MVT MaskVT = VT;
11011 MVT EltVT = VT.getVectorElementType();
11012 SDValue Zero, AllOnes;
11013 // Use f64 if i64 isn't legal.
11014 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
11015 EltVT = MVT::f64;
11016 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
11017 }
11018
11019 MVT LogicVT = VT;
11020 if (EltVT.isFloatingPoint()) {
11021 Zero = DAG.getConstantFP(0.0, DL, EltVT);
11022 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
11023 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
11024 LogicVT = MVT::getVectorVT(EltVT.changeTypeToInteger(), Mask.size());
11025 } else {
11026 Zero = DAG.getConstant(0, DL, EltVT);
11027 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11028 }
11029
11030 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
11031 SDValue V;
11032 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11033 if (Zeroable[i])
11034 continue;
11035 if (Mask[i] % Size != i)
11036 return SDValue(); // Not a blend.
11037 if (!V)
11038 V = Mask[i] < Size ? V1 : V2;
11039 else if (V != (Mask[i] < Size ? V1 : V2))
11040 return SDValue(); // Can only let one input through the mask.
11041
11042 VMaskOps[i] = AllOnes;
11043 }
11044 if (!V)
11045 return SDValue(); // No non-zeroable elements!
11046
11047 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
11048 VMask = DAG.getBitcast(LogicVT, VMask);
11049 V = DAG.getBitcast(LogicVT, V);
11050 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
11051 return DAG.getBitcast(VT, And);
11052}
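// Worked example: a v4i32 shuffle {0, u, 2, u} whose elements 1 and 3 are
// zeroable becomes a single AND of V1 with the constant {-1, 0, -1, 0}.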
11053
11054/// Try to emit a blend instruction for a shuffle using bit math.
11055///
11056/// This is used as a fallback approach when first class blend instructions are
11057/// unavailable. Currently it is only suitable for integer vectors, but could
11058/// be generalized for floating point vectors if desirable.
11059 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
11060                                       SDValue V2, ArrayRef<int> Mask,
11061 SelectionDAG &DAG) {
11062 assert(VT.isInteger() && "Only supports integer vector types!");
11063 MVT EltVT = VT.getVectorElementType();
11064 SDValue Zero = DAG.getConstant(0, DL, EltVT);
11065   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
11066   SmallVector<SDValue, 16> MaskOps;
11067 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11068 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
11069 return SDValue(); // Shuffled input!
11070 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
11071 }
11072
11073 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
11074 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
11075}
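// Worked example: the v4i32 mask {0, 5, 2, 7} builds V1Mask = {-1, 0, -1, 0},
// and getBitSelect then keeps V1 where the mask bits are set and V2 elsewhere.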
11076
11077 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
11078                                     SDValue PreservedSrc,
11079 const X86Subtarget &Subtarget,
11080 SelectionDAG &DAG);
11081
11082 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
11083                                 MutableArrayRef<int> Mask,
11084                                 const APInt &Zeroable, bool &ForceV1Zero,
11085 bool &ForceV2Zero, uint64_t &BlendMask) {
11086   bool V1IsZeroOrUndef =
11087       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
11088   bool V2IsZeroOrUndef =
11089       V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
11090
11091 BlendMask = 0;
11092 ForceV1Zero = false, ForceV2Zero = false;
11093 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
11094
11095 int NumElts = Mask.size();
11096 int NumLanes = VT.getSizeInBits() / 128;
11097 int NumEltsPerLane = NumElts / NumLanes;
11098 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
11099
11100 // For 32/64-bit elements, if we only reference one input (plus any undefs),
11101 // then ensure the blend mask part for that lane just references that input.
11102 bool ForceWholeLaneMasks =
11103 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
11104
11105 // Attempt to generate the binary blend mask. If an input is zero then
11106 // we can use any lane.
11107 for (int Lane = 0; Lane != NumLanes; ++Lane) {
11108 // Keep track of the inputs used per lane.
11109 bool LaneV1InUse = false;
11110 bool LaneV2InUse = false;
11111 uint64_t LaneBlendMask = 0;
11112 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
11113 int Elt = (Lane * NumEltsPerLane) + LaneElt;
11114 int M = Mask[Elt];
11115 if (M == SM_SentinelUndef)
11116 continue;
11117 if (M == Elt || (0 <= M && M < NumElts &&
11118 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
11119 Mask[Elt] = Elt;
11120 LaneV1InUse = true;
11121 continue;
11122 }
11123 if (M == (Elt + NumElts) ||
11124 (NumElts <= M &&
11125 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
11126 LaneBlendMask |= 1ull << LaneElt;
11127 Mask[Elt] = Elt + NumElts;
11128 LaneV2InUse = true;
11129 continue;
11130 }
11131 if (Zeroable[Elt]) {
11132 if (V1IsZeroOrUndef) {
11133 ForceV1Zero = true;
11134 Mask[Elt] = Elt;
11135 LaneV1InUse = true;
11136 continue;
11137 }
11138 if (V2IsZeroOrUndef) {
11139 ForceV2Zero = true;
11140 LaneBlendMask |= 1ull << LaneElt;
11141 Mask[Elt] = Elt + NumElts;
11142 LaneV2InUse = true;
11143 continue;
11144 }
11145 }
11146 return false;
11147 }
11148
11149 // If we only used V2 then splat the lane blend mask to avoid any demanded
11150 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
11151 // blend mask bit).
11152 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
11153 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
11154
11155 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
11156 }
11157 return true;
11158}
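// Worked example: the v4i32 mask {0, 5, 2, 7} is a blend with
// BlendMask = 0b1010 (a set bit means the element is taken from V2).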
11159
11160/// Try to emit a blend instruction for a shuffle.
11161///
11162/// This doesn't do any checks for the availability of instructions for blending
11163/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
11164/// be matched in the backend with the type given. What it does check for is
11165/// that the shuffle mask is a blend, or convertible into a blend with zero.
11166 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
11167                                    SDValue V2, ArrayRef<int> Original,
11168 const APInt &Zeroable,
11169 const X86Subtarget &Subtarget,
11170 SelectionDAG &DAG) {
11171 uint64_t BlendMask = 0;
11172 bool ForceV1Zero = false, ForceV2Zero = false;
11173 SmallVector<int, 64> Mask(Original);
11174 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
11175 BlendMask))
11176 return SDValue();
11177
11178 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
11179 if (ForceV1Zero)
11180 V1 = getZeroVector(VT, Subtarget, DAG, DL);
11181 if (ForceV2Zero)
11182 V2 = getZeroVector(VT, Subtarget, DAG, DL);
11183
11184 unsigned NumElts = VT.getVectorNumElements();
11185
11186 switch (VT.SimpleTy) {
11187 case MVT::v4i64:
11188 case MVT::v8i32:
11189 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
11190 [[fallthrough]];
11191 case MVT::v4f64:
11192 case MVT::v8f32:
11193 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
11194 [[fallthrough]];
11195 case MVT::v2f64:
11196 case MVT::v2i64:
11197 case MVT::v4f32:
11198 case MVT::v4i32:
11199 case MVT::v8i16:
11200 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
11201 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
11202 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11203 case MVT::v16i16: {
11204 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
11205 SmallVector<int, 8> RepeatedMask;
11206 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11207 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
11208 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
11209 BlendMask = 0;
11210 for (int i = 0; i < 8; ++i)
11211 if (RepeatedMask[i] >= 8)
11212 BlendMask |= 1ull << i;
11213 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11214 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
11215 }
11216 // Use PBLENDW for lower/upper lanes and then blend lanes.
11217 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
11218 // merge to VSELECT where useful.
11219 uint64_t LoMask = BlendMask & 0xFF;
11220 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
11221 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
11222 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11223 DAG.getTargetConstant(LoMask, DL, MVT::i8));
11224 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
11225 DAG.getTargetConstant(HiMask, DL, MVT::i8));
11226 return DAG.getVectorShuffle(
11227 MVT::v16i16, DL, Lo, Hi,
11228 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
11229 }
11230 [[fallthrough]];
11231 }
11232 case MVT::v32i8:
11233 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
11234 [[fallthrough]];
11235 case MVT::v16i8: {
11236 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
11237
11238 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
11239 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11240 Subtarget, DAG))
11241 return Masked;
11242
11243 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
11244 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11245 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11246 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11247 }
11248
11249 // If we have VPTERNLOG, we can use that as a bit blend.
11250 if (Subtarget.hasVLX())
11251 if (SDValue BitBlend =
11252 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11253 return BitBlend;
11254
11255 // Scale the blend by the number of bytes per element.
11256 int Scale = VT.getScalarSizeInBits() / 8;
11257
11258 // This form of blend is always done on bytes. Compute the byte vector
11259 // type.
11260 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11261
11262 // x86 allows load folding with blendvb from the 2nd source operand. But
11263 // we are still using LLVM select here (see comment below), so that's V1.
11264 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11265 // allow that load-folding possibility.
11266 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11267       ShuffleVectorSDNode::commuteMask(Mask);
11268       std::swap(V1, V2);
11269 }
11270
11271 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11272 // mix of LLVM's code generator and the x86 backend. We tell the code
11273 // generator that boolean values in the elements of an x86 vector register
11274 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11275 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11276 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11277 // of the element (the remaining are ignored) and 0 in that high bit would
11278 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11279 // the LLVM model for boolean values in vector elements gets the relevant
11280 // bit set, it is set backwards and over constrained relative to x86's
11281 // actual model.
11282 SmallVector<SDValue, 32> VSELECTMask;
11283 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11284 for (int j = 0; j < Scale; ++j)
11285 VSELECTMask.push_back(
11286 Mask[i] < 0
11287 ? DAG.getUNDEF(MVT::i8)
11288 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11289
11290 V1 = DAG.getBitcast(BlendVT, V1);
11291 V2 = DAG.getBitcast(BlendVT, V2);
11292 return DAG.getBitcast(
11293 VT,
11294 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11295 V1, V2));
11296 }
11297 case MVT::v16f32:
11298 case MVT::v8f64:
11299 case MVT::v8i64:
11300 case MVT::v16i32:
11301 case MVT::v32i16:
11302 case MVT::v64i8: {
11303 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11304 bool OptForSize = DAG.shouldOptForSize();
11305 if (!OptForSize) {
11306 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11307 Subtarget, DAG))
11308 return Masked;
11309 }
11310
11311 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11312 // masked move.
11313 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11314 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11315 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11316 }
11317 default:
11318 llvm_unreachable("Not a supported integer vector type!");
11319 }
11320}
11321
11322/// Try to lower as a blend of elements from two inputs followed by
11323/// a single-input permutation.
11324///
11325/// This matches the pattern where we can blend elements from two inputs and
11326/// then reduce the shuffle to a single-input permutation.
11327 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11328                                              SDValue V1, SDValue V2,
11329 ArrayRef<int> Mask,
11330 SelectionDAG &DAG,
11331 bool ImmBlends = false) {
11332 // We build up the blend mask while checking whether a blend is a viable way
11333 // to reduce the shuffle.
11334 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11335 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11336
11337 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11338 if (Mask[i] < 0)
11339 continue;
11340
11341 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11342
11343 if (BlendMask[Mask[i] % Size] < 0)
11344 BlendMask[Mask[i] % Size] = Mask[i];
11345 else if (BlendMask[Mask[i] % Size] != Mask[i])
11346 return SDValue(); // Can't blend in the needed input!
11347
11348 PermuteMask[i] = Mask[i] % Size;
11349 }
11350
11351 // If only immediate blends, then bail if the blend mask can't be widened to
11352 // i16.
11353 unsigned EltSize = VT.getScalarSizeInBits();
11354 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11355 return SDValue();
11356
11357 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11358 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11359}
11360
11361/// Try to lower as an unpack of elements from two inputs followed by
11362/// a single-input permutation.
11363///
11364/// This matches the pattern where we can unpack elements from two inputs and
11365/// then reduce the shuffle to a single-input (wider) permutation.
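/// For example, a v4i32 mask <1,4,0,5> is UNPCKL(V1,V2) = <0,4,1,5> followed
/// by the single-input permute <2,1,0,3>.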
11366 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11367 SDValue V1, SDValue V2,
11368 ArrayRef<int> Mask,
11369 SelectionDAG &DAG) {
11370 int NumElts = Mask.size();
11371 int NumLanes = VT.getSizeInBits() / 128;
11372 int NumLaneElts = NumElts / NumLanes;
11373 int NumHalfLaneElts = NumLaneElts / 2;
11374
11375 bool MatchLo = true, MatchHi = true;
11376 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11377
11378 // Determine UNPCKL/UNPCKH type and operand order.
11379 for (int Elt = 0; Elt != NumElts; ++Elt) {
11380 int M = Mask[Elt];
11381 if (M < 0)
11382 continue;
11383
11384 // Normalize the mask value depending on whether it's V1 or V2.
11385 int NormM = M;
11386 SDValue &Op = Ops[Elt & 1];
11387 if (M < NumElts && (Op.isUndef() || Op == V1))
11388 Op = V1;
11389 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11390 Op = V2;
11391 NormM -= NumElts;
11392 } else
11393 return SDValue();
11394
11395 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11396 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11397 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11398 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11399 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11400 if (MatchLoAnyLane || MatchHiAnyLane) {
11401 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11402 "Failed to match UNPCKLO/UNPCKHI");
11403 break;
11404 }
11405 }
11406 MatchLo &= MatchLoAnyLane;
11407 MatchHi &= MatchHiAnyLane;
11408 if (!MatchLo && !MatchHi)
11409 return SDValue();
11410 }
11411 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11412
11413 // Element indices have changed after unpacking. Calculate permute mask
11414 // so that they will be put back to the position as dictated by the
11415 // original shuffle mask indices.
11416 SmallVector<int, 32> PermuteMask(NumElts, -1);
11417 for (int Elt = 0; Elt != NumElts; ++Elt) {
11418 int M = Mask[Elt];
11419 if (M < 0)
11420 continue;
11421 int NormM = M;
11422 if (NumElts <= M)
11423 NormM -= NumElts;
11424 bool IsFirstOp = M < NumElts;
11425 int BaseMaskElt =
11426 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11427 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11428 PermuteMask[Elt] = BaseMaskElt;
11429 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11430 PermuteMask[Elt] = BaseMaskElt + 1;
11431 assert(PermuteMask[Elt] != -1 &&
11432 "Input mask element is defined but failed to assign permute mask");
11433 }
11434
11435 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11436 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11437 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11438}
11439
11440/// Try to lower a shuffle as a permute of the inputs followed by an
11441/// UNPCK instruction.
11442///
11443/// This specifically targets cases where we end up with alternating between
11444/// the two inputs, and so can permute them into something that feeds a single
11445/// UNPCK instruction. Note that this routine only targets integer vectors
11446/// because for floating point vectors we have a generalized SHUFPS lowering
11447/// strategy that handles everything that doesn't *exactly* match an unpack,
11448/// making this clever lowering unnecessary.
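/// For example, a v8i16 mask <0,8,2,10,4,12,6,14> alternates between the two
/// inputs; permuting each input's even elements into its low half lets a
/// single UNPCKL produce the result.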
11449 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11450 SDValue V1, SDValue V2,
11451 ArrayRef<int> Mask,
11452 const X86Subtarget &Subtarget,
11453 SelectionDAG &DAG) {
11454 int Size = Mask.size();
11455 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11456
11457 // This routine only supports 128-bit integer dual input vectors.
11458 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11459 return SDValue();
11460
11461 int NumLoInputs =
11462 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11463 int NumHiInputs =
11464 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11465
11466 bool UnpackLo = NumLoInputs >= NumHiInputs;
11467
11468 auto TryUnpack = [&](int ScalarSize, int Scale) {
11469 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11470 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11471
11472 for (int i = 0; i < Size; ++i) {
11473 if (Mask[i] < 0)
11474 continue;
11475
11476 // Each element of the unpack contains Scale elements from this mask.
11477 int UnpackIdx = i / Scale;
11478
11479 // We only handle the case where V1 feeds the first slots of the unpack.
11480 // We rely on canonicalization to ensure this is the case.
11481 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11482 return SDValue();
11483
11484 // Setup the mask for this input. The indexing is tricky as we have to
11485 // handle the unpack stride.
11486 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11487 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11488 Mask[i] % Size;
11489 }
11490
11491 // If we will have to shuffle both inputs to use the unpack, check whether
11492 // we can just unpack first and shuffle the result. If so, skip this unpack.
11493 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11494 !isNoopShuffleMask(V2Mask))
11495 return SDValue();
11496
11497 // Shuffle the inputs into place.
11498 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11499 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11500
11501 // Cast the inputs to the type we will use to unpack them.
11502 MVT UnpackVT =
11503 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11504 V1 = DAG.getBitcast(UnpackVT, V1);
11505 V2 = DAG.getBitcast(UnpackVT, V2);
11506
11507 // Unpack the inputs and cast the result back to the desired type.
11508 return DAG.getBitcast(
11509 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11510 UnpackVT, V1, V2));
11511 };
11512
11513 // We try each unpack from the largest to the smallest to try and find one
11514 // that fits this mask.
11515 int OrigScalarSize = VT.getScalarSizeInBits();
11516 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11517 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11518 return Unpack;
11519
11520 // If we're shuffling with a zero vector then we're better off not doing
11521 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11522 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11523 ISD::isBuildVectorAllZeros(V2.getNode()))
11524 return SDValue();
11525
11526 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11527 // initial unpack.
11528 if (NumLoInputs == 0 || NumHiInputs == 0) {
11529 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11530 "We have to have *some* inputs!");
11531 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11532
11533 // FIXME: We could consider the total complexity of the permute of each
11534 // possible unpacking. Or at the least we should consider how many
11535 // half-crossings are created.
11536 // FIXME: We could consider commuting the unpacks.
11537
11538 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11539 for (int i = 0; i < Size; ++i) {
11540 if (Mask[i] < 0)
11541 continue;
11542
11543 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11544
11545 PermMask[i] =
11546 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11547 }
11548 return DAG.getVectorShuffle(
11549 VT, DL,
11550 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11551 V1, V2),
11552 DAG.getUNDEF(VT), PermMask);
11553 }
11554
11555 return SDValue();
11556}
11557
11558/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11559/// permuting the elements of the result in place.
11560 static SDValue lowerShuffleAsByteRotateAndPermute(
11561 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11562 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11563 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11564 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11565 (VT.is512BitVector() && !Subtarget.hasBWI()))
11566 return SDValue();
11567
11568 // We don't currently support lane crossing permutes.
11569 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11570 return SDValue();
11571
11572 int Scale = VT.getScalarSizeInBits() / 8;
11573 int NumLanes = VT.getSizeInBits() / 128;
11574 int NumElts = VT.getVectorNumElements();
11575 int NumEltsPerLane = NumElts / NumLanes;
11576
11577 // Determine range of mask elts.
11578 bool Blend1 = true;
11579 bool Blend2 = true;
11580 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11581 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11582 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11583 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11584 int M = Mask[Lane + Elt];
11585 if (M < 0)
11586 continue;
11587 if (M < NumElts) {
11588 Blend1 &= (M == (Lane + Elt));
11589 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11590 M = M % NumEltsPerLane;
11591 Range1.first = std::min(Range1.first, M);
11592 Range1.second = std::max(Range1.second, M);
11593 } else {
11594 M -= NumElts;
11595 Blend2 &= (M == (Lane + Elt));
11596 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11597 M = M % NumEltsPerLane;
11598 Range2.first = std::min(Range2.first, M);
11599 Range2.second = std::max(Range2.second, M);
11600 }
11601 }
11602 }
11603
11604 // Bail if we don't need both elements.
11605 // TODO - it might be worth doing this for unary shuffles if the permute
11606 // can be widened.
11607 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11608 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11609 return SDValue();
11610
11611 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11612 return SDValue();
11613
11614 // Rotate the 2 ops so we can access both ranges, then permute the result.
11615 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11616 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11617 SDValue Rotate = DAG.getBitcast(
11618 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11619 DAG.getBitcast(ByteVT, Lo),
11620 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11621 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11622 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11623 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11624 int M = Mask[Lane + Elt];
11625 if (M < 0)
11626 continue;
11627 if (M < NumElts)
11628 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11629 else
11630 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11631 }
11632 }
11633 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11634 };
11635
11636 // Check if the ranges are small enough to rotate from either direction.
11637 if (Range2.second < Range1.first)
11638 return RotateAndPermute(V1, V2, Range1.first, 0);
11639 if (Range1.second < Range2.first)
11640 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11641 return SDValue();
11642}
11643
11644 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11645 return isUndefOrEqual(Mask, 0);
11646}
11647
11648 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11649 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11650}
11651
11652/// Check if the Mask consists of the same element repeated multiple times.
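/// For example, <5,5,-1,5> qualifies, but <5,-1,-1,-1> does not because too
/// many elements are undef.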
11653 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11654 size_t NumUndefs = 0;
11655 std::optional<int> UniqueElt;
11656 for (int Elt : Mask) {
11657 if (Elt == SM_SentinelUndef) {
11658 NumUndefs++;
11659 continue;
11660 }
11661 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11662 return false;
11663 UniqueElt = Elt;
11664 }
11665 // Make sure the element is repeated enough times by checking the number of
11666 // undefs is small.
11667 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11668}
11669
11670/// Generic routine to decompose a shuffle and blend into independent
11671/// blends and permutes.
11672///
11673/// This matches the extremely common pattern for handling combined
11674/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11675/// operations. It will try to pick the best arrangement of shuffles and
11676/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
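/// For example, the generic decomposition of a v4i32 mask <3,6,1,4> is the
/// single-input shuffles <3,-1,1,-1> on V1 and <-1,2,-1,0> on V2, blended with
/// the final mask <0,5,2,7>.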
11677 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11678 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11679 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11680 int NumElts = Mask.size();
11681 int NumLanes = VT.getSizeInBits() / 128;
11682 int NumEltsPerLane = NumElts / NumLanes;
11683
11684 // Shuffle the input elements into the desired positions in V1 and V2 and
11685 // unpack/blend them together.
11686 bool IsAlternating = true;
11687 bool V1Zero = true, V2Zero = true;
11688 SmallVector<int, 32> V1Mask(NumElts, -1);
11689 SmallVector<int, 32> V2Mask(NumElts, -1);
11690 SmallVector<int, 32> FinalMask(NumElts, -1);
11691 for (int i = 0; i < NumElts; ++i) {
11692 int M = Mask[i];
11693 if (M >= 0 && M < NumElts) {
11694 V1Mask[i] = M;
11695 FinalMask[i] = i;
11696 V1Zero &= Zeroable[i];
11697 IsAlternating &= (i & 1) == 0;
11698 } else if (M >= NumElts) {
11699 V2Mask[i] = M - NumElts;
11700 FinalMask[i] = i + NumElts;
11701 V2Zero &= Zeroable[i];
11702 IsAlternating &= (i & 1) == 1;
11703 }
11704 }
11705
11706 // If we effectively only demand the 0'th element of \p Input, though not
11707 // necessarily as the 0'th element of the result, then broadcast said input
11708 // and change \p InputMask to be a no-op (identity) mask.
11709 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11710 &DAG](SDValue &Input,
11711 MutableArrayRef<int> InputMask) {
11712 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11713 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11714 !X86::mayFoldLoad(Input, Subtarget)))
11715 return;
11716 if (isNoopShuffleMask(InputMask))
11717 return;
11718 assert(isBroadcastShuffleMask(InputMask) &&
11719 "Expected to demand only the 0'th element.");
11720 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11721 for (auto I : enumerate(InputMask)) {
11722 int &InputMaskElt = I.value();
11723 if (InputMaskElt >= 0)
11724 InputMaskElt = I.index();
11725 }
11726 };
11727
11728 // Currently, we may need to produce one shuffle per input, and blend results.
11729 // It is possible that the shuffle for one of the inputs is already a no-op.
11730 // See if we can simplify non-no-op shuffles into broadcasts,
11731 // which we consider to be strictly better than an arbitrary shuffle.
11732 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11733 isNoopOrBroadcastShuffleMask(V2Mask)) {
11734 canonicalizeBroadcastableInput(V1, V1Mask);
11735 canonicalizeBroadcastableInput(V2, V2Mask);
11736 }
11737
11738 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11739 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11740 // the shuffle may be able to fold with a load or other benefit. However, when
11741 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11742 // pre-shuffle first is a better strategy.
11743 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11744 // If we don't have blends, see if we can create a cheap unpack.
11745 if (!Subtarget.hasSSE41() && VT.is128BitVector() &&
11746 (is128BitUnpackShuffleMask(V1Mask, DAG) ||
11747 is128BitUnpackShuffleMask(V2Mask, DAG)))
11748 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11749 DL, VT, V1, V2, Mask, Subtarget, DAG))
11750 return PermUnpack;
11751
11752 // Only prefer immediate blends to unpack/rotate.
11753 if (SDValue BlendPerm =
11754 lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
11755 return BlendPerm;
11756
11757 // If either input vector provides only a single element which is repeated
11758 // multiple times, unpacking from both input vectors would generate worse
11759 // code. e.g. for
11760 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11761 // it is better to process t4 first to create a vector of t4[0], then unpack
11762 // that vector with t2.
11763 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11764 !isSingleElementRepeatedMask(V2Mask))
11765 if (SDValue UnpackPerm =
11766 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11767 return UnpackPerm;
11768
11769 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11770 DL, VT, V1, V2, Mask, Subtarget, DAG))
11771 return RotatePerm;
11772
11773 // Unpack/rotate failed - try again with variable blends.
11774 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11775 DAG))
11776 return BlendPerm;
11777
11778 if (VT.getScalarSizeInBits() >= 32)
11779 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11780 DL, VT, V1, V2, Mask, Subtarget, DAG))
11781 return PermUnpack;
11782 }
11783
11784 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11785 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11786 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11787 // than half the elements coming from each source.
11788 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11789 V1Mask.assign(NumElts, -1);
11790 V2Mask.assign(NumElts, -1);
11791 FinalMask.assign(NumElts, -1);
11792 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11793 for (int j = 0; j != NumEltsPerLane; ++j) {
11794 int M = Mask[i + j];
11795 if (M >= 0 && M < NumElts) {
11796 V1Mask[i + (j / 2)] = M;
11797 FinalMask[i + j] = i + (j / 2);
11798 } else if (M >= NumElts) {
11799 V2Mask[i + (j / 2)] = M - NumElts;
11800 FinalMask[i + j] = i + (j / 2) + NumElts;
11801 }
11802 }
11803 }
11804
11805 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11806 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11807 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11808}
11809
11810static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11811 const X86Subtarget &Subtarget,
11812 ArrayRef<int> Mask) {
11813 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11814 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11815
11816 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11817 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11818 int MaxSubElts = 64 / EltSizeInBits;
11819 unsigned RotateAmt, NumSubElts;
11820 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11821 MaxSubElts, NumSubElts, RotateAmt))
11822 return -1;
11823 unsigned NumElts = Mask.size();
11824 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11825 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11826 return RotateAmt;
11827}
11828
11829/// Lower shuffle using X86ISD::VROTLI rotations.
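/// For example, a v16i8 mask <3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14>
/// rotates every i32 group left by 8 bits and can be emitted as a VROTLI node
/// (or the SHL/SRL/OR expansion below on targets without bit rotates).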
11830 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11831 ArrayRef<int> Mask,
11832 const X86Subtarget &Subtarget,
11833 SelectionDAG &DAG) {
11834 // Only XOP + AVX512 targets have bit rotation instructions.
11835 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11836 bool IsLegal =
11837 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11838 if (!IsLegal && Subtarget.hasSSE3())
11839 return SDValue();
11840
11841 MVT RotateVT;
11842 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11843 Subtarget, Mask);
11844 if (RotateAmt < 0)
11845 return SDValue();
11846
11847 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11848 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11849 // widen to vXi16 or more then the existing lowering will be better.
11850 if (!IsLegal) {
11851 if ((RotateAmt % 16) == 0)
11852 return SDValue();
11853 // TODO: Use getTargetVShiftByConstNode.
11854 unsigned ShlAmt = RotateAmt;
11855 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11856 V1 = DAG.getBitcast(RotateVT, V1);
11857 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11858 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11859 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11860 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11861 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11862 return DAG.getBitcast(VT, Rot);
11863 }
11864
11865 SDValue Rot =
11866 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11867 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11868 return DAG.getBitcast(VT, Rot);
11869}
11870
11871/// Try to match a vector shuffle as an element rotation.
11872///
11873/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11874 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11875 ArrayRef<int> Mask) {
11876 int NumElts = Mask.size();
11877
11878 // We need to detect various ways of spelling a rotation:
11879 // [11, 12, 13, 14, 15, 0, 1, 2]
11880 // [-1, 12, 13, 14, -1, -1, 1, -1]
11881 // [-1, -1, -1, -1, -1, -1, 1, 2]
11882 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11883 // [-1, 4, 5, 6, -1, -1, 9, -1]
11884 // [-1, 4, 5, 6, -1, -1, -1, -1]
11885 int Rotation = 0;
11886 SDValue Lo, Hi;
11887 for (int i = 0; i < NumElts; ++i) {
11888 int M = Mask[i];
11889 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11890 "Unexpected mask index.");
11891 if (M < 0)
11892 continue;
11893
11894 // Determine where a rotated vector would have started.
11895 int StartIdx = i - (M % NumElts);
11896 if (StartIdx == 0)
11897 // The identity rotation isn't interesting, stop.
11898 return -1;
11899
11900 // If we found the tail of a vector the rotation must be the missing
11901 // front. If we found the head of a vector, it must be how much of the
11902 // head.
11903 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11904
11905 if (Rotation == 0)
11906 Rotation = CandidateRotation;
11907 else if (Rotation != CandidateRotation)
11908 // The rotations don't match, so we can't match this mask.
11909 return -1;
11910
11911 // Compute which value this mask is pointing at.
11912 SDValue MaskV = M < NumElts ? V1 : V2;
11913
11914 // Compute which of the two target values this index should be assigned
11915 // to. This reflects whether the high elements are remaining or the low
11916 // elements are remaining.
11917 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11918
11919 // Either set up this value if we've not encountered it before, or check
11920 // that it remains consistent.
11921 if (!TargetV)
11922 TargetV = MaskV;
11923 else if (TargetV != MaskV)
11924 // This may be a rotation, but it pulls from the inputs in some
11925 // unsupported interleaving.
11926 return -1;
11927 }
11928
11929 // Check that we successfully analyzed the mask, and normalize the results.
11930 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11931 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11932 if (!Lo)
11933 Lo = Hi;
11934 else if (!Hi)
11935 Hi = Lo;
11936
11937 V1 = Lo;
11938 V2 = Hi;
11939
11940 return Rotation;
11941}
11942
11943/// Try to lower a vector shuffle as a byte rotation.
11944///
11945/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11946/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11947/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11948 /// try to generically lower a vector shuffle through such a pattern. It
11949/// does not check for the profitability of lowering either as PALIGNR or
11950/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11951/// This matches shuffle vectors that look like:
11952///
11953/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11954///
11955/// Essentially it concatenates V1 and V2, shifts right by some number of
11956/// elements, and takes the low elements as the result. Note that while this is
11957/// specified as a *right shift* because x86 is little-endian, it is a *left
11958/// rotate* of the vector lanes.
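/// For the v8i16 example above the element rotation is 3, which scales to a
/// 6-byte PALIGNR immediate (3 elements * 2 bytes per element).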
11959 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11960 ArrayRef<int> Mask) {
11961 // Don't accept any shuffles with zero elements.
11962 if (isAnyZero(Mask))
11963 return -1;
11964
11965 // PALIGNR works on 128-bit lanes.
11966 SmallVector<int, 16> RepeatedMask;
11967 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11968 return -1;
11969
11970 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11971 if (Rotation <= 0)
11972 return -1;
11973
11974 // PALIGNR rotates bytes, so we need to scale the element rotation by the
11975 // number of bytes per element in the 128-bit lane.
11976 int NumElts = RepeatedMask.size();
11977 int Scale = 16 / NumElts;
11978 return Rotation * Scale;
11979}
11980
11981 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11982 SDValue V2, ArrayRef<int> Mask,
11983 const X86Subtarget &Subtarget,
11984 SelectionDAG &DAG) {
11985 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11986
11987 SDValue Lo = V1, Hi = V2;
11988 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11989 if (ByteRotation <= 0)
11990 return SDValue();
11991
11992 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11993 // PSLLDQ/PSRLDQ.
11994 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11995 Lo = DAG.getBitcast(ByteVT, Lo);
11996 Hi = DAG.getBitcast(ByteVT, Hi);
11997
11998 // SSSE3 targets can use the palignr instruction.
11999 if (Subtarget.hasSSSE3()) {
12000 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
12001 "512-bit PALIGNR requires BWI instructions");
12002 return DAG.getBitcast(
12003 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
12004 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
12005 }
12006
12007 assert(VT.is128BitVector() &&
12008 "Rotate-based lowering only supports 128-bit lowering!");
12009 assert(Mask.size() <= 16 &&
12010 "Can shuffle at most 16 bytes in a 128-bit vector!");
12011 assert(ByteVT == MVT::v16i8 &&
12012 "SSE2 rotate lowering only needed for v16i8!");
12013
12014 // Default SSE2 implementation
12015 int LoByteShift = 16 - ByteRotation;
12016 int HiByteShift = ByteRotation;
12017
12018 SDValue LoShift =
12019 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
12020 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
12021 SDValue HiShift =
12022 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
12023 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
12024 return DAG.getBitcast(VT,
12025 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
12026}
12027
12028/// Try to lower a vector shuffle as a dword/qword rotation.
12029///
12030 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
12031 /// rotation of the concatenation of two vectors; this routine will
12032 /// try to generically lower a vector shuffle through such a pattern.
12033///
12034/// Essentially it concatenates V1 and V2, shifts right by some number of
12035/// elements, and takes the low elements as the result. Note that while this is
12036/// specified as a *right shift* because x86 is little-endian, it is a *left
12037/// rotate* of the vector lanes.
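/// For example, a v8i64 mask <3,4,5,6,7,8,9,10> takes elements 3..10 of the
/// concatenation (V2 high, V1 low) and is matched as an X86ISD::VALIGN node
/// with an immediate of 3.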
12038 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
12039 SDValue V2, ArrayRef<int> Mask,
12040 const APInt &Zeroable,
12041 const X86Subtarget &Subtarget,
12042 SelectionDAG &DAG) {
12043 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
12044 "Only 32-bit and 64-bit elements are supported!");
12045
12046 // 128/256-bit vectors are only supported with VLX.
12047 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
12048 && "VLX required for 128/256-bit vectors");
12049
12050 SDValue Lo = V1, Hi = V2;
12051 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
12052 if (0 < Rotation)
12053 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
12054 DAG.getTargetConstant(Rotation, DL, MVT::i8));
12055
12056 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
12057 // TODO: Pull this out as a matchShuffleAsElementShift helper?
12058 // TODO: We can probably make this more aggressive and use shift-pairs like
12059 // lowerShuffleAsByteShiftMask.
12060 unsigned NumElts = Mask.size();
12061 unsigned ZeroLo = Zeroable.countr_one();
12062 unsigned ZeroHi = Zeroable.countl_one();
12063 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
12064 if (!ZeroLo && !ZeroHi)
12065 return SDValue();
12066
12067 if (ZeroLo) {
12068 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12069 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
12070 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
12071 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
12072 getZeroVector(VT, Subtarget, DAG, DL),
12073 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
12074 }
12075
12076 if (ZeroHi) {
12077 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
12078 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
12079 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
12080 return DAG.getNode(X86ISD::VALIGN, DL, VT,
12081 getZeroVector(VT, Subtarget, DAG, DL), Src,
12082 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
12083 }
12084
12085 return SDValue();
12086}
12087
12088/// Try to lower a vector shuffle as a byte shift sequence.
12089 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
12090 SDValue V2, ArrayRef<int> Mask,
12091 const APInt &Zeroable,
12092 const X86Subtarget &Subtarget,
12093 SelectionDAG &DAG) {
12094 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
12095 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
12096
12097 // We need a shuffle that has zeros at one/both ends and a sequential
12098 // shuffle from one source within.
12099 unsigned ZeroLo = Zeroable.countr_one();
12100 unsigned ZeroHi = Zeroable.countl_one();
12101 if (!ZeroLo && !ZeroHi)
12102 return SDValue();
12103
12104 unsigned NumElts = Mask.size();
12105 unsigned Len = NumElts - (ZeroLo + ZeroHi);
12106 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
12107 return SDValue();
12108
12109 unsigned Scale = VT.getScalarSizeInBits() / 8;
12110 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
12111 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
12112 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
12113 return SDValue();
12114
12115 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
12116 Res = DAG.getBitcast(MVT::v16i8, Res);
12117
12118 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
12119 // inner sequential set of elements, possibly offset:
12120 // 01234567 --> zzzzzz01 --> 1zzzzzzz
12121 // 01234567 --> 4567zzzz --> zzzzz456
12122 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
12123 if (ZeroLo == 0) {
12124 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12125 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12126 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12127 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12128 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
12129 } else if (ZeroHi == 0) {
12130 unsigned Shift = Mask[ZeroLo] % NumElts;
12131 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12132 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12133 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12134 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12135 } else if (!Subtarget.hasSSSE3()) {
12136 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
12137 // by performing 3 byte shifts. Shuffle combining can kick in above that.
12138 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
12139 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
12140 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12141 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12142 Shift += Mask[ZeroLo] % NumElts;
12143 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
12144 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
12145 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
12146 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
12147 } else
12148 return SDValue();
12149
12150 return DAG.getBitcast(VT, Res);
12151}
12152
12153/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
12154///
12155/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
12156/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
12157/// matches elements from one of the input vectors shuffled to the left or
12158/// right with zeroable elements 'shifted in'. It handles both the strictly
12159/// bit-wise element shifts and the byte shift across an entire 128-bit double
12160/// quad word lane.
12161///
12162/// PSHL : (little-endian) left bit shift.
12163/// [ zz, 0, zz, 2 ]
12164/// [ -1, 4, zz, -1 ]
12165/// PSRL : (little-endian) right bit shift.
12166/// [ 1, zz, 3, zz]
12167/// [ -1, -1, 7, zz]
12168/// PSLLDQ : (little-endian) left byte shift
12169/// [ zz, 0, 1, 2, 3, 4, 5, 6]
12170/// [ zz, zz, -1, -1, 2, 3, 4, -1]
12171/// [ zz, zz, zz, zz, zz, zz, -1, 1]
12172/// PSRLDQ : (little-endian) right byte shift
12173/// [ 5, 6, 7, zz, zz, zz, zz, zz]
12174/// [ -1, 5, 6, 7, zz, zz, zz, zz]
12175/// [ 1, 2, -1, -1, -1, -1, zz, zz]
12176static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
12177 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
12178 int MaskOffset, const APInt &Zeroable,
12179 const X86Subtarget &Subtarget) {
12180 int Size = Mask.size();
12181 unsigned SizeInBits = Size * ScalarSizeInBits;
12182
12183 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
12184 for (int i = 0; i < Size; i += Scale)
12185 for (int j = 0; j < Shift; ++j)
12186 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
12187 return false;
12188
12189 return true;
12190 };
12191
12192 auto MatchShift = [&](int Shift, int Scale, bool Left) {
12193 for (int i = 0; i != Size; i += Scale) {
12194 unsigned Pos = Left ? i + Shift : i;
12195 unsigned Low = Left ? i : i + Shift;
12196 unsigned Len = Scale - Shift;
12197 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
12198 return -1;
12199 }
12200
12201 int ShiftEltBits = ScalarSizeInBits * Scale;
12202 bool ByteShift = ShiftEltBits > 64;
12203 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
12204 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
12205 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
12206
12207 // Normalize the scale for byte shifts to still produce an i64 element
12208 // type.
12209 Scale = ByteShift ? Scale / 2 : Scale;
12210
12211 // We need to round trip through the appropriate type for the shift.
12212 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
12213 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
12214 : MVT::getVectorVT(ShiftSVT, Size / Scale);
12215 return (int)ShiftAmt;
12216 };
12217
12218 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
12219 // keep doubling the size of the integer elements up to that. We can
12220 // then shift the elements of the integer vector by whole multiples of
12221 // their width within the elements of the larger integer vector. Test each
12222 // multiple to see if we can find a match with the moved element indices
12223 // and that the shifted in elements are all zeroable.
12224 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
12225 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
12226 for (int Shift = 1; Shift != Scale; ++Shift)
12227 for (bool Left : {true, false})
12228 if (CheckZeros(Shift, Scale, Left)) {
12229 int ShiftAmt = MatchShift(Shift, Scale, Left);
12230 if (0 < ShiftAmt)
12231 return ShiftAmt;
12232 }
12233
12234 // no match
12235 return -1;
12236}
12237
12238 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
12239 SDValue V2, ArrayRef<int> Mask,
12240 const APInt &Zeroable,
12241 const X86Subtarget &Subtarget,
12242 SelectionDAG &DAG, bool BitwiseOnly) {
12243 int Size = Mask.size();
12244 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12245
12246 MVT ShiftVT;
12247 SDValue V = V1;
12248 unsigned Opcode;
12249
12250 // Try to match shuffle against V1 shift.
12251 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12252 Mask, 0, Zeroable, Subtarget);
12253
12254 // If V1 failed, try to match shuffle against V2 shift.
12255 if (ShiftAmt < 0) {
12256 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
12257 Mask, Size, Zeroable, Subtarget);
12258 V = V2;
12259 }
12260
12261 if (ShiftAmt < 0)
12262 return SDValue();
12263
12264 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
12265 return SDValue();
12266
12267 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
12268 "Illegal integer vector type");
12269 V = DAG.getBitcast(ShiftVT, V);
12270 V = DAG.getNode(Opcode, DL, ShiftVT, V,
12271 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
12272 return DAG.getBitcast(VT, V);
12273}
12274
12275// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12276// Remainder of lower half result is zero and upper half is all undef.
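// For example, a v8i16 shuffle whose low half is <2,3,zz,zz> with an undef
// upper half extracts Len=2 elements starting at Idx=2, i.e. BitLen=32 and
// BitIdx=32.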
12277static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12278 ArrayRef<int> Mask, uint64_t &BitLen,
12279 uint64_t &BitIdx, const APInt &Zeroable) {
12280 int Size = Mask.size();
12281 int HalfSize = Size / 2;
12282 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12283 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12284
12285 // Upper half must be undefined.
12286 if (!isUndefUpperHalf(Mask))
12287 return false;
12288
12289 // Determine the extraction length from the part of the
12290 // lower half that isn't zeroable.
12291 int Len = HalfSize;
12292 for (; Len > 0; --Len)
12293 if (!Zeroable[Len - 1])
12294 break;
12295 assert(Len > 0 && "Zeroable shuffle mask");
12296
12297 // Attempt to match first Len sequential elements from the lower half.
12298 SDValue Src;
12299 int Idx = -1;
12300 for (int i = 0; i != Len; ++i) {
12301 int M = Mask[i];
12302 if (M == SM_SentinelUndef)
12303 continue;
12304 SDValue &V = (M < Size ? V1 : V2);
12305 M = M % Size;
12306
12307 // The extracted elements must start at a valid index and all mask
12308 // elements must be in the lower half.
12309 if (i > M || M >= HalfSize)
12310 return false;
12311
12312 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12313 Src = V;
12314 Idx = M - i;
12315 continue;
12316 }
12317 return false;
12318 }
12319
12320 if (!Src || Idx < 0)
12321 return false;
12322
12323 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12324 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12325 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12326 V1 = Src;
12327 return true;
12328}
12329
12330// INSERTQ: Extract lowest Len elements from lower half of second source and
12331// insert over first source, starting at Idx.
12332// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12333static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12334 ArrayRef<int> Mask, uint64_t &BitLen,
12335 uint64_t &BitIdx) {
12336 int Size = Mask.size();
12337 int HalfSize = Size / 2;
12338 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12339
12340 // Upper half must be undefined.
12341 if (!isUndefUpperHalf(Mask))
12342 return false;
12343
12344 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12345 SDValue Base;
12346
12347 // Attempt to match first source from mask before insertion point.
12348 if (isUndefInRange(Mask, 0, Idx)) {
12349 /* EMPTY */
12350 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12351 Base = V1;
12352 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12353 Base = V2;
12354 } else {
12355 continue;
12356 }
12357
12358 // Extend the extraction length looking to match both the insertion of
12359 // the second source and the remaining elements of the first.
12360 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12361 SDValue Insert;
12362 int Len = Hi - Idx;
12363
12364 // Match insertion.
12365 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12366 Insert = V1;
12367 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12368 Insert = V2;
12369 } else {
12370 continue;
12371 }
12372
12373 // Match the remaining elements of the lower half.
12374 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12375 /* EMPTY */
12376 } else if ((!Base || (Base == V1)) &&
12377 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12378 Base = V1;
12379 } else if ((!Base || (Base == V2)) &&
12380 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12381 Size + Hi)) {
12382 Base = V2;
12383 } else {
12384 continue;
12385 }
12386
12387 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12388 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12389 V1 = Base;
12390 V2 = Insert;
12391 return true;
12392 }
12393 }
12394
12395 return false;
12396}
12397
12398/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12399 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12400 SDValue V2, ArrayRef<int> Mask,
12401 const APInt &Zeroable, SelectionDAG &DAG) {
12402 uint64_t BitLen, BitIdx;
12403 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12404 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12405 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12406 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12407
12408 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12409 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12410 V2 ? V2 : DAG.getUNDEF(VT),
12411 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12412 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12413
12414 return SDValue();
12415}
12416
12417/// Lower a vector shuffle as an any/signed/zero extension.
12418///
12419/// Given a specific number of elements, element bit width, and extension
12420 /// stride, produce an extension based on the available
12421 /// features of the subtarget. The extended elements are consecutive and
12422 /// can start from an offset element index in the input; to
12423 /// avoid excess shuffling the offset must either be in the bottom lane
12424 /// or at the start of a higher lane. All extended elements must be from
12425 /// the same lane.
12426 static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
12427 int Scale, int Offset,
12428 unsigned ExtOpc, SDValue InputV,
12429 ArrayRef<int> Mask,
12430 const X86Subtarget &Subtarget,
12431 SelectionDAG &DAG) {
12432 assert(Scale > 1 && "Need a scale to extend.");
12433 assert(ISD::isExtOpcode(ExtOpc) && "Unsupported extension");
12434 int EltBits = VT.getScalarSizeInBits();
12435 int NumElements = VT.getVectorNumElements();
12436 int NumEltsPerLane = 128 / EltBits;
12437 int OffsetLane = Offset / NumEltsPerLane;
12438 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12439 "Only 8, 16, and 32 bit elements can be extended.");
12440 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12441 assert(0 <= Offset && "Extension offset must be positive.");
12442 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12443 "Extension offset must be in the first lane or start an upper lane.");
12444
12445 // Check that an index is in same lane as the base offset.
12446 auto SafeOffset = [&](int Idx) {
12447 return OffsetLane == (Idx / NumEltsPerLane);
12448 };
12449
12450 // Shift along an input so that the offset base moves to the first element.
12451 auto ShuffleOffset = [&](SDValue V) {
12452 if (!Offset)
12453 return V;
12454
12455 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12456 for (int i = 0; i * Scale < NumElements; ++i) {
12457 int SrcIdx = i + Offset;
12458 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12459 }
12460 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12461 };
12462
12463 // Found a valid a/zext mask! Try various lowering strategies based on the
12464 // input type and available ISA extensions.
12465 if (Subtarget.hasSSE41()) {
12466 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12467 // PUNPCK will catch this in a later shuffle match.
12468 if (Offset && Scale == 2 && VT.is128BitVector())
12469 return SDValue();
12470 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12471 NumElements / Scale);
12472 InputV = DAG.getBitcast(VT, InputV);
12473 InputV = ShuffleOffset(InputV);
12474 InputV = getEXTEND_VECTOR_INREG(ExtOpc, DL, ExtVT, InputV, DAG);
12475 return DAG.getBitcast(VT, InputV);
12476 }
12477
12478 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12479 InputV = DAG.getBitcast(VT, InputV);
12480 bool AnyExt = ExtOpc == ISD::ANY_EXTEND;
12481
12482 // TODO: Add pre-SSE41 SIGN_EXTEND_VECTOR_INREG handling.
12483 if (ExtOpc == ISD::SIGN_EXTEND)
12484 return SDValue();
12485
12486 // For any extends we can cheat for larger element sizes and use shuffle
12487 // instructions that can fold with a load and/or copy.
12488 if (AnyExt && EltBits == 32) {
12489 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12490 -1};
12491 return DAG.getBitcast(
12492 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12493 DAG.getBitcast(MVT::v4i32, InputV),
12494 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12495 }
12496 if (AnyExt && EltBits == 16 && Scale > 2) {
12497 int PSHUFDMask[4] = {Offset / 2, -1,
12498 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12499 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12500 DAG.getBitcast(MVT::v4i32, InputV),
12501 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12502 int PSHUFWMask[4] = {1, -1, -1, -1};
12503 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12504 return DAG.getBitcast(
12505 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12506 DAG.getBitcast(MVT::v8i16, InputV),
12507 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12508 }
12509
12510 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12511 // to 64-bits.
12512 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12513 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12514 assert(VT.is128BitVector() && "Unexpected vector width!");
12515
12516 int LoIdx = Offset * EltBits;
12517 SDValue Lo = DAG.getBitcast(
12518 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12519 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12520 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12521
12522 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12523 return DAG.getBitcast(VT, Lo);
12524
12525 int HiIdx = (Offset + 1) * EltBits;
12526 SDValue Hi = DAG.getBitcast(
12527 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12528 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12529 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12530 return DAG.getBitcast(VT,
12531 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12532 }
12533
12534 // If this would require more than 2 unpack instructions to expand, use
12535 // pshufb when available. We can only use more than 2 unpack instructions
12536 // when zero extending i8 elements which also makes it easier to use pshufb.
12537 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12538 assert(NumElements == 16 && "Unexpected byte vector width!");
12539 SDValue PSHUFBMask[16];
12540 for (int i = 0; i < 16; ++i) {
12541 int Idx = Offset + (i / Scale);
12542 if ((i % Scale == 0 && SafeOffset(Idx))) {
12543 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12544 continue;
12545 }
12546 PSHUFBMask[i] =
12547 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12548 }
12549 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12550 return DAG.getBitcast(
12551 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12552 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12553 }
12554
12555 // If we are extending from an offset, ensure we start on a boundary that
12556 // we can unpack from.
12557 int AlignToUnpack = Offset % (NumElements / Scale);
12558 if (AlignToUnpack) {
12559 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12560 for (int i = AlignToUnpack; i < NumElements; ++i)
12561 ShMask[i - AlignToUnpack] = i;
12562 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12563 Offset -= AlignToUnpack;
12564 }
12565
12566 // Otherwise emit a sequence of unpacks.
12567 do {
12568 unsigned UnpackLoHi = X86ISD::UNPCKL;
12569 if (Offset >= (NumElements / 2)) {
12570 UnpackLoHi = X86ISD::UNPCKH;
12571 Offset -= (NumElements / 2);
12572 }
12573
12574 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12575 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12576 : getZeroVector(InputVT, Subtarget, DAG, DL);
12577 InputV = DAG.getBitcast(InputVT, InputV);
12578 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12579 Scale /= 2;
12580 EltBits *= 2;
12581 NumElements /= 2;
12582 } while (Scale > 1);
12583 return DAG.getBitcast(VT, InputV);
12584}
12585
12586/// Try to lower a vector shuffle as a zero extension on any microarch.
12587///
12588/// This routine will try to do everything in its power to cleverly lower
12589/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12590 /// check for the profitability of this lowering; it tries to aggressively
12591/// match this pattern. It will use all of the micro-architectural details it
12592/// can to emit an efficient lowering. It handles both blends with all-zero
12593/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12594/// masking out later).
12595///
12596/// The reason we have dedicated lowering for zext-style shuffles is that they
12597/// are both incredibly common and often quite performance sensitive.
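/// For example, a v16i8 mask <0,zz,zz,zz,1,zz,zz,zz,2,zz,zz,zz,3,zz,zz,zz>
/// zero-extends the low four bytes of V1 to i32 elements (Scale == 4), e.g.
/// PMOVZXBD on SSE4.1 targets.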
12598 static SDValue lowerShuffleAsZeroOrAnyExtend(
12599 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12600 const APInt &Zeroable, const X86Subtarget &Subtarget,
12601 SelectionDAG &DAG) {
12602 int Bits = VT.getSizeInBits();
12603 int NumLanes = Bits / 128;
12604 int NumElements = VT.getVectorNumElements();
12605 int NumEltsPerLane = NumElements / NumLanes;
12606 assert(VT.getScalarSizeInBits() <= 32 &&
12607 "Exceeds 32-bit integer zero extension limit");
12608 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12609
12610 // Define a helper function to check a particular ext-scale and lower to it if
12611 // valid.
12612 auto Lower = [&](int Scale) -> SDValue {
12613 SDValue InputV;
12614 bool AnyExt = true;
12615 int Offset = 0;
12616 int Matches = 0;
12617 for (int i = 0; i < NumElements; ++i) {
12618 int M = Mask[i];
12619 if (M < 0)
12620 continue; // Valid anywhere but doesn't tell us anything.
12621 if (i % Scale != 0) {
12622 // Each of the extended elements need to be zeroable.
12623 if (!Zeroable[i])
12624 return SDValue();
12625
12626 // We no longer are in the anyext case.
12627 AnyExt = false;
12628 continue;
12629 }
12630
12631 // Each of the base elements needs to be consecutive indices into the
12632 // same input vector.
12633 SDValue V = M < NumElements ? V1 : V2;
12634 M = M % NumElements;
12635 if (!InputV) {
12636 InputV = V;
12637 Offset = M - (i / Scale);
12638 } else if (InputV != V)
12639 return SDValue(); // Flip-flopping inputs.
12640
12641 // Offset must start in the lowest 128-bit lane or at the start of an
12642 // upper lane.
12643 // FIXME: Is it ever worth allowing a negative base offset?
12644 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12645 (Offset % NumEltsPerLane) == 0))
12646 return SDValue();
12647
12648 // If we are offsetting, all referenced entries must come from the same
12649 // lane.
12650 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12651 return SDValue();
12652
12653 if ((M % NumElements) != (Offset + (i / Scale)))
12654 return SDValue(); // Non-consecutive strided elements.
12655 Matches++;
12656 }
12657
12658 // If we fail to find an input, we have a zero-shuffle which should always
12659 // have already been handled.
12660 // FIXME: Maybe handle this here in case during blending we end up with one?
12661 if (!InputV)
12662 return SDValue();
12663
12664 // If we are offsetting, don't extend if we only match a single input, we
12665 // can always do better by using a basic PSHUF or PUNPCK.
12666 if (Offset != 0 && Matches < 2)
12667 return SDValue();
12668
12669 unsigned ExtOpc = AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND;
12670 return lowerShuffleAsSpecificExtension(DL, VT, Scale, Offset, ExtOpc,
12671 InputV, Mask, Subtarget, DAG);
12672 };
12673
12674 // The widest scale possible for extending is to a 64-bit integer.
12675 assert(Bits % 64 == 0 &&
12676 "The number of bits in a vector must be divisible by 64 on x86!");
12677 int NumExtElements = Bits / 64;
12678
12679 // Each iteration, try extending the elements half as much, but into twice as
12680 // many elements.
12681 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12682 assert(NumElements % NumExtElements == 0 &&
12683 "The input vector size must be divisible by the extended size.");
12684 if (SDValue V = Lower(NumElements / NumExtElements))
12685 return V;
12686 }
12687
12688 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12689 if (Bits != 128)
12690 return SDValue();
12691
12692 // Returns one of the source operands if the shuffle can be reduced to a
12693 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12694 auto CanZExtLowHalf = [&]() {
12695 for (int i = NumElements / 2; i != NumElements; ++i)
12696 if (!Zeroable[i])
12697 return SDValue();
12698 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12699 return V1;
12700 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12701 return V2;
12702 return SDValue();
12703 };
12704
12705 if (SDValue V = CanZExtLowHalf()) {
12706 V = DAG.getBitcast(MVT::v2i64, V);
12707 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12708 return DAG.getBitcast(VT, V);
12709 }
12710
12711 // No viable ext lowering found.
12712 return SDValue();
12713}
12714
12715/// Try to get a scalar value for a specific element of a vector.
12716///
12717/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12718 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12719 SelectionDAG &DAG) {
12720 MVT VT = V.getSimpleValueType();
12721 MVT EltVT = VT.getVectorElementType();
12722 V = peekThroughBitcasts(V);
12723
12724 // If the bitcasts shift the element size, we can't extract an equivalent
12725 // element from it.
12726 MVT NewVT = V.getSimpleValueType();
12727 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12728 return SDValue();
12729
12730 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12731 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12732 // Ensure the scalar operand is the same size as the destination.
12733 // FIXME: Add support for scalar truncation where possible.
12734 SDValue S = V.getOperand(Idx);
12735 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12736 return DAG.getBitcast(EltVT, S);
12737 }
12738
12739 return SDValue();
12740}
12741
12742/// Helper to test for a load that can be folded with x86 shuffles.
12743///
12744/// This is particularly important because the set of instructions varies
12745/// significantly based on whether the operand is a load or not.
12746 static bool isShuffleFoldableLoad(SDValue V) {
12747 return V.hasOneUse() &&
12748 ISD::isNormalLoad(peekThroughOneUseBitcasts(V).getNode());
12749 }
12750
12751template<typename T>
12752static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12753 T EltVT = VT.getScalarType();
12754 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12755 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12756}
12757
12758/// Try to lower insertion of a single element into a zero vector.
12759///
12760 /// This is a common pattern for which we have especially efficient lowerings
12761/// across all subtarget feature sets.
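/// For example, a v4f32 mask <4,1,2,3> where elements 1-3 of V1 are zeroable
/// keeps only the low element of V2 and zeroes the rest, emitted below via
/// X86ISD::VZEXT_MOVL.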
12762 static SDValue lowerShuffleAsElementInsertion(
12763 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12764 const APInt &Zeroable, const X86Subtarget &Subtarget,
12765 SelectionDAG &DAG) {
12766 MVT ExtVT = VT;
12767 MVT EltVT = VT.getVectorElementType();
12768 unsigned NumElts = VT.getVectorNumElements();
12769 unsigned EltBits = VT.getScalarSizeInBits();
12770
12771 if (isSoftF16(EltVT, Subtarget))
12772 return SDValue();
12773
12774 int V2Index =
12775 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12776 Mask.begin();
12777 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12778 bool IsV1Zeroable = true;
12779 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12780 if (i != V2Index && !Zeroable[i]) {
12781 IsV1Zeroable = false;
12782 break;
12783 }
12784
12785 // Bail if a non-zero V1 isn't used in place.
12786 if (!IsV1Zeroable) {
12787 SmallVector<int, 8> V1Mask(Mask);
12788 V1Mask[V2Index] = -1;
12789 if (!isNoopShuffleMask(V1Mask))
12790 return SDValue();
12791 }
12792
12793 // Check for a single input from a SCALAR_TO_VECTOR node.
12794 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12795 // all the smarts here sunk into that routine. However, the current
12796 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12797 // vector shuffle lowering is dead.
12798 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12799 DAG);
12800 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12801 // We need to zext the scalar if it is smaller than an i32.
12802 V2S = DAG.getBitcast(EltVT, V2S);
12803 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12804 // Using zext to expand a narrow element won't work for non-zero
12805 // insertions. But we can use a masked constant vector if we're
12806 // inserting V2 into the bottom of V1.
12807 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12808 return SDValue();
12809
12810 // Zero-extend directly to i32.
12811 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12812 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12813
12814 // If we're inserting into a constant, mask off the inserted index
12815 // and OR with the zero-extended scalar.
12816 if (!IsV1Zeroable) {
12817 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12818 Bits[V2Index] = APInt::getZero(EltBits);
12819 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12820 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12821 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12822 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12823 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12824 }
12825 }
12826 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12827 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12828 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12829 // Either not inserting from the low element of the input or the input
12830 // element size is too small to use VZEXT_MOVL to clear the high bits.
12831 return SDValue();
12832 }
12833
12834 if (!IsV1Zeroable) {
12835 // If V1 can't be treated as a zero vector we have fewer options to lower
12836 // this. We can't support integer vectors or non-zero targets cheaply.
12837 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12838 if (!VT.isFloatingPoint() || V2Index != 0)
12839 return SDValue();
12840 if (!VT.is128BitVector())
12841 return SDValue();
12842
12843 // Otherwise, use MOVSD, MOVSS or MOVSH.
12844 unsigned MovOpc = 0;
12845 if (EltVT == MVT::f16)
12846 MovOpc = X86ISD::MOVSH;
12847 else if (EltVT == MVT::f32)
12848 MovOpc = X86ISD::MOVSS;
12849 else if (EltVT == MVT::f64)
12850 MovOpc = X86ISD::MOVSD;
12851 else
12852 llvm_unreachable("Unsupported floating point element type to handle!");
12853 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12854 }
12855
12856 // This lowering only works for the low element with floating point vectors.
12857 if (VT.isFloatingPoint() && V2Index != 0)
12858 return SDValue();
12859
12860 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12861 if (ExtVT != VT)
12862 V2 = DAG.getBitcast(VT, V2);
12863
12864 if (V2Index != 0) {
12865 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12866 // the desired position. Otherwise it is more efficient to do a vector
12867 // shift left. We know that we can do a vector shift left because all
12868 // the inputs are zero.
12869 if (VT.isFloatingPoint() || NumElts <= 4) {
12870 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12871 V2Shuffle[V2Index] = 0;
12872 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12873 } else {
12874 V2 = DAG.getBitcast(MVT::v16i8, V2);
12875 V2 = DAG.getNode(
12876 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12877 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12878 V2 = DAG.getBitcast(VT, V2);
12879 }
12880 }
12881 return V2;
12882}
12883
12884/// Try to lower broadcast of a single - truncated - integer element,
12885/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12886///
12887/// This assumes we have AVX2.
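/// For example (illustrative): broadcasting i32 element 1 of a v2i64
/// build_vector becomes SRL of the 64-bit scalar by 32, TRUNCATE to i32,
/// and then a VBROADCAST of the truncated scalar.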
12888static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12889 int BroadcastIdx,
12890 const X86Subtarget &Subtarget,
12891 SelectionDAG &DAG) {
12892 assert(Subtarget.hasAVX2() &&
12893 "We can only lower integer broadcasts with AVX2!");
12894
12895 MVT EltVT = VT.getVectorElementType();
12896 MVT V0VT = V0.getSimpleValueType();
12897
12898 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12899 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12900
12901 MVT V0EltVT = V0VT.getVectorElementType();
12902 if (!V0EltVT.isInteger())
12903 return SDValue();
12904
12905 const unsigned EltSize = EltVT.getSizeInBits();
12906 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12907
12908 // This is only a truncation if the original element type is larger.
12909 if (V0EltSize <= EltSize)
12910 return SDValue();
12911
12912 assert(((V0EltSize % EltSize) == 0) &&
12913 "Scalar type sizes must all be powers of 2 on x86!");
12914
12915 const unsigned V0Opc = V0.getOpcode();
12916 const unsigned Scale = V0EltSize / EltSize;
12917 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12918
12919 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12920 V0Opc != ISD::BUILD_VECTOR)
12921 return SDValue();
12922
12923 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12924
12925 // If we're extracting non-least-significant bits, shift so we can truncate.
12926 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12927 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12928 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12929 if (const int OffsetIdx = BroadcastIdx % Scale)
12930 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12931 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12932
12933 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12934 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12935}
12936
12937/// Test whether this can be lowered with a single SHUFPS instruction.
12938///
12939/// This is used to disable more specialized lowerings when the shufps lowering
12940/// will happen to be efficient.
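/// For example, {0, 1, 4, 5} and {2, 2, 7, 6} are single-SHUFPS masks (each
/// half reads from only one input), while {0, 4, 1, 5} is not because the low
/// half mixes both inputs.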
12941static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12942 // This routine only handles 128-bit shufps.
12943 assert(Mask.size() == 4 && "Unsupported mask size!");
12944 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12945 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12946 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12947 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12948
12949 // To lower with a single SHUFPS we need to have the low half and high half
12950 // each requiring a single input.
12951 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12952 return false;
12953 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12954 return false;
12955
12956 return true;
12957}
12958
12959/// Test whether the specified input (0 or 1) is in-place blended by the
12960/// given mask.
12961///
12962/// This returns true if the elements from a particular input are already in the
12963/// slots required by the given mask and require no permutation.
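/// For example, with Mask = {0, 5, 2, 7} both inputs are in-place: elements 0
/// and 2 of input 0 and elements 1 and 3 of input 1 already sit in their
/// destination slots.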
12964static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12965 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12966 int Size = Mask.size();
12967 for (int i = 0; i < Size; ++i)
12968 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12969 return false;
12970
12971 return true;
12972}
12973
12974/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
12975/// the given mask.
12976///
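/// For example, with Mask = {0, 4, 0, 4} input 1 is broadcastable from its
/// element 0: every lane that reads input 1 reads that single element.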
12977static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
12978 int BroadcastableElement = 0) {
12979 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12980 int Size = Mask.size();
12981 for (int i = 0; i < Size; ++i)
12982 if (Mask[i] >= 0 && Mask[i] / Size == Input &&
12983 Mask[i] % Size != BroadcastableElement)
12984 return false;
12985 return true;
12986}
12987
12988/// If we are extracting two 128-bit halves of a vector and shuffling the
12989/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12990/// multi-shuffle lowering.
12991static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12992 SDValue N1, ArrayRef<int> Mask,
12993 SelectionDAG &DAG) {
12994 MVT VT = N0.getSimpleValueType();
12995 assert((VT.is128BitVector() &&
12996 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12997 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12998
12999 // Check that both sources are extracts of the same source vector.
13000 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13001 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13002 N0.getOperand(0) != N1.getOperand(0) ||
13003 !N0.hasOneUse() || !N1.hasOneUse())
13004 return SDValue();
13005
13006 SDValue WideVec = N0.getOperand(0);
13007 MVT WideVT = WideVec.getSimpleValueType();
13008 if (!WideVT.is256BitVector())
13009 return SDValue();
13010
13011 // Match extracts of each half of the wide source vector. Commute the shuffle
13012 // if the extract of the low half is N1.
13013 unsigned NumElts = VT.getVectorNumElements();
13014 SmallVector<int, 4> NewMask(Mask);
13015 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
13016 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
13017 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
13018 ShuffleVectorSDNode::commuteMask(NewMask);
13019 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
13020 return SDValue();
13021
13022 // Final bailout: if the mask is simple, we are better off using an extract
13023 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
13024 // because that avoids a constant load from memory.
13025 if (NumElts == 4 &&
13026 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
13027 return SDValue();
13028
13029 // Extend the shuffle mask with undef elements.
13030 NewMask.append(NumElts, -1);
13031
13032 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
13033 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
13034 NewMask);
13035 // This is free: ymm -> xmm.
13036 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
13037 DAG.getVectorIdxConstant(0, DL));
13038}
13039
13040/// Try to lower broadcast of a single element.
13041///
13042/// For convenience, this code also bundles all of the subtarget feature set
13043/// filtering. While a little annoying to re-dispatch on type here, there isn't
13044/// a convenient way to factor it out.
13045static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
13046 SDValue V2, ArrayRef<int> Mask,
13047 const X86Subtarget &Subtarget,
13048 SelectionDAG &DAG) {
13049 MVT EltVT = VT.getVectorElementType();
13050 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
13051 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13052 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
13053 return SDValue();
13054
13055 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
13056 // we can only broadcast from a register with AVX2.
13057 unsigned NumEltBits = VT.getScalarSizeInBits();
13058 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
13059 ? X86ISD::MOVDDUP
13060 : X86ISD::VBROADCAST;
13061 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
13062
13063 // Check that the mask is a broadcast.
13064 int BroadcastIdx = getSplatIndex(Mask);
13065 if (BroadcastIdx < 0) {
13066 // Check for hidden broadcast.
13067 SmallVector<int, 16> BroadcastMask(VT.getVectorNumElements(), 0);
13068 if (!isShuffleEquivalent(Mask, BroadcastMask, V1, V2))
13069 return SDValue();
13070 BroadcastIdx = 0;
13071 }
13072 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
13073 "a sorted mask where the broadcast "
13074 "comes from V1.");
13075 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
13076
13077 // Go up the chain of (vector) values to find a scalar load that we can
13078 // combine with the broadcast.
13079 // TODO: Combine this logic with findEltLoadSrc() used by
13080 // EltsFromConsecutiveLoads().
13081 int BitOffset = BroadcastIdx * NumEltBits;
13082 SDValue V = V1;
13083 for (;;) {
13084 switch (V.getOpcode()) {
13085 case ISD::BITCAST: {
13086 V = V.getOperand(0);
13087 continue;
13088 }
13089 case ISD::CONCAT_VECTORS: {
13090 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
13091 int OpIdx = BitOffset / OpBitWidth;
13092 V = V.getOperand(OpIdx);
13093 BitOffset %= OpBitWidth;
13094 continue;
13095 }
13096 case ISD::EXTRACT_SUBVECTOR: {
13097 // The extraction index adds to the existing offset.
13098 unsigned EltBitWidth = V.getScalarValueSizeInBits();
13099 unsigned Idx = V.getConstantOperandVal(1);
13100 unsigned BeginOffset = Idx * EltBitWidth;
13101 BitOffset += BeginOffset;
13102 V = V.getOperand(0);
13103 continue;
13104 }
13105 case ISD::INSERT_SUBVECTOR: {
13106 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
13107 int EltBitWidth = VOuter.getScalarValueSizeInBits();
13108 int Idx = (int)V.getConstantOperandVal(2);
13109 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
13110 int BeginOffset = Idx * EltBitWidth;
13111 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
13112 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
13113 BitOffset -= BeginOffset;
13114 V = VInner;
13115 } else {
13116 V = VOuter;
13117 }
13118 continue;
13119 }
13120 }
13121 break;
13122 }
13123 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
13124 BroadcastIdx = BitOffset / NumEltBits;
13125
13126 // Do we need to bitcast the source to retrieve the original broadcast index?
13127 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
13128
13129 // Check if this is a broadcast of a scalar. We special case lowering
13130 // for scalars so that we can more effectively fold with loads.
13131 // If the original value has a larger element type than the shuffle, the
13132 // broadcast element is in essence truncated. Make that explicit to ease
13133 // folding.
13134 if (BitCastSrc && VT.isInteger())
13135 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
13136 DL, VT, V, BroadcastIdx, Subtarget, DAG))
13137 return TruncBroadcast;
13138
13139 // Also check the simpler case, where we can directly reuse the scalar.
13140 if (!BitCastSrc &&
13141 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
13142 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
13143 V = V.getOperand(BroadcastIdx);
13144
13145 // If we can't broadcast from a register, check that the input is a load.
13146 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
13147 return SDValue();
13148 } else if (ISD::isNormalLoad(V.getNode()) &&
13149 cast<LoadSDNode>(V)->isSimple()) {
13150 // We do not check for one-use of the vector load because a broadcast load
13151 // is expected to be a win for code size, register pressure, and possibly
13152 // uops even if the original vector load is not eliminated.
13153
13154 // Reduce the vector load and shuffle to a broadcasted scalar load.
13155 auto *Ld = cast<LoadSDNode>(V);
13156 SDValue BaseAddr = Ld->getBasePtr();
13157 MVT SVT = VT.getScalarType();
13158 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
13159 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
13160 SDValue NewAddr =
13161 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
13162
13163 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
13164 // than MOVDDUP.
13165 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
13166 if (Opcode == X86ISD::VBROADCAST) {
13167 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
13168 SDValue Ops[] = {Ld->getChain(), NewAddr};
13169 V = DAG.getMemIntrinsicNode(
13170 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
13171 DAG.getMachineFunction().getMachineMemOperand(
13172 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13173 DAG.makeEquivalentMemoryOrdering(Ld, V);
13174 return DAG.getBitcast(VT, V);
13175 }
13176 assert(SVT == MVT::f64 && "Unexpected VT!");
13177 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
13178 DAG.getMachineFunction().getMachineMemOperand(
13179 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
13180 DAG.makeEquivalentMemoryOrdering(Ld, V);
13181 } else if (!BroadcastFromReg) {
13182 // We can't broadcast from a vector register.
13183 return SDValue();
13184 } else if (BitOffset != 0) {
13185 // We can only broadcast from the zero-element of a vector register,
13186 // but it can be advantageous to broadcast from the zero-element of a
13187 // subvector.
13188 if (!VT.is256BitVector() && !VT.is512BitVector())
13189 return SDValue();
13190
13191 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
13192 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13193 return SDValue();
13194
13195 // If we are broadcasting an element from the lowest 128-bit subvector, try
13196 // to move the element in position.
13197 if (BitOffset < 128 && NumActiveElts > 1 &&
13198 V.getScalarValueSizeInBits() == NumEltBits) {
13199 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13200 "Unexpected bit-offset");
13201 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
13202 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
13203 V = extractSubVector(V, 0, DAG, DL, 128);
13204 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
13205 } else {
13206 // Only broadcast the zero-element of a 128-bit subvector.
13207 if ((BitOffset % 128) != 0)
13208 return SDValue();
13209
13210 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
13211 "Unexpected bit-offset");
13212 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
13213 "Unexpected vector size");
13214 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
13215 V = extract128BitVector(V, ExtractIdx, DAG, DL);
13216 }
13217 }
13218
13219 // On AVX we can use VBROADCAST directly for scalar sources.
13220 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
13221 V = DAG.getBitcast(MVT::f64, V);
13222 if (Subtarget.hasAVX()) {
13223 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
13224 return DAG.getBitcast(VT, V);
13225 }
13226 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
13227 }
13228
13229 // If this is a scalar, do the broadcast on this type and bitcast.
13230 if (!V.getValueType().isVector()) {
13231 assert(V.getScalarValueSizeInBits() == NumEltBits &&
13232 "Unexpected scalar size");
13233 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
13234 VT.getVectorNumElements());
13235 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
13236 }
13237
13238 // We only support broadcasting from 128-bit vectors to minimize the
13239 // number of patterns we need to deal with in isel. So extract down to
13240 // 128-bits, removing as many bitcasts as possible.
13241 if (V.getValueSizeInBits() > 128)
13242 V = extract128BitVector(V, 0, DAG, DL);
13243
13244 // Otherwise cast V to a vector with the same element type as VT, but
13245 // possibly narrower than VT. Then perform the broadcast.
13246 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
13247 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
13248 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
13249}
13250
13251// Check for whether we can use INSERTPS to perform the shuffle. We only use
13252// INSERTPS when the V1 elements are already in the correct locations
13253// because otherwise we can just always use two SHUFPS instructions which
13254// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
13255// perform INSERTPS if a single V1 element is out of place and all V2
13256// elements are zeroable.
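// For example (illustrative), the v4f32 mask {4, 1, 2, 3} with nothing
// zeroable matches with InsertPSMask = 0x00: element 0 of V2 is inserted
// into lane 0 of V1 and no lanes are zeroed.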
13257static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
13258 unsigned &InsertPSMask,
13259 const APInt &Zeroable,
13260 ArrayRef<int> Mask, SelectionDAG &DAG) {
13261 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
13262 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
13263 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13264
13265 // Attempt to match INSERTPS with one element from VA or VB being
13266 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
13267 // are updated.
13268 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
13269 ArrayRef<int> CandidateMask) {
13270 unsigned ZMask = 0;
13271 int VADstIndex = -1;
13272 int VBDstIndex = -1;
13273 bool VAUsedInPlace = false;
13274
13275 for (int i = 0; i < 4; ++i) {
13276 // Synthesize a zero mask from the zeroable elements (includes undefs).
13277 if (Zeroable[i]) {
13278 ZMask |= 1 << i;
13279 continue;
13280 }
13281
13282 // Flag if we use any VA inputs in place.
13283 if (i == CandidateMask[i]) {
13284 VAUsedInPlace = true;
13285 continue;
13286 }
13287
13288 // We can only insert a single non-zeroable element.
13289 if (VADstIndex >= 0 || VBDstIndex >= 0)
13290 return false;
13291
13292 if (CandidateMask[i] < 4) {
13293 // VA input out of place for insertion.
13294 VADstIndex = i;
13295 } else {
13296 // VB input for insertion.
13297 VBDstIndex = i;
13298 }
13299 }
13300
13301 // Don't bother if we have no (non-zeroable) element for insertion.
13302 if (VADstIndex < 0 && VBDstIndex < 0)
13303 return false;
13304
13305 // Determine element insertion src/dst indices. The src index is from the
13306 // start of the inserted vector, not the start of the concatenated vector.
13307 unsigned VBSrcIndex = 0;
13308 if (VADstIndex >= 0) {
13309 // If we have a VA input out of place, we use VA as the V2 element
13310 // insertion and don't use the original V2 at all.
13311 VBSrcIndex = CandidateMask[VADstIndex];
13312 VBDstIndex = VADstIndex;
13313 VB = VA;
13314 } else {
13315 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13316 }
13317
13318 // If no V1 inputs are used in place, then the result is created only from
13319 // the zero mask and the V2 insertion - so remove V1 dependency.
13320 if (!VAUsedInPlace)
13321 VA = DAG.getUNDEF(MVT::v4f32);
13322
13323 // Update V1, V2 and InsertPSMask accordingly.
13324 V1 = VA;
13325 V2 = VB;
13326
13327 // Insert the V2 element into the desired position.
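    // The 8-bit INSERTPS immediate packs: bits [7:6] = source element of V2,
    // bits [5:4] = destination lane, bits [3:0] = lanes to force to zero.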
13328 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13329 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13330 return true;
13331 };
13332
13333 if (matchAsInsertPS(V1, V2, Mask))
13334 return true;
13335
13336 // Commute and try again.
13337 SmallVector<int, 4> CommutedMask(Mask);
13338 ShuffleVectorSDNode::commuteMask(CommutedMask);
13339 if (matchAsInsertPS(V2, V1, CommutedMask))
13340 return true;
13341
13342 return false;
13343}
13344
13345static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13346 ArrayRef<int> Mask, const APInt &Zeroable,
13347 SelectionDAG &DAG) {
13348 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13349 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13350
13351 // Attempt to match the insertps pattern.
13352 unsigned InsertPSMask = 0;
13353 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13354 return SDValue();
13355
13356 // Insert the V2 element into the desired position.
13357 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13358 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13359}
13360
13361/// Handle lowering of 2-lane 64-bit floating point shuffles.
13362///
13363/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13364/// support for floating point shuffles but not integer shuffles. These
13365/// instructions will incur a domain crossing penalty on some chips, though, so
13366/// it is better to avoid lowering through this for integer vectors where
13367/// possible.
13368static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13369 const APInt &Zeroable, SDValue V1, SDValue V2,
13370 const X86Subtarget &Subtarget,
13371 SelectionDAG &DAG) {
13372 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13373 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13374 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13375
13376 if (V2.isUndef()) {
13377 // Check for being able to broadcast a single element.
13378 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13379 Mask, Subtarget, DAG))
13380 return Broadcast;
13381
13382 // Straight shuffle of a single input vector. Simulate this by using the
13383 // single input as both of the "inputs" to this instruction.
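    // For example, Mask = {1, 0} gives SHUFPDMask = 0b01: result lane 0 takes
    // element 1 of V1 and result lane 1 takes element 0 of V1.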
13384 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13385
13386 if (Subtarget.hasAVX()) {
13387 // If we have AVX, we can use VPERMILPS which will allow folding a load
13388 // into the shuffle.
13389 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13390 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13391 }
13392
13393 return DAG.getNode(
13394 X86ISD::SHUFP, DL, MVT::v2f64,
13395 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13396 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13397 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13398 }
13399 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13400 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13401 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13402 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13403
13404 if (Subtarget.hasAVX2())
13405 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13406 return Extract;
13407
13408 // When loading a scalar and then shuffling it into a vector we can often do
13409 // the insertion cheaply.
13410 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13411 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13412 return Insertion;
13413 // Try inverting the insertion since for v2 masks it is easy to do and we
13414 // can't reliably sort the mask one way or the other.
13415 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13416 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13417 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13418 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13419 return Insertion;
13420
13421 // Try to use one of the special instruction patterns to handle two common
13422 // blend patterns if a zero-blend above didn't work.
13423 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13424 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13425 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13426 // We can either use a special instruction to load over the low double or
13427 // to move just the low double.
13428 return DAG.getNode(
13429 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13430 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13431
13432 if (Subtarget.hasSSE41())
13433 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13434 Zeroable, Subtarget, DAG))
13435 return Blend;
13436
13437 // Use dedicated unpack instructions for masks that match their pattern.
13438 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13439 return V;
13440
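  // For example, Mask = {0, 3} gives SHUFPDMask = 0b10: result lane 0 takes
  // element 0 of V1 and result lane 1 takes element 1 of V2.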
13441 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13442 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13443 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13444}
13445
13446/// Handle lowering of 2-lane 64-bit integer shuffles.
13447///
13448/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13449/// the integer unit to minimize domain crossing penalties. However, for blends
13450/// it falls back to the floating point shuffle operation with appropriate bit
13451/// casting.
13452static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13453 const APInt &Zeroable, SDValue V1, SDValue V2,
13454 const X86Subtarget &Subtarget,
13455 SelectionDAG &DAG) {
13456 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13457 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13458 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13459
13460 if (V2.isUndef()) {
13461 // Check for being able to broadcast a single element.
13462 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13463 Mask, Subtarget, DAG))
13464 return Broadcast;
13465
13466 // Straight shuffle of a single input vector. For everything from SSE2
13467 // onward this has a single fast instruction with no scary immediates.
13468 // We have to map the mask as it is actually a v4i32 shuffle instruction.
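    // For example, the v2i64 mask {1, 0} widens to the v4i32 PSHUFD mask
    // {2, 3, 0, 1}.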
13469 V1 = DAG.getBitcast(MVT::v4i32, V1);
13470 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13471 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13472 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13473 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
13474 return DAG.getBitcast(
13475 MVT::v2i64,
13476 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13477 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13478 }
13479 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13480 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13481 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13482 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13483
13484 if (Subtarget.hasAVX2())
13485 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13486 return Extract;
13487
13488 // Try to use shift instructions.
13489 if (SDValue Shift =
13490 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13491 DAG, /*BitwiseOnly*/ false))
13492 return Shift;
13493
13494 // When loading a scalar and then shuffling it into a vector we can often do
13495 // the insertion cheaply.
13496 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13497 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13498 return Insertion;
13499 // Try inverting the insertion since for v2 masks it is easy to do and we
13500 // can't reliably sort the mask one way or the other.
13501 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13502 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13503 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13504 return Insertion;
13505
13506 // We have different paths for blend lowering, but they all must use the
13507 // *exact* same predicate.
13508 bool IsBlendSupported = Subtarget.hasSSE41();
13509 if (IsBlendSupported)
13510 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13511 Zeroable, Subtarget, DAG))
13512 return Blend;
13513
13514 // Use dedicated unpack instructions for masks that match their pattern.
13515 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13516 return V;
13517
13518 // Try to use byte rotation instructions.
13519 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13520 if (Subtarget.hasSSSE3()) {
13521 if (Subtarget.hasVLX())
13522 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13523 Zeroable, Subtarget, DAG))
13524 return Rotate;
13525
13526 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13527 Subtarget, DAG))
13528 return Rotate;
13529 }
13530
13531 // If we have direct support for blends, we should lower by decomposing into
13532 // a permute. That will be faster than the domain cross.
13533 if (IsBlendSupported)
13534 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13535 Zeroable, Subtarget, DAG);
13536
13537 // We implement this with SHUFPD which is pretty lame because it will likely
13538 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13539 // However, all the alternatives are still more cycles and newer chips don't
13540 // have this problem. It would be really nice if x86 had better shuffles here.
13541 V1 = DAG.getBitcast(MVT::v2f64, V1);
13542 V2 = DAG.getBitcast(MVT::v2f64, V2);
13543 return DAG.getBitcast(MVT::v2i64,
13544 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13545}
13546
13547/// Lower a vector shuffle using the SHUFPS instruction.
13548///
13549/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13550/// It makes no assumptions about whether this is the *best* lowering, it simply
13551/// uses it.
13552static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13553 ArrayRef<int> Mask, SDValue V1,
13554 SDValue V2, SelectionDAG &DAG) {
13555 SDValue LowV = V1, HighV = V2;
13556 SmallVector<int, 4> NewMask(Mask);
13557 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13558
13559 if (NumV2Elements == 1) {
13560 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13561
13562 // Compute the index adjacent to V2Index and in the same half by toggling
13563 // the low bit.
13564 int V2AdjIndex = V2Index ^ 1;
13565
13566 if (Mask[V2AdjIndex] < 0) {
13567 // Handles all the cases where we have a single V2 element and an undef.
13568 // This will only ever happen in the high lanes because we commute the
13569 // vector otherwise.
13570 if (V2Index < 2)
13571 std::swap(LowV, HighV);
13572 NewMask[V2Index] -= 4;
13573 } else {
13574 // Handle the case where the V2 element ends up adjacent to a V1 element.
13575 // To make this work, blend them together as the first step.
13576 int V1Index = V2AdjIndex;
13577 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13578 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13579 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13580
13581 // Now proceed to reconstruct the final blend as we have the necessary
13582 // high or low half formed.
13583 if (V2Index < 2) {
13584 LowV = V2;
13585 HighV = V1;
13586 } else {
13587 HighV = V2;
13588 }
13589 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13590 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13591 }
13592 } else if (NumV2Elements == 2) {
13593 if (Mask[0] < 4 && Mask[1] < 4) {
13594 // Handle the easy case where we have V1 in the low lanes and V2 in the
13595 // high lanes.
13596 NewMask[2] -= 4;
13597 NewMask[3] -= 4;
13598 } else if (Mask[2] < 4 && Mask[3] < 4) {
13599 // We also handle the reversed case because this utility may get called
13600 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13601 // arrange things in the right direction.
13602 NewMask[0] -= 4;
13603 NewMask[1] -= 4;
13604 HighV = V1;
13605 LowV = V2;
13606 } else {
13607 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13608 // trying to place elements directly, just blend them and set up the final
13609 // shuffle to place them.
13610
13611 // The first two blend mask elements are for V1, the second two are for
13612 // V2.
13613 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13614 Mask[2] < 4 ? Mask[2] : Mask[3],
13615 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13616 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13617 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13618 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13619
13620 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13621 // a blend.
13622 LowV = HighV = V1;
13623 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13624 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13625 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13626 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13627 }
13628 } else if (NumV2Elements == 3) {
13629 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13630 // we can get here due to other paths (e.g. repeated mask matching) for
13631 // which we don't want to do another round of lowerVECTOR_SHUFFLE.
13632 ShuffleVectorSDNode::commuteMask(NewMask);
13633 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13634 }
13635 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13636 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13637}
13638
13639/// Lower 4-lane 32-bit floating point shuffles.
13640///
13641/// Uses instructions exclusively from the floating point unit to minimize
13642/// domain crossing penalties, as these are sufficient to implement all v4f32
13643/// shuffles.
13644static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13645 const APInt &Zeroable, SDValue V1, SDValue V2,
13646 const X86Subtarget &Subtarget,
13647 SelectionDAG &DAG) {
13648 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13649 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13650 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13651
13652 if (Subtarget.hasSSE41())
13653 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13654 Zeroable, Subtarget, DAG))
13655 return Blend;
13656
13657 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13658
13659 if (NumV2Elements == 0) {
13660 // Check for being able to broadcast a single element.
13661 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13662 Mask, Subtarget, DAG))
13663 return Broadcast;
13664
13665 // Use even/odd duplicate instructions for masks that match their pattern.
13666 if (Subtarget.hasSSE3()) {
13667 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13668 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13669 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13670 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13671 }
13672
13673 if (Subtarget.hasAVX()) {
13674 // If we have AVX, we can use VPERMILPS which will allow folding a load
13675 // into the shuffle.
13676 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13677 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13678 }
13679
13680 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13681 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13682 if (!Subtarget.hasSSE2()) {
13683 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13684 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13685 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13686 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13687 }
13688
13689 // Otherwise, use a straight shuffle of a single input vector. We pass the
13690 // input vector to both operands to simulate this with a SHUFPS.
13691 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13692 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13693 }
13694
13695 if (Subtarget.hasSSE2())
13696 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13697 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13698 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13699 return ZExt;
13700 }
13701
13702 if (Subtarget.hasAVX2())
13703 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13704 return Extract;
13705
13706 // There are special ways we can lower some single-element blends. However, we
13707 // have custom ways we can lower more complex single-element blends below that
13708 // we defer to if both this and BLENDPS fail to match, so restrict this to
13709 // when the V2 input is targeting element 0 of the mask -- that is the fast
13710 // case here.
13711 if (NumV2Elements == 1 && Mask[0] >= 4)
13712 if (SDValue V = lowerShuffleAsElementInsertion(
13713 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13714 return V;
13715
13716 if (Subtarget.hasSSE41()) {
13717 // Use INSERTPS if we can complete the shuffle efficiently.
13718 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13719 return V;
13720
13721 if (!isSingleSHUFPSMask(Mask))
13722 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13723 V2, Mask, DAG))
13724 return BlendPerm;
13725 }
13726
13727 // Use low/high mov instructions. These are only valid in SSE1 because
13728 // otherwise they are widened to v2f64 and never get here.
13729 if (!Subtarget.hasSSE2()) {
13730 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13731 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13732 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13733 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13734 }
13735
13736 // Use dedicated unpack instructions for masks that match their pattern.
13737 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13738 return V;
13739
13740 // Otherwise fall back to a SHUFPS lowering strategy.
13741 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13742}
13743
13744/// Lower 4-lane i32 vector shuffles.
13745///
13746/// We try to handle these with integer-domain shuffles where we can, but for
13747/// blends we use the floating point domain blend instructions.
13748static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13749 const APInt &Zeroable, SDValue V1, SDValue V2,
13750 const X86Subtarget &Subtarget,
13751 SelectionDAG &DAG) {
13752 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13753 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13754 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13755
13756 // Whenever we can lower this as a zext, that instruction is strictly faster
13757 // than any alternative. It also allows us to fold memory operands into the
13758 // shuffle in many cases.
13759 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13760 Zeroable, Subtarget, DAG))
13761 return ZExt;
13762
13763 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13764
13765 // Try to use shift instructions if fast.
13766 if (Subtarget.preferLowerShuffleAsShift()) {
13767 if (SDValue Shift =
13768 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13769 Subtarget, DAG, /*BitwiseOnly*/ true))
13770 return Shift;
13771 if (NumV2Elements == 0)
13772 if (SDValue Rotate =
13773 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13774 return Rotate;
13775 }
13776
13777 if (NumV2Elements == 0) {
13778 // Try to use broadcast unless the mask only has one non-undef element.
13779 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13780 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13781 Mask, Subtarget, DAG))
13782 return Broadcast;
13783 }
13784
13785 // Straight shuffle of a single input vector. For everything from SSE2
13786 // onward this has a single fast instruction with no scary immediates.
13787 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13788 // but we aren't actually going to use the UNPCK instruction because doing
13789 // so prevents folding a load into this instruction or making a copy.
13790 const int UnpackLoMask[] = {0, 0, 1, 1};
13791 const int UnpackHiMask[] = {2, 2, 3, 3};
13792 if (!isSingleElementRepeatedMask(Mask)) {
13793 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13794 Mask = UnpackLoMask;
13795 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13796 Mask = UnpackHiMask;
13797 }
13798
13799 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13800 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13801 }
13802
13803 if (Subtarget.hasAVX2())
13804 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13805 return Extract;
13806
13807 // Try to use shift instructions.
13808 if (SDValue Shift =
13809 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13810 DAG, /*BitwiseOnly*/ false))
13811 return Shift;
13812
13813 // There are special ways we can lower some single-element blends.
13814 if (NumV2Elements == 1)
13815 if (SDValue V = lowerShuffleAsElementInsertion(
13816 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13817 return V;
13818
13819 // We have different paths for blend lowering, but they all must use the
13820 // *exact* same predicate.
13821 bool IsBlendSupported = Subtarget.hasSSE41();
13822 if (IsBlendSupported)
13823 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13824 Zeroable, Subtarget, DAG))
13825 return Blend;
13826
13827 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13828 Zeroable, Subtarget, DAG))
13829 return Masked;
13830
13831 // Use dedicated unpack instructions for masks that match their pattern.
13832 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13833 return V;
13834
13835 // Try to use byte rotation instructions.
13836 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13837 if (Subtarget.hasSSSE3()) {
13838 if (Subtarget.hasVLX())
13839 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13840 Zeroable, Subtarget, DAG))
13841 return Rotate;
13842
13843 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13844 Subtarget, DAG))
13845 return Rotate;
13846 }
13847
13848 // Assume that a single SHUFPS is faster than an alternative sequence of
13849 // multiple instructions (even if the CPU has a domain penalty).
13850 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13851 if (!isSingleSHUFPSMask(Mask)) {
13852 // If we have direct support for blends, we should lower by decomposing into
13853 // a permute. That will be faster than the domain cross.
13854 if (IsBlendSupported)
13855 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13856 Zeroable, Subtarget, DAG);
13857
13858 // Try to lower by permuting the inputs into an unpack instruction.
13859 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13860 Mask, Subtarget, DAG))
13861 return Unpack;
13862 }
13863
13864 // We implement this with SHUFPS because it can blend from two vectors.
13865 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13866 // up the inputs, bypassing domain shift penalties that we would incur if we
13867 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13868 // relevant.
13869 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13870 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13871 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13872 return DAG.getBitcast(MVT::v4i32, ShufPS);
13873}
13874
13875/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13876/// shuffle lowering, and the most complex part.
13877///
13878/// The lowering strategy is to try to form pairs of input lanes which are
13879/// targeted at the same half of the final vector, and then use a dword shuffle
13880/// to place them onto the right half, and finally unpack the paired lanes into
13881/// their final position.
13882///
13883/// The exact breakdown of how to form these dword pairs and align them on the
13884/// correct sides is really tricky. See the comments within the function for
13885/// more of the details.
13886///
13887/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13888/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13889/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13890/// vector, form the analogous 128-bit 8-element Mask.
13891static SDValue lowerV8I16GeneralSingleInputShuffle(
13892 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13893 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13894 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13895 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13896
13897 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13898 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13899 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13900
13901 // Attempt to directly match PSHUFLW or PSHUFHW.
13902 if (isUndefOrInRange(LoMask, 0, 4) &&
13903 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13904 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13905 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13906 }
13907 if (isUndefOrInRange(HiMask, 4, 8) &&
13908 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13909 for (int i = 0; i != 4; ++i)
13910 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13911 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13912 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13913 }
13914
13915 SmallVector<int, 4> LoInputs;
13916 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13917 array_pod_sort(LoInputs.begin(), LoInputs.end());
13918 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13919 SmallVector<int, 4> HiInputs;
13920 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13921 array_pod_sort(HiInputs.begin(), HiInputs.end());
13922 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13923 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13924 int NumHToL = LoInputs.size() - NumLToL;
13925 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13926 int NumHToH = HiInputs.size() - NumLToH;
13927 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13928 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13929 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13930 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13931
13932 // If we are shuffling values from one half - check how many different DWORD
13933 // pairs we need to create. If only 1 or 2 then we can perform this as a
13934 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13935 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13936 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13937 V = DAG.getNode(ShufWOp, DL, VT, V,
13938 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13939 V = DAG.getBitcast(PSHUFDVT, V);
13940 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13941 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13942 return DAG.getBitcast(VT, V);
13943 };
13944
13945 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13946 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13947 SmallVector<std::pair<int, int>, 4> DWordPairs;
13948 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13949
13950 // Collect the different DWORD pairs.
13951 for (int DWord = 0; DWord != 4; ++DWord) {
13952 int M0 = Mask[2 * DWord + 0];
13953 int M1 = Mask[2 * DWord + 1];
13954 M0 = (M0 >= 0 ? M0 % 4 : M0);
13955 M1 = (M1 >= 0 ? M1 % 4 : M1);
13956 if (M0 < 0 && M1 < 0)
13957 continue;
13958
13959 bool Match = false;
13960 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13961 auto &DWordPair = DWordPairs[j];
13962 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13963 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13964 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13965 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13966 PSHUFDMask[DWord] = DOffset + j;
13967 Match = true;
13968 break;
13969 }
13970 }
13971 if (!Match) {
13972 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13973 DWordPairs.push_back(std::make_pair(M0, M1));
13974 }
13975 }
13976
13977 if (DWordPairs.size() <= 2) {
13978 DWordPairs.resize(2, std::make_pair(-1, -1));
13979 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13980 DWordPairs[1].first, DWordPairs[1].second};
13981 // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
13982 if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
13983 ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
13984 int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
13985 std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
13986 PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
13987 PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
13988 }
13989 if ((NumHToL + NumHToH) == 0)
13990 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13991 if ((NumLToL + NumLToH) == 0)
13992 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13993 }
13994 }
13995
13996 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13997 // such inputs we can swap two of the dwords across the half mark and end up
13998 // with <=2 inputs to each half in each half. Once there, we can fall through
13999 // to the generic code below. For example:
14000 //
14001 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14002 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
14003 //
14004 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
14005 // and an existing 2-into-2 on the other half. In this case we may have to
14006 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
14007 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
14008 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
14009 // because any other situation (including a 3-into-1 or 1-into-3 in the other
14010 // half than the one we target for fixing) will be fixed when we re-enter this
14011 // path. We will also combine away any sequence of PSHUFD instructions that
14012 // result into a single instruction. Here is an example of the tricky case:
14013 //
14014 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
14015 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
14016 //
14017 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
14018 //
14019 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
14020 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
14021 //
14022 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
14023 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
14024 //
14025 // The result is fine to be handled by the generic logic.
14026 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
14027 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
14028 int AOffset, int BOffset) {
14029 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
14030 "Must call this with A having 3 or 1 inputs from the A half.");
14031 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
14032 "Must call this with B having 1 or 3 inputs from the B half.");
14033 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
14034 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
14035
14036 bool ThreeAInputs = AToAInputs.size() == 3;
14037
14038 // Compute the index of dword with only one word among the three inputs in
14039 // a half by taking the sum of the half with three inputs and subtracting
14040 // the sum of the actual three inputs. The difference is the remaining
14041 // slot.
14042 int ADWord = 0, BDWord = 0;
14043 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
14044 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
14045 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
14046 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
14047 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
14048 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
14049 int TripleNonInputIdx =
14050 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
14051 TripleDWord = TripleNonInputIdx / 2;
14052
14053 // We use xor with one to compute the adjacent DWord to whichever one the
14054 // OneInput is in.
14055 OneInputDWord = (OneInput / 2) ^ 1;
14056
14057 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
14058 // and BToA inputs. If there is also such a problem with the BToB and AToB
14059 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
14060 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
14061 // is essential that we don't *create* a 3<-1 as then we might oscillate.
14062 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
14063 // Compute how many inputs will be flipped by swapping these DWords. We
14064 // need
14065 // to balance this to ensure we don't form a 3-1 shuffle in the other
14066 // half.
14067 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
14068 llvm::count(AToBInputs, 2 * ADWord + 1);
14069 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
14070 llvm::count(BToBInputs, 2 * BDWord + 1);
14071 if ((NumFlippedAToBInputs == 1 &&
14072 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
14073 (NumFlippedBToBInputs == 1 &&
14074 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
14075 // We choose whether to fix the A half or B half based on whether that
14076 // half has zero flipped inputs. At zero, we may not be able to fix it
14077 // with that half. We also bias towards fixing the B half because that
14078 // will more commonly be the high half, and we have to bias one way.
14079 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
14080 ArrayRef<int> Inputs) {
14081 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
14082 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
14083 // Determine whether the free index is in the flipped dword or the
14084 // unflipped dword based on where the pinned index is. We use this bit
14085 // in an xor to conditionally select the adjacent dword.
14086 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
14087 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14088 if (IsFixIdxInput == IsFixFreeIdxInput)
14089 FixFreeIdx += 1;
14090 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
14091 assert(IsFixIdxInput != IsFixFreeIdxInput &&
14092 "We need to be changing the number of flipped inputs!");
14093 int PSHUFHalfMask[] = {0, 1, 2, 3};
14094 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
14095 V = DAG.getNode(
14096 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
14097 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
14098 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
14099
14100 for (int &M : Mask)
14101 if (M >= 0 && M == FixIdx)
14102 M = FixFreeIdx;
14103 else if (M >= 0 && M == FixFreeIdx)
14104 M = FixIdx;
14105 };
14106 if (NumFlippedBToBInputs != 0) {
14107 int BPinnedIdx =
14108 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
14109 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
14110 } else {
14111 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
14112 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
14113 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
14114 }
14115 }
14116 }
14117
14118 int PSHUFDMask[] = {0, 1, 2, 3};
14119 PSHUFDMask[ADWord] = BDWord;
14120 PSHUFDMask[BDWord] = ADWord;
14121 V = DAG.getBitcast(
14122 VT,
14123 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14124 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
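  // For example, with ADWord == 1 and BDWord == 2 this builds PSHUFDMask
  // {0, 2, 1, 3}, i.e. a PSHUFD with immediate 0xD8 that swaps the two middle
  // DWords of V.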
14125
14126 // Adjust the mask to match the new locations of A and B.
14127 for (int &M : Mask)
14128 if (M >= 0 && M/2 == ADWord)
14129 M = 2 * BDWord + M % 2;
14130 else if (M >= 0 && M/2 == BDWord)
14131 M = 2 * ADWord + M % 2;
14132
14133 // Recurse back into this routine to re-compute state now that this isn't
14134 // a 3 and 1 problem.
14135 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
14136 };
14137 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
14138 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
14139 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
14140 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
14141
14142 // At this point there are at most two inputs to the low and high halves from
14143 // each half. That means the inputs can always be grouped into dwords and
14144 // those dwords can then be moved to the correct half with a dword shuffle.
14145 // We use at most one low and one high word shuffle to collect these paired
14146 // inputs into dwords, and finally a dword shuffle to place them.
14147 int PSHUFLMask[4] = {-1, -1, -1, -1};
14148 int PSHUFHMask[4] = {-1, -1, -1, -1};
14149 int PSHUFDMask[4] = {-1, -1, -1, -1};
14150
14151 // First fix the masks for all the inputs that are staying in their
14152 // original halves. This will then dictate the targets of the cross-half
14153 // shuffles.
14154 auto fixInPlaceInputs =
14155 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
14156 MutableArrayRef<int> SourceHalfMask,
14157 MutableArrayRef<int> HalfMask, int HalfOffset) {
14158 if (InPlaceInputs.empty())
14159 return;
14160 if (InPlaceInputs.size() == 1) {
14161 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14162 InPlaceInputs[0] - HalfOffset;
14163 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
14164 return;
14165 }
14166 if (IncomingInputs.empty()) {
14167 // Just fix all of the in place inputs.
14168 for (int Input : InPlaceInputs) {
14169 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
14170 PSHUFDMask[Input / 2] = Input / 2;
14171 }
14172 return;
14173 }
14174
14175 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
14176 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
14177 InPlaceInputs[0] - HalfOffset;
14178 // Put the second input next to the first so that they are packed into
14179 // a dword. We find the adjacent index by toggling the low bit.
14180 int AdjIndex = InPlaceInputs[0] ^ 1;
14181 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
14182 llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
14183 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
14184 };
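  // For example, if the in-place inputs for the low half are words 0 and 2,
  // the word shuffle pins slot 0 and pulls word 2 into slot 1 (AdjIndex == 1),
  // so the pair ends up packed into DWord 0 and PSHUFDMask[0] is fixed to 0.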
14185 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
14186 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
14187
14188 // Now gather the cross-half inputs and place them into a free dword of
14189 // their target half.
14190 // FIXME: This operation could almost certainly be simplified dramatically to
14191 // look more like the 3-1 fixing operation.
14192 auto moveInputsToRightHalf = [&PSHUFDMask](
14193 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
14194 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
14195 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
14196 int DestOffset) {
14197 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
14198 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
14199 };
14200 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
14201 int Word) {
14202 int LowWord = Word & ~1;
14203 int HighWord = Word | 1;
14204 return isWordClobbered(SourceHalfMask, LowWord) ||
14205 isWordClobbered(SourceHalfMask, HighWord);
14206 };
14207
14208 if (IncomingInputs.empty())
14209 return;
14210
14211 if (ExistingInputs.empty()) {
14212 // Map any dwords with inputs from them into the right half.
14213 for (int Input : IncomingInputs) {
14214 // If the source half mask maps over the inputs, turn those into
14215 // swaps and use the swapped lane.
14216 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
14217 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
14218 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
14219 Input - SourceOffset;
14220 // We have to swap the uses in our half mask in one sweep.
14221 for (int &M : HalfMask)
14222 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
14223 M = Input;
14224 else if (M == Input)
14225 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14226 } else {
14227 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
14228 Input - SourceOffset &&
14229 "Previous placement doesn't match!");
14230 }
14231 // Note that this correctly re-maps both when we do a swap and when
14232 // we observe the other side of the swap above. We rely on that to
14233 // avoid swapping the members of the input list directly.
14234 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
14235 }
14236
14237 // Map the input's dword into the correct half.
14238 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
14239 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
14240 else
14241 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
14242 Input / 2 &&
14243 "Previous placement doesn't match!");
14244 }
14245
14246 // And just directly shift any other-half mask elements to be same-half
14247 // as we will have mirrored the dword containing the element into the
14248 // same position within that half.
14249 for (int &M : HalfMask)
14250 if (M >= SourceOffset && M < SourceOffset + 4) {
14251 M = M - SourceOffset + DestOffset;
14252 assert(M >= 0 && "This should never wrap below zero!");
14253 }
14254 return;
14255 }
14256
14257 // Ensure we have the input in a viable dword of its current half. This
14258 // is particularly tricky because the original position may be clobbered
14259 // by inputs being moved and *staying* in that half.
14260 if (IncomingInputs.size() == 1) {
14261 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14262 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
14263 SourceOffset;
14264 SourceHalfMask[InputFixed - SourceOffset] =
14265 IncomingInputs[0] - SourceOffset;
14266 llvm::replace(HalfMask, IncomingInputs[0], InputFixed);
14267 IncomingInputs[0] = InputFixed;
14268 }
14269 } else if (IncomingInputs.size() == 2) {
14270 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
14271 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
14272 // We have two non-adjacent or clobbered inputs we need to extract from
14273 // the source half. To do this, we need to map them into some adjacent
14274 // dword slot in the source mask.
14275 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
14276 IncomingInputs[1] - SourceOffset};
14277
14278 // If there is a free slot in the source half mask adjacent to one of
14279 // the inputs, place the other input in it. We use (Index XOR 1) to
14280 // compute an adjacent index.
14281 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
14282 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
14283 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
14284 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14285 InputsFixed[1] = InputsFixed[0] ^ 1;
14286 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
14287 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
14288 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
14289 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
14290 InputsFixed[0] = InputsFixed[1] ^ 1;
14291 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
14292 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
14293 // The two inputs are in the same DWord but it is clobbered and the
14294 // adjacent DWord isn't used at all. Move both inputs to the free
14295 // slot.
14296 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
14297 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
14298 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
14299 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
14300 } else {
14301 // The only way we hit this point is if there is no clobbering
14302 // (because there are no off-half inputs to this half) and there is no
14303 // free slot adjacent to one of the inputs. In this case, we have to
14304 // swap an input with a non-input.
14305 for (int i = 0; i < 4; ++i)
14306 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
14307 "We can't handle any clobbers here!");
14308 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
14309 "Cannot have adjacent inputs here!");
14310
14311 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14312 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14313
14314 // We also have to update the final source mask in this case because
14315 // it may need to undo the above swap.
14316 for (int &M : FinalSourceHalfMask)
14317 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14318 M = InputsFixed[1] + SourceOffset;
14319 else if (M == InputsFixed[1] + SourceOffset)
14320 M = (InputsFixed[0] ^ 1) + SourceOffset;
14321
14322 InputsFixed[1] = InputsFixed[0] ^ 1;
14323 }
14324
14325 // Point everything at the fixed inputs.
14326 for (int &M : HalfMask)
14327 if (M == IncomingInputs[0])
14328 M = InputsFixed[0] + SourceOffset;
14329 else if (M == IncomingInputs[1])
14330 M = InputsFixed[1] + SourceOffset;
14331
14332 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14333 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14334 }
14335 } else {
14336 llvm_unreachable("Unhandled input size!");
14337 }
14338
14339 // Now move the DWord containing the incoming inputs into its destination half.
14340 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14341 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14342 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14343 for (int &M : HalfMask)
14344 for (int Input : IncomingInputs)
14345 if (M == Input)
14346 M = FreeDWord * 2 + Input % 2;
14347 };
14348 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14349 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14350 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14351 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14352
14353 // Now enact all the shuffles we've computed to move the inputs into their
14354 // target half.
14355 if (!isNoopShuffleMask(PSHUFLMask))
14356 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14357 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14358 if (!isNoopShuffleMask(PSHUFHMask))
14359 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14360 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14361 if (!isNoopShuffleMask(PSHUFDMask))
14362 V = DAG.getBitcast(
14363 VT,
14364 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14365 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14366
14367 // At this point, each half should contain all its inputs, and we can then
14368 // just shuffle them into their final position.
14369 assert(none_of(LoMask, [](int M) { return M >= 4; }) &&
14370 "Failed to lift all the high half inputs to the low mask!");
14371 assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
14372 "Failed to lift all the low half inputs to the high mask!");
14373
14374 // Do a half shuffle for the low mask.
14375 if (!isNoopShuffleMask(LoMask))
14376 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14377 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14378
14379 // Do a half shuffle with the high mask after shifting its values down.
14380 for (int &M : HiMask)
14381 if (M >= 0)
14382 M -= 4;
14383 if (!isNoopShuffleMask(HiMask))
14384 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14385 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14386
14387 return V;
14388}
14389
14390/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14391/// blend if only one input is used.
14392static SDValue lowerShuffleAsBlendOfPSHUFBs(
14393 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14394 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14395 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14396 "Lane crossing shuffle masks not supported");
14397
14398 int NumBytes = VT.getSizeInBits() / 8;
14399 int Size = Mask.size();
14400 int Scale = NumBytes / Size;
14401
14402 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14403 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14404 V1InUse = false;
14405 V2InUse = false;
14406
14407 for (int i = 0; i < NumBytes; ++i) {
14408 int M = Mask[i / Scale];
14409 if (M < 0)
14410 continue;
14411
14412 const int ZeroMask = 0x80;
14413 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14414 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
14415 if (Zeroable[i / Scale])
14416 V1Idx = V2Idx = ZeroMask;
14417
14418 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14419 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14420 V1InUse |= (ZeroMask != V1Idx);
14421 V2InUse |= (ZeroMask != V2Idx);
14422 }
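  // For example, lowering the v8i16 mask <0,8,1,9,2,10,3,11> here scales to the
  // byte masks V1: {0,1,0x80,0x80,2,3,0x80,0x80,...} and
  // V2: {0x80,0x80,0,1,0x80,0x80,2,3,...}; PSHUFB zeroes the 0x80 lanes, so the
  // two shuffled inputs can simply be OR'd together below.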
14423
14424 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14425 if (V1InUse)
14426 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14427 DAG.getBuildVector(ShufVT, DL, V1Mask));
14428 if (V2InUse)
14429 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14430 DAG.getBuildVector(ShufVT, DL, V2Mask));
14431
14432 // If we need shuffled inputs from both, blend the two.
14433 SDValue V;
14434 if (V1InUse && V2InUse)
14435 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14436 else
14437 V = V1InUse ? V1 : V2;
14438
14439 // Cast the result back to the correct type.
14440 return DAG.getBitcast(VT, V);
14441}
14442
14443/// Generic lowering of 8-lane i16 shuffles.
14444///
14445/// This handles both single-input shuffles and combined shuffle/blends with
14446/// two inputs. The single input shuffles are immediately delegated to
14447/// a dedicated lowering routine.
14448///
14449/// The blends are lowered in one of three fundamental ways. If there are few
14450/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14451/// of the input is significantly cheaper when lowered as an interleaving of
14452/// the two inputs, try to interleave them. Otherwise, blend the low and high
14453/// halves of the inputs separately (making them have relatively few inputs)
14454/// and then concatenate them.
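/// For example, the two-input mask <0,8,1,9,2,10,3,11> corresponds exactly to
/// UNPCKLWD's interleave pattern, while a mask referencing only elements 0..7
/// is handled entirely from V1 by the single-input path.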
14455static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14456 const APInt &Zeroable, SDValue V1, SDValue V2,
14457 const X86Subtarget &Subtarget,
14458 SelectionDAG &DAG) {
14459 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14460 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14461 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14462
14463 // Whenever we can lower this as a zext, that instruction is strictly faster
14464 // than any alternative.
14465 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14466 Zeroable, Subtarget, DAG))
14467 return ZExt;
14468
14469 // Try to lower using a truncation.
14470 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14471 Subtarget, DAG))
14472 return V;
14473
14474 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14475
14476 if (NumV2Inputs == 0) {
14477 // Try to use shift instructions.
14478 if (SDValue Shift =
14479 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14480 Subtarget, DAG, /*BitwiseOnly*/ false))
14481 return Shift;
14482
14483 // Check for being able to broadcast a single element.
14484 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14485 Mask, Subtarget, DAG))
14486 return Broadcast;
14487
14488 // Try to use bit rotation instructions.
14489 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14490 Subtarget, DAG))
14491 return Rotate;
14492
14493 // Use dedicated unpack instructions for masks that match their pattern.
14494 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14495 return V;
14496
14497 // Use dedicated pack instructions for masks that match their pattern.
14498 if (SDValue V =
14499 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14500 return V;
14501
14502 // Try to use byte rotation instructions.
14503 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14504 Subtarget, DAG))
14505 return Rotate;
14506
14507 // Make a copy of the mask so it can be modified.
14508 SmallVector<int, 8> MutableMask(Mask);
14509 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14510 Subtarget, DAG);
14511 }
14512
14513 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14514 "All single-input shuffles should be canonicalized to be V1-input "
14515 "shuffles.");
14516
14517 // Try to use shift instructions.
14518 if (SDValue Shift =
14519 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14520 DAG, /*BitwiseOnly*/ false))
14521 return Shift;
14522
14523 // See if we can use SSE4A Extraction / Insertion.
14524 if (Subtarget.hasSSE4A())
14525 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14526 Zeroable, DAG))
14527 return V;
14528
14529 // There are special ways we can lower some single-element blends.
14530 if (NumV2Inputs == 1)
14531 if (SDValue V = lowerShuffleAsElementInsertion(
14532 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14533 return V;
14534
14535 // We have different paths for blend lowering, but they all must use the
14536 // *exact* same predicate.
14537 bool IsBlendSupported = Subtarget.hasSSE41();
14538 if (IsBlendSupported)
14539 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14540 Zeroable, Subtarget, DAG))
14541 return Blend;
14542
14543 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14544 Zeroable, Subtarget, DAG))
14545 return Masked;
14546
14547 // Use dedicated unpack instructions for masks that match their pattern.
14548 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14549 return V;
14550
14551 // Use dedicated pack instructions for masks that match their pattern.
14552 if (SDValue V =
14553 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14554 return V;
14555
14556 // Try to lower using a truncation.
14557 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14558 Subtarget, DAG))
14559 return V;
14560
14561 // Try to use byte rotation instructions.
14562 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14563 Subtarget, DAG))
14564 return Rotate;
14565
14566 if (SDValue BitBlend =
14567 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14568 return BitBlend;
14569
14570 // Try to use byte shift instructions to mask.
14571 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14572 Zeroable, Subtarget, DAG))
14573 return V;
14574
14575 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14576 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14577 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14578 !Subtarget.hasVLX()) {
14579 // Check if this is part of a 256-bit vector truncation.
14580 unsigned PackOpc = 0;
14581 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14584 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14585 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14586 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14587 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14588 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14589 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14590 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14591 PackOpc = X86ISD::PACKUS;
14592 } else if (Subtarget.hasSSE41()) {
14593 SmallVector<SDValue, 4> DWordClearOps(4,
14594 DAG.getConstant(0, DL, MVT::i32));
14595 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14596 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14597 SDValue DWordClearMask =
14598 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14599 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14600 DWordClearMask);
14601 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14602 DWordClearMask);
14603 PackOpc = X86ISD::PACKUS;
14604 } else if (!Subtarget.hasSSSE3()) {
14605 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14606 V1 = DAG.getBitcast(MVT::v4i32, V1);
14607 V2 = DAG.getBitcast(MVT::v4i32, V2);
14608 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14609 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14610 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14611 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14612 PackOpc = X86ISD::PACKSS;
14613 }
14614 if (PackOpc) {
14615 // Now pack things back together.
14616 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14617 if (NumEvenDrops == 2) {
14618 Result = DAG.getBitcast(MVT::v4i32, Result);
14619 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14620 }
14621 return Result;
14622 }
14623 }
14624
14625 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14626 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14627 if (NumOddDrops == 1) {
14628 bool HasSSE41 = Subtarget.hasSSE41();
14629 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14630 DAG.getBitcast(MVT::v4i32, V1),
14631 DAG.getTargetConstant(16, DL, MVT::i8));
14632 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14633 DAG.getBitcast(MVT::v4i32, V2),
14634 DAG.getTargetConstant(16, DL, MVT::i8));
14635 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14636 MVT::v8i16, V1, V2);
14637 }
14638
14639 // Try to lower by permuting the inputs into an unpack instruction.
14640 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14641 Mask, Subtarget, DAG))
14642 return Unpack;
14643
14644 // If we can't directly blend but can use PSHUFB, that will be better as it
14645 // can both shuffle and set up the inefficient blend.
14646 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14647 bool V1InUse, V2InUse;
14648 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14649 Zeroable, DAG, V1InUse, V2InUse);
14650 }
14651
14652 // We can always bit-blend if we have to, so the fallback strategy is to
14653 // decompose into single-input permutes and blends/unpacks.
14654 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14655 Zeroable, Subtarget, DAG);
14656}
14657
14658/// Lower 8-lane 16-bit floating point shuffles.
14659static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14660 const APInt &Zeroable, SDValue V1, SDValue V2,
14661 const X86Subtarget &Subtarget,
14662 SelectionDAG &DAG) {
14663 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14664 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14665 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14666 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14667
14668 if (Subtarget.hasFP16()) {
14669 if (NumV2Elements == 0) {
14670 // Check for being able to broadcast a single element.
14671 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14672 Mask, Subtarget, DAG))
14673 return Broadcast;
14674 }
14675 if (NumV2Elements == 1 && Mask[0] >= 8)
14676 if (SDValue V = lowerShuffleAsElementInsertion(
14677 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14678 return V;
14679 }
14680
14681 V1 = DAG.getBitcast(MVT::v8i16, V1);
14682 V2 = DAG.getBitcast(MVT::v8i16, V2);
14683 return DAG.getBitcast(MVT::v8f16,
14684 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14685}
14686
14687// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14688// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14689// the active subvector is extracted.
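// For example, a v16i8 shuffle on a non-VLX target is widened to v64i8: both
// inputs are padded to 512 bits, mask entries referring to V2 are rebased by
// (Scale - 1) * NumElts == 48, and the low 128 bits of the VPERMV3 result are
// extracted at the end.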
14690static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14691 ArrayRef<int> OriginalMask, SDValue V1,
14692 SDValue V2, const X86Subtarget &Subtarget,
14693 SelectionDAG &DAG) {
14694 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14695 SmallVector<int, 32> Mask(OriginalMask);
14696 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14697 !isShuffleFoldableLoad(V2)) {
14698 ShuffleVectorSDNode::commuteMask(Mask);
14699 std::swap(V1, V2);
14700 }
14701
14702 MVT MaskVT = VT.changeTypeToInteger();
14703 SDValue MaskNode;
14704 MVT ShuffleVT = VT;
14705 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14706 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14707 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14708 ShuffleVT = V1.getSimpleValueType();
14709
14710 // Adjust mask to correct indices for the second input.
14711 int NumElts = VT.getVectorNumElements();
14712 unsigned Scale = 512 / VT.getSizeInBits();
14713 SmallVector<int, 32> AdjustedMask(Mask);
14714 for (int &M : AdjustedMask)
14715 if (NumElts <= M)
14716 M += (Scale - 1) * NumElts;
14717 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14718 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14719 } else {
14720 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14721 }
14722
14723 SDValue Result;
14724 if (V2.isUndef())
14725 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14726 else
14727 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14728
14729 if (VT != ShuffleVT)
14730 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14731
14732 return Result;
14733}
14734
14735/// Generic lowering of v16i8 shuffles.
14736///
14737/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14738/// detect any complexity-reducing interleaving. If that doesn't help, it uses
14739/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14740/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14741/// back together.
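/// For example, a splat of one byte is caught by the broadcast lowering below,
/// and an even-byte compaction such as <0,2,4,...,30> is handled by the
/// PACKUS(AND(),AND()) path described in the body.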
14742static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14743 const APInt &Zeroable, SDValue V1, SDValue V2,
14744 const X86Subtarget &Subtarget,
14745 SelectionDAG &DAG) {
14746 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14747 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14748 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14749
14750 // Try to use shift instructions.
14751 if (SDValue Shift =
14752 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14753 DAG, /*BitwiseOnly*/ false))
14754 return Shift;
14755
14756 // Try to use byte rotation instructions.
14757 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14758 Subtarget, DAG))
14759 return Rotate;
14760
14761 // Use dedicated pack instructions for masks that match their pattern.
14762 if (SDValue V =
14763 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14764 return V;
14765
14766 // Try to use a zext lowering.
14767 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14768 Zeroable, Subtarget, DAG))
14769 return ZExt;
14770
14771 // Try to lower using a truncation.
14772 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14773 Subtarget, DAG))
14774 return V;
14775
14776 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14777 Subtarget, DAG))
14778 return V;
14779
14780 // See if we can use SSE4A Extraction / Insertion.
14781 if (Subtarget.hasSSE4A())
14782 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14783 Zeroable, DAG))
14784 return V;
14785
14786 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14787
14788 // For single-input shuffles, there are some nicer lowering tricks we can use.
14789 if (NumV2Elements == 0) {
14790 // Check for being able to broadcast a single element.
14791 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14792 Mask, Subtarget, DAG))
14793 return Broadcast;
14794
14795 // Try to use bit rotation instructions.
14796 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14797 Subtarget, DAG))
14798 return Rotate;
14799
14800 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14801 return V;
14802
14803 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14804 // Notably, this handles splat and partial-splat shuffles more efficiently.
14805 // However, it only makes sense if the pre-duplication shuffle simplifies
14806 // things significantly. Currently, this means we need to be able to
14807 // express the pre-duplication shuffle as an i16 shuffle.
14808 //
14809 // FIXME: We should check for other patterns which can be widened into an
14810 // i16 shuffle as well.
14811 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14812 for (int i = 0; i < 16; i += 2)
14813 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14814 return false;
14815
14816 return true;
14817 };
14818 auto tryToWidenViaDuplication = [&]() -> SDValue {
14819 if (!canWidenViaDuplication(Mask))
14820 return SDValue();
14821 SmallVector<int, 4> LoInputs;
14822 copy_if(Mask, std::back_inserter(LoInputs),
14823 [](int M) { return M >= 0 && M < 8; });
14824 array_pod_sort(LoInputs.begin(), LoInputs.end());
14825 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14826 SmallVector<int, 4> HiInputs;
14827 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14828 array_pod_sort(HiInputs.begin(), HiInputs.end());
14829 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14830
14831 bool TargetLo = LoInputs.size() >= HiInputs.size();
14832 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14833 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14834
14835 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14836 SmallDenseMap<int, int, 8> LaneMap;
14837 for (int I : InPlaceInputs) {
14838 PreDupI16Shuffle[I/2] = I/2;
14839 LaneMap[I] = I;
14840 }
14841 int j = TargetLo ? 0 : 4, je = j + 4;
14842 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14843 // Check if j is already a shuffle of this input. This happens when
14844 // there are two adjacent bytes after we move the low one.
14845 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14846 // If we haven't yet mapped the input, search for a slot into which
14847 // we can map it.
14848 while (j < je && PreDupI16Shuffle[j] >= 0)
14849 ++j;
14850
14851 if (j == je)
14852 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14853 return SDValue();
14854
14855 // Map this input with the i16 shuffle.
14856 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14857 }
14858
14859 // Update the lane map based on the mapping we ended up with.
14860 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14861 }
14862 V1 = DAG.getBitcast(
14863 MVT::v16i8,
14864 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14865 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14866
14867 // Unpack the bytes to form the i16s that will be shuffled into place.
14868 bool EvenInUse = false, OddInUse = false;
14869 for (int i = 0; i < 16; i += 2) {
14870 EvenInUse |= (Mask[i + 0] >= 0);
14871 OddInUse |= (Mask[i + 1] >= 0);
14872 if (EvenInUse && OddInUse)
14873 break;
14874 }
14875 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14876 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14877 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14878
14879 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14880 for (int i = 0; i < 16; ++i)
14881 if (Mask[i] >= 0) {
14882 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14883 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14884 if (PostDupI16Shuffle[i / 2] < 0)
14885 PostDupI16Shuffle[i / 2] = MappedMask;
14886 else
14887 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14888 "Conflicting entries in the original shuffle!");
14889 }
14890 return DAG.getBitcast(
14891 MVT::v16i8,
14892 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14893 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14894 };
14895 if (SDValue V = tryToWidenViaDuplication())
14896 return V;
14897 }
14898
14899 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14900 Zeroable, Subtarget, DAG))
14901 return Masked;
14902
14903 // Use dedicated unpack instructions for masks that match their pattern.
14904 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14905 return V;
14906
14907 // Try to use byte shift instructions to mask.
14908 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14909 Zeroable, Subtarget, DAG))
14910 return V;
14911
14912 // Check for compaction patterns.
14913 bool IsSingleInput = V2.isUndef();
14914 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14915
14916 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14917 // with PSHUFB. It is important to do this before we attempt to generate any
14918 // blends but after all of the single-input lowerings. If the single input
14919 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14920 // want to preserve that and we can DAG combine any longer sequences into
14921 // a PSHUFB in the end. But once we start blending from multiple inputs,
14922 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14923 // and there are *very* few patterns that would actually be faster than the
14924 // PSHUFB approach because of its ability to zero lanes.
14925 //
14926 // If the mask is a binary compaction, we can more efficiently perform this
14927 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14928 //
14929 // FIXME: The only exceptions to the above are blends which are exact
14930 // interleavings with direct instructions supporting them. We currently don't
14931 // handle those well here.
14932 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14933 bool V1InUse = false;
14934 bool V2InUse = false;
14935
14936 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14937 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14938
14939 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14940 // do so. This avoids using them to handle blends-with-zero which is
14941 // important as a single pshufb is significantly faster for that.
14942 if (V1InUse && V2InUse) {
14943 if (Subtarget.hasSSE41())
14944 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14945 Zeroable, Subtarget, DAG))
14946 return Blend;
14947
14948 // We can use an unpack to do the blending rather than an or in some
14949 // cases. Even though the or may be (very minorly) more efficient, we
14950 // prefer this lowering because there are common cases where part of
14951 // the complexity of the shuffles goes away when we do the final blend as
14952 // an unpack.
14953 // FIXME: It might be worth trying to detect if the unpack-feeding
14954 // shuffles will both be pshufb, in which case we shouldn't bother with
14955 // this.
14956 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14957 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14958 return Unpack;
14959
14960 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14961 if (Subtarget.hasVBMI())
14962 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14963 DAG);
14964
14965 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14966 if (Subtarget.hasXOP()) {
14967 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14968 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14969 }
14970
14971 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14972 // PALIGNR will be cheaper than the second PSHUFB+OR.
14973 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14974 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14975 return V;
14976 }
14977
14978 return PSHUFB;
14979 }
14980
14981 // There are special ways we can lower some single-element blends.
14982 if (NumV2Elements == 1)
14983 if (SDValue V = lowerShuffleAsElementInsertion(
14984 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14985 return V;
14986
14987 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14988 return Blend;
14989
14990 // Check whether a compaction lowering can be done. This handles shuffles
14991 // which take every Nth element for some even N. See the helper function for
14992 // details.
14993 //
14994 // We special case these as they can be particularly efficiently handled with
14995 // the PACKUSWB instruction on x86 and they show up in common patterns of
14996 // rearranging bytes to truncate wide elements.
14997 if (NumEvenDrops) {
14998 // NumEvenDrops is the power of two stride of the elements. Another way of
14999 // thinking about it is that we need to drop the even elements this many
15000 // times to get the original input.
15001
15002 // First we need to zero all the dropped bytes.
15003 assert(NumEvenDrops <= 3 &&
15004 "No support for dropping even elements more than 3 times.");
15005 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
15006 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
15007 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
15008 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
15009 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
15010 WordClearMask);
15011 if (!IsSingleInput)
15012 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
15013 WordClearMask);
15014
15015 // Now pack things back together.
15016 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15017 IsSingleInput ? V1 : V2);
15018 for (int i = 1; i < NumEvenDrops; ++i) {
15019 Result = DAG.getBitcast(MVT::v8i16, Result);
15020 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
15021 }
15022 return Result;
15023 }
15024
15025 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
15026 if (NumOddDrops == 1) {
15027 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15028 DAG.getBitcast(MVT::v8i16, V1),
15029 DAG.getTargetConstant(8, DL, MVT::i8));
15030 if (!IsSingleInput)
15031 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
15032 DAG.getBitcast(MVT::v8i16, V2),
15033 DAG.getTargetConstant(8, DL, MVT::i8));
15034 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
15035 IsSingleInput ? V1 : V2);
15036 }
15037
15038 // Handle multi-input cases by blending/unpacking single-input shuffles.
15039 if (NumV2Elements > 0)
15040 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
15041 Zeroable, Subtarget, DAG);
15042
15043 // The fallback path for single-input shuffles widens this into two v8i16
15044 // vectors with unpacks, shuffles those, and then pulls them back together
15045 // with a pack.
15046 SDValue V = V1;
15047
15048 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15049 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
15050 for (int i = 0; i < 16; ++i)
15051 if (Mask[i] >= 0)
15052 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
15053
15054 SDValue VLoHalf, VHiHalf;
15055 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
15056 // them out and avoid using UNPCK{L,H} to extract the elements of V as
15057 // i16s.
15058 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
15059 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
15060 // Use a mask to drop the high bytes.
15061 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
15062 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
15063 DAG.getConstant(0x00FF, DL, MVT::v8i16));
15064
15065 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
15066 VHiHalf = DAG.getUNDEF(MVT::v8i16);
15067
15068 // Squash the masks to point directly into VLoHalf.
15069 for (int &M : LoBlendMask)
15070 if (M >= 0)
15071 M /= 2;
15072 for (int &M : HiBlendMask)
15073 if (M >= 0)
15074 M /= 2;
15075 } else {
15076 // Otherwise just unpack the low half of V into VLoHalf and the high half into
15077 // VHiHalf so that we can blend them as i16s.
15078 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
15079
15080 VLoHalf = DAG.getBitcast(
15081 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
15082 VHiHalf = DAG.getBitcast(
15083 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
15084 }
15085
15086 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
15087 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
15088
15089 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
15090}
15091
15092/// Dispatching routine to lower various 128-bit x86 vector shuffles.
15093///
15094/// This routine breaks down the specific type of 128-bit shuffle and
15095/// dispatches to the lowering routines accordingly.
15096static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
15097 MVT VT, SDValue V1, SDValue V2,
15098 const APInt &Zeroable,
15099 const X86Subtarget &Subtarget,
15100 SelectionDAG &DAG) {
15101 if (VT == MVT::v8bf16) {
15102 V1 = DAG.getBitcast(MVT::v8i16, V1);
15103 V2 = DAG.getBitcast(MVT::v8i16, V2);
15104 return DAG.getBitcast(VT,
15105 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
15106 }
15107
15108 switch (VT.SimpleTy) {
15109 case MVT::v2i64:
15110 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15111 case MVT::v2f64:
15112 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15113 case MVT::v4i32:
15114 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15115 case MVT::v4f32:
15116 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15117 case MVT::v8i16:
15118 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15119 case MVT::v8f16:
15120 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15121 case MVT::v16i8:
15122 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
15123
15124 default:
15125 llvm_unreachable("Unimplemented!");
15126 }
15127}
15128
15129/// Generic routine to split vector shuffle into half-sized shuffles.
15130///
15131/// This routine just extracts two subvectors, shuffles them independently, and
15132/// then concatenates them back together. This should work effectively with all
15133/// AVX vector shuffle types.
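/// For example, a v4f64 shuffle is lowered as two v2f64 shuffles of the low
/// and high halves whose results are rejoined with CONCAT_VECTORS.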
15134static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
15135 SDValue V2, ArrayRef<int> Mask,
15136 SelectionDAG &DAG, bool SimpleOnly) {
15137 assert(VT.getSizeInBits() >= 256 &&
15138 "Only for 256-bit or wider vector shuffles!");
15139 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
15140 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
15141
15142 // If this came from the AVX1 v8i32 -> v8f32 bitcast, split using v4i32.
15143 if (VT == MVT::v8f32) {
15144 SDValue BC1 = peekThroughBitcasts(V1);
15145 SDValue BC2 = peekThroughBitcasts(V2);
15146 if (BC1.getValueType() == MVT::v8i32 && BC2.getValueType() == MVT::v8i32) {
15147 if (SDValue Split = splitAndLowerShuffle(DL, MVT::v8i32, BC1, BC2, Mask,
15148 DAG, SimpleOnly))
15149 return DAG.getBitcast(VT, Split);
15150 }
15151 }
15152
15153 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
15154 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
15155
15156 int NumElements = VT.getVectorNumElements();
15157 int SplitNumElements = NumElements / 2;
15158 MVT ScalarVT = VT.getVectorElementType();
15159 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
15160
15161 // Use splitVector/extractSubVector so that split build-vectors just build two
15162 // narrower build vectors. This helps shuffling with splats and zeros.
15163 auto SplitVector = [&](SDValue V) {
15164 SDValue LoV, HiV;
15165 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
15166 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
15167 DAG.getBitcast(SplitVT, HiV));
15168 };
15169
15170 SDValue LoV1, HiV1, LoV2, HiV2;
15171 std::tie(LoV1, HiV1) = SplitVector(V1);
15172 std::tie(LoV2, HiV2) = SplitVector(V2);
15173
15174 // Now create two 4-way blends of these half-width vectors.
15175 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
15176 bool &UseHiV1, bool &UseLoV2,
15177 bool &UseHiV2) {
15178 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
15179 for (int i = 0; i < SplitNumElements; ++i) {
15180 int M = HalfMask[i];
15181 if (M >= NumElements) {
15182 if (M >= NumElements + SplitNumElements)
15183 UseHiV2 = true;
15184 else
15185 UseLoV2 = true;
15186 } else if (M >= 0) {
15187 if (M >= SplitNumElements)
15188 UseHiV1 = true;
15189 else
15190 UseLoV1 = true;
15191 }
15192 }
15193 };
15194
15195 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
15196 if (!SimpleOnly)
15197 return true;
15198
15199 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15200 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15201
15202 return !(UseHiV1 || UseHiV2);
15203 };
15204
15205 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
15206 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
15207 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
15208 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
15209 for (int i = 0; i < SplitNumElements; ++i) {
15210 int M = HalfMask[i];
15211 if (M >= NumElements) {
15212 V2BlendMask[i] = M - NumElements;
15213 BlendMask[i] = SplitNumElements + i;
15214 } else if (M >= 0) {
15215 V1BlendMask[i] = M;
15216 BlendMask[i] = i;
15217 }
15218 }
15219
15220 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
15221 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
15222
15223 // Because the lowering happens after all combining takes place, we need to
15224 // manually combine these blend masks as much as possible so that we create
15225 // a minimal number of high-level vector shuffle nodes.
15226 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
15227
15228 // First try just blending the halves of V1 or V2.
15229 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
15230 return DAG.getUNDEF(SplitVT);
15231 if (!UseLoV2 && !UseHiV2)
15232 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15233 if (!UseLoV1 && !UseHiV1)
15234 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15235
15236 SDValue V1Blend, V2Blend;
15237 if (UseLoV1 && UseHiV1) {
15238 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
15239 } else {
15240 // We only use half of V1 so map the usage down into the final blend mask.
15241 V1Blend = UseLoV1 ? LoV1 : HiV1;
15242 for (int i = 0; i < SplitNumElements; ++i)
15243 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
15244 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
15245 }
15246 if (UseLoV2 && UseHiV2) {
15247 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
15248 } else {
15249 // We only use half of V2 so map the usage down into the final blend mask.
15250 V2Blend = UseLoV2 ? LoV2 : HiV2;
15251 for (int i = 0; i < SplitNumElements; ++i)
15252 if (BlendMask[i] >= SplitNumElements)
15253 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
15254 }
15255 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
15256 };
15257
15258 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
15259 return SDValue();
15260
15261 SDValue Lo = HalfBlend(LoMask);
15262 SDValue Hi = HalfBlend(HiMask);
15263 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
15264}
15265
15266/// Either split a vector into halves or decompose the shuffles into
15267/// single-input permutes and a blend/unpack.
15268///
15269/// This is provided as a good fallback for many lowerings of non-single-input
15270/// shuffles with more than one 128-bit lane. In those cases, we want to select
15271/// between splitting the shuffle into 128-bit components and stitching those
15272/// back together vs. extracting the single-input shuffles and blending those
15273/// results.
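/// For example, when both inputs are (or are equivalent to) broadcasts, the
/// shuffle is decomposed into two single-input shuffles plus a blend/unpack,
/// whereas inputs that each come from a single 128-bit lane are split into
/// half-width shuffles instead.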
15274static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
15275 SDValue V2, ArrayRef<int> Mask,
15276 const APInt &Zeroable,
15277 const X86Subtarget &Subtarget,
15278 SelectionDAG &DAG) {
15279 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
15280 "shuffles as it could then recurse on itself.");
15281 int Size = Mask.size();
15282
15283 // If this can be modeled as a broadcast of two elements followed by a blend,
15284 // prefer that lowering. This is especially important because broadcasts can
15285 // often fold with memory operands.
15286 auto DoBothBroadcast = [&] {
15287 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
15288 for (int M : Mask)
15289 if (M >= Size) {
15290 if (V2BroadcastIdx < 0)
15291 V2BroadcastIdx = M - Size;
15292 else if ((M - Size) != V2BroadcastIdx &&
15293 !IsElementEquivalent(Size, V2, V2, M - Size, V2BroadcastIdx))
15294 return false;
15295 } else if (M >= 0) {
15296 if (V1BroadcastIdx < 0)
15297 V1BroadcastIdx = M;
15298 else if (M != V1BroadcastIdx &&
15299 !IsElementEquivalent(Size, V1, V1, M, V1BroadcastIdx))
15300 return false;
15301 }
15302 return true;
15303 };
15304 if (DoBothBroadcast())
15305 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15306 Subtarget, DAG);
15307
15308 // If the inputs all stem from a single 128-bit lane of each input, then we
15309 // split them rather than blending because the split will decompose to
15310 // unusually few instructions.
15311 int LaneCount = VT.getSizeInBits() / 128;
15312 int LaneSize = Size / LaneCount;
15313 SmallBitVector LaneInputs[2];
15314 LaneInputs[0].resize(LaneCount, false);
15315 LaneInputs[1].resize(LaneCount, false);
15316 for (int i = 0; i < Size; ++i)
15317 if (Mask[i] >= 0)
15318 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
15319 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
15320 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15321 /*SimpleOnly*/ false);
15322
15323 // Without AVX2, if we can freely split the subvectors then we're better off
15324 // performing half width shuffles.
15325 if (!Subtarget.hasAVX2()) {
15326 SDValue BC1 = peekThroughBitcasts(V1);
15327 SDValue BC2 = peekThroughBitcasts(V2);
15328 bool SplatOrSplitV1 = isFreeToSplitVector(BC1, DAG) ||
15329 DAG.isSplatValue(BC1, /*AllowUndefs=*/true);
15330 bool SplatOrSplitV2 = isFreeToSplitVector(BC2, DAG) ||
15331 DAG.isSplatValue(BC2, /*AllowUndefs=*/true);
15332 if (SplatOrSplitV1 && SplatOrSplitV2)
15333 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15334 /*SimpleOnly*/ false);
15335 }
15336
15337 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15338 // requires that the decomposed single-input shuffles don't end up here.
15339 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15340 Subtarget, DAG);
15341}
15342
15343// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15344// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15345static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15346 SDValue V1, SDValue V2,
15347 ArrayRef<int> Mask,
15348 SelectionDAG &DAG) {
15349 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15350
15351 int LHSMask[4] = {-1, -1, -1, -1};
15352 int RHSMask[4] = {-1, -1, -1, -1};
15353 int SHUFPDMask[4] = {-1, -1, -1, -1};
15354
15355 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15356 // perform the shuffle once the lanes have been shuffled in place.
15357 for (int i = 0; i != 4; ++i) {
15358 int M = Mask[i];
15359 if (M < 0)
15360 continue;
15361 int LaneBase = i & ~1;
15362 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15363 LaneMask[LaneBase + (M & 1)] = M;
15364 SHUFPDMask[i] = M & 1;
15365 }
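  // For example, the v4f64 mask <1,4,3,6> produces LHSMask <-1,1,-1,3>,
  // RHSMask <4,-1,6,-1> and SHUFPDMask {1,0,1,0} (immediate 0b0101): the SHUFP
  // takes the odd element of each LHS lane and the even element of each RHS
  // lane.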
15366
15367 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15368 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15369 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15370 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15371}
15372
15373/// Lower a vector shuffle crossing multiple 128-bit lanes as
15374/// a lane permutation followed by a per-lane permutation.
15375///
15376/// This is mainly for cases where we can have non-repeating permutes
15377/// in each lane.
15378///
15379/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15380/// we should investigate merging them.
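/// For example, on AVX2 a v16i16 shuffle whose elements only need to reach the
/// correct 128-bit lane can often be lowered as a 64-bit sublane permute
/// (VPERMQ) followed by an in-lane permute, avoiding a full variable
/// cross-lane shuffle.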
15381static SDValue lowerShuffleAsLanePermuteAndPermute(
15382 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15383 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15384 int NumElts = VT.getVectorNumElements();
15385 int NumLanes = VT.getSizeInBits() / 128;
15386 int NumEltsPerLane = NumElts / NumLanes;
15387 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15388
15389 /// Attempts to find a sublane permute with the given size
15390 /// that gets all elements into their target lanes.
15391 ///
15392 /// If successful, fills CrossLaneMask and InLaneMask and returns the new
15393 /// shuffle; if unsuccessful, returns SDValue() and may overwrite InLaneMask.
15394 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15395 int NumSublanesPerLane = NumSublanes / NumLanes;
15396 int NumEltsPerSublane = NumElts / NumSublanes;
15397
15398 SmallVector<int, 16> CrossLaneMask;
15399 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15400 // Like CrossLaneMask, but with one entry per sublane.
15401 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15402 APInt DemandedCrossLane = APInt::getZero(NumElts);
15403
15404 for (int i = 0; i != NumElts; ++i) {
15405 int M = Mask[i];
15406 if (M < 0)
15407 continue;
15408
15409 int SrcSublane = M / NumEltsPerSublane;
15410 int DstLane = i / NumEltsPerLane;
15411
15412 // We only need to get the elements into the right lane, not sublane.
15413 // So search all sublanes that make up the destination lane.
15414 bool Found = false;
15415 int DstSubStart = DstLane * NumSublanesPerLane;
15416 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15417 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15418 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15419 continue;
15420
15421 Found = true;
15422 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15423 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15424 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15425 DemandedCrossLane.setBit(InLaneMask[i]);
15426 break;
15427 }
15428 if (!Found)
15429 return SDValue();
15430 }
15431
15432 // Fill CrossLaneMask using CrossLaneMaskLarge.
15433 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15434
15435 if (!CanUseSublanes) {
15436 // If we're only shuffling a single lowest lane and the rest are identity
15437 // then don't bother.
15438 // TODO - isShuffleMaskInputInPlace could be extended to something like
15439 // this.
15440 int NumIdentityLanes = 0;
15441 bool OnlyShuffleLowestLane = true;
15442 for (int i = 0; i != NumLanes; ++i) {
15443 int LaneOffset = i * NumEltsPerLane;
15444 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15445 i * NumEltsPerLane))
15446 NumIdentityLanes++;
15447 else if (CrossLaneMask[LaneOffset] != 0)
15448 OnlyShuffleLowestLane = false;
15449 }
15450 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15451 return SDValue();
15452 }
15453
15454 // Simplify CrossLaneMask based on the actual demanded elements.
15455 if (V1.hasOneUse())
15456 for (int i = 0; i != NumElts; ++i)
15457 if (!DemandedCrossLane[i])
15458 CrossLaneMask[i] = SM_SentinelUndef;
15459
15460 // Avoid returning the same shuffle operation. For example,
15461 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15462 // undef:v16i16
15463 if (CrossLaneMask == Mask || InLaneMask == Mask)
15464 return SDValue();
15465
15466 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15467 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15468 InLaneMask);
15469 };
15470
15471 // First attempt a solution with full lanes.
15472 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15473 return V;
15474
15475 // The rest of the solutions use sublanes.
15476 if (!CanUseSublanes)
15477 return SDValue();
15478
15479 // Then attempt a solution with 64-bit sublanes (vpermq).
15480 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15481 return V;
15482
15483 // If that doesn't work and we have fast variable cross-lane shuffle,
15484 // attempt 32-bit sublanes (vpermd).
15485 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15486 return SDValue();
15487
15488 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15489}
15490
15491/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
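/// For example, with LaneSize == 4 the cross-lane mask <4,5,6,7,0,1,2,3>
/// becomes <8,9,10,11,12,13,14,15>: every element is redirected to the second
/// shuffle operand (the lane-swapped copy) at its in-lane position.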
15492static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15493 SmallVector<int> &InLaneMask) {
15494 int Size = Mask.size();
15495 InLaneMask.assign(Mask.begin(), Mask.end());
15496 for (int i = 0; i < Size; ++i) {
15497 int &M = InLaneMask[i];
15498 if (M < 0)
15499 continue;
15500 if (((M % Size) / LaneSize) != (i / LaneSize))
15501 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15502 }
15503}
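// Worked example of the remapping above, as a standalone compile-time check
// (illustrative sketch only, independent of the lowering code): for a v8f32
// mask (Size = 8, LaneSize = 4), source element 6 read into destination slot 0
// crosses lanes, so it is redirected to the same offset within slot 0's lane
// of a second, lane-swapped operand: (6 % 4) + (0 / 4) * 4 + 8 == 10.
static_assert((6 % 4) + (0 / 4) * 4 + 8 == 10,
              "cross-lane element remapped into the lane-swapped operand");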
15504
15505/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15506/// source with a lane permutation.
15507///
15508/// This lowering strategy results in four instructions in the worst case for a
15509/// single-input cross lane shuffle which is lower than any other fully general
15510/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15511/// shuffle pattern should be handled prior to trying this lowering.
15512 static SDValue lowerShuffleAsLanePermuteAndShuffle(
15513 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15514 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15515 // FIXME: This should probably be generalized for 512-bit vectors as well.
15516 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15517 int Size = Mask.size();
15518 int LaneSize = Size / 2;
15519
15520 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15521 // Only do this if the elements aren't all from the lower lane,
15522 // otherwise we're (probably) better off doing a split.
15523 if (VT == MVT::v4f64 &&
15524 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15525 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15526
15527 // If there are only inputs from one 128-bit lane, splitting will in fact be
15528 // less expensive. The flags track whether the given lane contains an element
15529 // that crosses to another lane.
15530 bool AllLanes;
15531 if (!Subtarget.hasAVX2()) {
15532 bool LaneCrossing[2] = {false, false};
15533 for (int i = 0; i < Size; ++i)
15534 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15535 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15536 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15537 } else {
15538 bool LaneUsed[2] = {false, false};
15539 for (int i = 0; i < Size; ++i)
15540 if (Mask[i] >= 0)
15541 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15542 AllLanes = LaneUsed[0] && LaneUsed[1];
15543 }
15544
15545 // TODO - we could support shuffling V2 in the Flipped input.
15546 assert(V2.isUndef() &&
15547 "This last part of this routine only works on single input shuffles");
15548
15549 SmallVector<int> InLaneMask;
15550 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15551
15552 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15553 "In-lane shuffle mask expected");
15554
15555 // If the shuffle doesn't use both 128-bit lanes and the in-lane mask is not
15556 // repeating, then we're better off splitting.
15557 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15558 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15559 /*SimpleOnly*/ false);
15560
15561 // Flip the lanes, and shuffle the results which should now be in-lane.
15562 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15563 SDValue Flipped = DAG.getBitcast(PVT, V1);
15564 Flipped =
15565 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15566 Flipped = DAG.getBitcast(VT, Flipped);
15567 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15568}
15569
15570/// Handle lowering 2-lane 128-bit shuffles.
15571 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15572 SDValue V2, ArrayRef<int> Mask,
15573 const APInt &Zeroable,
15574 const X86Subtarget &Subtarget,
15575 SelectionDAG &DAG) {
15576 if (V2.isUndef()) {
15577 // Attempt to match VBROADCAST*128 subvector broadcast load.
15578 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15579 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15580 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15582 MVT MemVT = VT.getHalfNumVectorElementsVT();
15583 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15586 VT, MemVT, Ld, Ofs, DAG))
15587 return BcstLd;
15588 }
15589
15590 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15591 if (Subtarget.hasAVX2())
15592 return SDValue();
15593 }
15594
15595 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15596
15597 SmallVector<int, 4> WidenedMask;
15598 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15599 return SDValue();
15600
15601 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15602 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15603
15604 // Try to use an insert into a zero vector.
15605 if (WidenedMask[0] == 0 && IsHighZero) {
15606 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15607 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15608 DAG.getVectorIdxConstant(0, DL));
15609 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15610 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15611 DAG.getVectorIdxConstant(0, DL));
15612 }
15613
15614 // TODO: If minimizing size and one of the inputs is a zero vector and
15615 // the zero vector has only one use, we could use a VPERM2X128 to save the
15616 // instruction bytes needed to explicitly generate the zero vector.
15617
15618 // Blends are faster and handle all the non-lane-crossing cases.
15619 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15620 Subtarget, DAG))
15621 return Blend;
15622
15623 // If either input operand is a zero vector, use VPERM2X128 because its mask
15624 // allows us to replace the zero input with an implicit zero.
15625 if (!IsLowZero && !IsHighZero) {
15626 // Check for patterns which can be matched with a single insert of a 128-bit
15627 // subvector.
15628 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15629 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15630
15631 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15632 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15633 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15634 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15635 SDValue SubVec =
15636 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15637 DAG.getVectorIdxConstant(0, DL));
15638 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15639 DAG.getVectorIdxConstant(2, DL));
15640 }
15641 }
15642
15643 // Try to use SHUF128 if possible.
15644 if (Subtarget.hasVLX()) {
15645 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15646 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15647 ((WidenedMask[1] % 2) << 1);
15648 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15649 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15650 }
15651 }
15652 }
15653
15654 // Otherwise form a 128-bit permutation. After accounting for undefs,
15655 // convert the 64-bit shuffle mask selection values into 128-bit
15656 // selection bits by dividing the indexes by 2 and shifting into positions
15657 // defined by a vperm2*128 instruction's immediate control byte.
15658
15659 // The immediate permute control byte looks like this:
15660 // [1:0] - select 128 bits from sources for low half of destination
15661 // [2] - ignore
15662 // [3] - zero low half of destination
15663 // [5:4] - select 128 bits from sources for high half of destination
15664 // [6] - ignore
15665 // [7] - zero high half of destination
15666
15667 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15668 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15669
15670 unsigned PermMask = 0;
15671 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15672 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15673
15674 // Check the immediate mask and replace unused sources with undef.
15675 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15676 V1 = DAG.getUNDEF(VT);
15677 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15678 V2 = DAG.getUNDEF(VT);
15679
15680 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15681 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15682}
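// A minimal standalone sketch of the VPERM2X128 immediate packing performed
// above; the constexpr helper below is illustrative only and is not part of
// this file's API. Selector values 0-3 pick V1.lo/V1.hi/V2.lo/V2.hi, and bits
// 3/7 force the corresponding half of the result to zero.
static constexpr unsigned sketchVPerm2X128Imm(unsigned WidenedLo,
                                              unsigned WidenedHi,
                                              bool IsLowZero, bool IsHighZero) {
  return (IsLowZero ? 0x08 : (WidenedLo << 0)) |  // [1:0] select, [3] zero low.
         (IsHighZero ? 0x80 : (WidenedHi << 4));  // [5:4] select, [7] zero high.
}
// e.g. the single-input mask <2,3,0,1> widens to {1,0} and packs to 0x01 (swap
// the 128-bit halves); <0,1,zero,zero> keeps the low half and zeroes the high.
static_assert(sketchVPerm2X128Imm(1, 0, false, false) == 0x01 &&
                  sketchVPerm2X128Imm(0, 0, false, true) == 0x80,
              "VPERM2X128 immediates for two simple widened masks");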
15683
15684/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15685/// shuffling each lane.
15686///
15687/// This attempts to create a repeated lane shuffle where each lane uses one
15688/// or two of the lanes of the inputs. The lanes of the input vectors are
15689/// shuffled in one or two independent shuffles to get the lanes into the
15690/// position needed by the final shuffle.
15691 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15692 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15693 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15694 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15695
15696 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15697 return SDValue();
15698
15699 int NumElts = Mask.size();
15700 int NumLanes = VT.getSizeInBits() / 128;
15701 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15702 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15703 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15704
15705 // First pass will try to fill in the RepeatMask from lanes that need two
15706 // sources.
15707 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15708 int Srcs[2] = {-1, -1};
15709 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15710 for (int i = 0; i != NumLaneElts; ++i) {
15711 int M = Mask[(Lane * NumLaneElts) + i];
15712 if (M < 0)
15713 continue;
15714 // Determine which of the possible input lanes (NumLanes from each source)
15715 // this element comes from. Assign that as one of the sources for this
15716 // lane. We can assign up to 2 sources for this lane. If we run out
15717 // of sources we can't do anything.
15718 int LaneSrc = M / NumLaneElts;
15719 int Src;
15720 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15721 Src = 0;
15722 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15723 Src = 1;
15724 else
15725 return SDValue();
15726
15727 Srcs[Src] = LaneSrc;
15728 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15729 }
15730
15731 // If this lane has two sources, see if it fits with the repeat mask so far.
15732 if (Srcs[1] < 0)
15733 continue;
15734
15735 LaneSrcs[Lane][0] = Srcs[0];
15736 LaneSrcs[Lane][1] = Srcs[1];
15737
15738 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15739 assert(M1.size() == M2.size() && "Unexpected mask size");
15740 for (int i = 0, e = M1.size(); i != e; ++i)
15741 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15742 return false;
15743 return true;
15744 };
15745
15746 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15747 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15748 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15749 int M = Mask[i];
15750 if (M < 0)
15751 continue;
15752 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15753 "Unexpected mask element");
15754 MergedMask[i] = M;
15755 }
15756 };
15757
15758 if (MatchMasks(InLaneMask, RepeatMask)) {
15759 // Merge this lane mask into the final repeat mask.
15760 MergeMasks(InLaneMask, RepeatMask);
15761 continue;
15762 }
15763
15764 // Didn't find a match. Swap the operands and try again.
15765 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15767
15768 if (MatchMasks(InLaneMask, RepeatMask)) {
15769 // Merge this lane mask into the final repeat mask.
15770 MergeMasks(InLaneMask, RepeatMask);
15771 continue;
15772 }
15773
15774 // Couldn't find a match with the operands in either order.
15775 return SDValue();
15776 }
15777
15778 // Now handle any lanes with only one source.
15779 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15780 // If this lane has already been processed, skip it.
15781 if (LaneSrcs[Lane][0] >= 0)
15782 continue;
15783
15784 for (int i = 0; i != NumLaneElts; ++i) {
15785 int M = Mask[(Lane * NumLaneElts) + i];
15786 if (M < 0)
15787 continue;
15788
15789 // If RepeatMask isn't defined yet we can define it ourselves.
15790 if (RepeatMask[i] < 0)
15791 RepeatMask[i] = M % NumLaneElts;
15792
15793 if (RepeatMask[i] < NumElts) {
15794 if (RepeatMask[i] != M % NumLaneElts)
15795 return SDValue();
15796 LaneSrcs[Lane][0] = M / NumLaneElts;
15797 } else {
15798 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15799 return SDValue();
15800 LaneSrcs[Lane][1] = M / NumLaneElts;
15801 }
15802 }
15803
15804 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15805 return SDValue();
15806 }
15807
15808 SmallVector<int, 16> NewMask(NumElts, -1);
15809 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15810 int Src = LaneSrcs[Lane][0];
15811 for (int i = 0; i != NumLaneElts; ++i) {
15812 int M = -1;
15813 if (Src >= 0)
15814 M = Src * NumLaneElts + i;
15815 NewMask[Lane * NumLaneElts + i] = M;
15816 }
15817 }
15818 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15819 // Ensure we didn't get back the shuffle we started with.
15820 // FIXME: This is a hack to make up for some splat handling code in
15821 // getVectorShuffle.
15822 if (isa<ShuffleVectorSDNode>(NewV1) &&
15823 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15824 return SDValue();
15825
15826 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15827 int Src = LaneSrcs[Lane][1];
15828 for (int i = 0; i != NumLaneElts; ++i) {
15829 int M = -1;
15830 if (Src >= 0)
15831 M = Src * NumLaneElts + i;
15832 NewMask[Lane * NumLaneElts + i] = M;
15833 }
15834 }
15835 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15836 // Ensure we didn't get back the shuffle we started with.
15837 // FIXME: This is a hack to make up for some splat handling code in
15838 // getVectorShuffle.
15839 if (isa<ShuffleVectorSDNode>(NewV2) &&
15840 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15841 return SDValue();
15842
15843 for (int i = 0; i != NumElts; ++i) {
15844 if (Mask[i] < 0) {
15845 NewMask[i] = -1;
15846 continue;
15847 }
15848 NewMask[i] = RepeatMask[i % NumLaneElts];
15849 if (NewMask[i] < 0)
15850 continue;
15851
15852 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15853 }
15854 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15855}
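// Worked example of the decomposition above (illustrative only): for the
// v8i32 two-input mask <4,12,5,13,0,8,1,9>, result lane 0 reads the high
// lanes of V1 and V2 and result lane 1 reads their low lanes, while both
// lanes share the repeated in-lane pattern {0,8,1,9}. Mask element 12 feeding
// result slot 1 therefore contributes lane source 12 / 4 == 3 (V2 high) and
// repeat entry (12 % 4) + 1 * 8 == 8 (element 0 of the lane's second source).
static_assert(12 / 4 == 3 && (12 % 4) + 1 * 8 == 8,
              "lane source and repeated in-lane entry for mask element 12");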
15856
15857/// If the input shuffle mask results in a vector that is undefined in all upper
15858/// or lower half elements and that mask accesses only 2 halves of the
15859/// shuffle's operands, return true. A mask of half the width with mask indexes
15860/// adjusted to access the extracted halves of the original shuffle operands is
15861/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15862/// lower half of each input operand is accessed.
15863static bool
15864 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15865 int &HalfIdx1, int &HalfIdx2) {
15866 assert((Mask.size() == HalfMask.size() * 2) &&
15867 "Expected input mask to be twice as long as output");
15868
15869 // Exactly one half of the result must be undef to allow narrowing.
15870 bool UndefLower = isUndefLowerHalf(Mask);
15871 bool UndefUpper = isUndefUpperHalf(Mask);
15872 if (UndefLower == UndefUpper)
15873 return false;
15874
15875 unsigned HalfNumElts = HalfMask.size();
15876 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15877 HalfIdx1 = -1;
15878 HalfIdx2 = -1;
15879 for (unsigned i = 0; i != HalfNumElts; ++i) {
15880 int M = Mask[i + MaskIndexOffset];
15881 if (M < 0) {
15882 HalfMask[i] = M;
15883 continue;
15884 }
15885
15886 // Determine which of the 4 half vectors this element is from.
15887 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15888 int HalfIdx = M / HalfNumElts;
15889
15890 // Determine the element index into its half vector source.
15891 int HalfElt = M % HalfNumElts;
15892
15893 // We can shuffle with up to 2 half vectors, set the new 'half'
15894 // shuffle mask accordingly.
15895 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15896 HalfMask[i] = HalfElt;
15897 HalfIdx1 = HalfIdx;
15898 continue;
15899 }
15900 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15901 HalfMask[i] = HalfElt + HalfNumElts;
15902 HalfIdx2 = HalfIdx;
15903 continue;
15904 }
15905
15906 // Too many half vectors referenced.
15907 return false;
15908 }
15909
15910 return true;
15911}
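// Worked example (illustrative only): for the v8f32 mask <u,u,u,u,2,3,8,9>
// the lower half of the result is undef, mask element 8 comes from half
// vector 8 / 4 == 2 (the low half of V2) at offset 8 % 4 == 0, and the
// routine produces HalfMask = <2,3,4,5> with HalfIdx1 = 0 (low half of V1)
// and HalfIdx2 = 2 (low half of V2).
static_assert(8 / 4 == 2 && 8 % 4 == 0,
              "half-vector index and offset for mask element 8");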
15912
15913/// Given the output values from getHalfShuffleMask(), create a half width
15914/// shuffle of extracted vectors followed by an insert back to full width.
15915 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15916 ArrayRef<int> HalfMask, int HalfIdx1,
15917 int HalfIdx2, bool UndefLower,
15918 SelectionDAG &DAG, bool UseConcat = false) {
15919 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15920 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15921
15922 MVT VT = V1.getSimpleValueType();
15923 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15924 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15925
15926 auto getHalfVector = [&](int HalfIdx) {
15927 if (HalfIdx < 0)
15928 return DAG.getUNDEF(HalfVT);
15929 SDValue V = (HalfIdx < 2 ? V1 : V2);
15930 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15931 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15932 DAG.getVectorIdxConstant(HalfIdx, DL));
15933 };
15934
15935 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15936 SDValue Half1 = getHalfVector(HalfIdx1);
15937 SDValue Half2 = getHalfVector(HalfIdx2);
15938 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15939 if (UseConcat) {
15940 SDValue Op0 = V;
15941 SDValue Op1 = DAG.getUNDEF(HalfVT);
15942 if (UndefLower)
15943 std::swap(Op0, Op1);
15944 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15945 }
15946
15947 unsigned Offset = UndefLower ? HalfNumElts : 0;
15948 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15949 DAG.getVectorIdxConstant(Offset, DL));
15950}
15951
15952/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15953/// This allows for fast cases such as subvector extraction/insertion
15954/// or shuffling smaller vector types which can lower more efficiently.
15955 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15956 SDValue V2, ArrayRef<int> Mask,
15957 const X86Subtarget &Subtarget,
15958 SelectionDAG &DAG) {
15959 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15960 "Expected 256-bit or 512-bit vector");
15961
15962 bool UndefLower = isUndefLowerHalf(Mask);
15963 if (!UndefLower && !isUndefUpperHalf(Mask))
15964 return SDValue();
15965
15966 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15967 "Completely undef shuffle mask should have been simplified already");
15968
15969 // Upper half is undef and lower half is whole upper subvector.
15970 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15971 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15972 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15973 if (!UndefLower &&
15974 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15975 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15976 DAG.getVectorIdxConstant(HalfNumElts, DL));
15977 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15978 DAG.getVectorIdxConstant(0, DL));
15979 }
15980
15981 // Lower half is undef and upper half is whole lower subvector.
15982 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15983 if (UndefLower &&
15984 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15985 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15986 DAG.getVectorIdxConstant(0, DL));
15987 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15988 DAG.getVectorIdxConstant(HalfNumElts, DL));
15989 }
15990
15991 int HalfIdx1, HalfIdx2;
15992 SmallVector<int, 8> HalfMask(HalfNumElts);
15993 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15994 return SDValue();
15995
15996 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15997
15998 // Only shuffle the halves of the inputs when useful.
15999 unsigned NumLowerHalves =
16000 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
16001 unsigned NumUpperHalves =
16002 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
16003 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
16004
16005 // Determine the larger pattern of undef/halves, then decide if it's worth
16006 // splitting the shuffle based on subtarget capabilities and types.
16007 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
16008 if (!UndefLower) {
16009 // XXXXuuuu: no insert is needed.
16010 // Always extract lowers when setting lower - these are all free subreg ops.
16011 if (NumUpperHalves == 0)
16012 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16013 UndefLower, DAG);
16014
16015 if (NumUpperHalves == 1) {
16016 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
16017 if (Subtarget.hasAVX2()) {
16018 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
16019 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
16020 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
16021 (!isSingleSHUFPSMask(HalfMask) ||
16022 Subtarget.hasFastVariableCrossLaneShuffle()))
16023 return SDValue();
16024 // If this is a unary shuffle (assume that the 2nd operand is
16025 // canonicalized to undef), then we can use vpermpd. Otherwise, we
16026 // are better off extracting the upper half of 1 operand and using a
16027 // narrow shuffle.
16028 if (EltWidth == 64 && V2.isUndef())
16029 return SDValue();
16030 // If this is a unary vXi8 shuffle with in-place halves, then perform it as a
16031 // full-width pshufb, and then merge.
16032 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
16033 return SDValue();
16034 }
16035 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16036 if (Subtarget.hasAVX512() && VT.is512BitVector())
16037 return SDValue();
16038 // Extract + narrow shuffle is better than the wide alternative.
16039 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16040 UndefLower, DAG);
16041 }
16042
16043 // Don't extract both uppers, instead shuffle and then extract.
16044 assert(NumUpperHalves == 2 && "Half vector count went wrong");
16045 return SDValue();
16046 }
16047
16048 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
16049 if (NumUpperHalves == 0) {
16050 // AVX2 has efficient 64-bit element cross-lane shuffles.
16051 // TODO: Refine to account for unary shuffle, splat, and other masks?
16052 if (Subtarget.hasAVX2() && EltWidth == 64)
16053 return SDValue();
16054 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
16055 if (Subtarget.hasAVX512() && VT.is512BitVector())
16056 return SDValue();
16057 // Narrow shuffle + insert is better than the wide alternative.
16058 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
16059 UndefLower, DAG);
16060 }
16061
16062 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
16063 return SDValue();
16064}
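// Worked example of the half bookkeeping above (illustrative only): with
// HalfIdx1 = 0 (low half of V1) and HalfIdx2 = 3 (high half of V2), one lower
// and one upper half are referenced, which is the NumUpperHalves == 1 case
// that weighs extract + narrow shuffle against a full-width shuffle.
static_assert(((0 == 0 || 0 == 2) + (3 == 0 || 3 == 2)) == 1 &&
                  ((0 == 1 || 0 == 3) + (3 == 1 || 3 == 3)) == 1,
              "one lower half and one upper half referenced");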
16065
16066/// Handle case where shuffle sources are coming from the same 128-bit lane and
16067/// every lane can be represented as the same repeating mask - allowing us to
16068/// shuffle the sources with the repeating shuffle and then permute the result
16069/// to the destination lanes.
16070 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
16071 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16072 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16073 int NumElts = VT.getVectorNumElements();
16074 int NumLanes = VT.getSizeInBits() / 128;
16075 int NumLaneElts = NumElts / NumLanes;
16076
16077 // On AVX2 we may be able to just shuffle the lowest elements and then
16078 // broadcast the result.
16079 if (Subtarget.hasAVX2()) {
16080 for (unsigned BroadcastSize : {16, 32, 64}) {
16081 if (BroadcastSize <= VT.getScalarSizeInBits())
16082 continue;
16083 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
16084
16085 // Attempt to match a repeating pattern every NumBroadcastElts,
16086 // accounting for UNDEFs, that only references the lowest 128-bit
16087 // lane of the inputs.
16088 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
16089 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16090 for (int j = 0; j != NumBroadcastElts; ++j) {
16091 int M = Mask[i + j];
16092 if (M < 0)
16093 continue;
16094 int &R = RepeatMask[j];
16095 if (0 != ((M % NumElts) / NumLaneElts))
16096 return false;
16097 if (0 <= R && R != M)
16098 return false;
16099 R = M;
16100 }
16101 return true;
16102 };
16103
16104 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
16105 if (!FindRepeatingBroadcastMask(RepeatMask))
16106 continue;
16107
16108 // Shuffle the (lowest) repeated elements in place for broadcast.
16109 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
16110
16111 // Shuffle the actual broadcast.
16112 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
16113 for (int i = 0; i != NumElts; i += NumBroadcastElts)
16114 for (int j = 0; j != NumBroadcastElts; ++j)
16115 BroadcastMask[i + j] = j;
16116
16117 // Avoid returning the same shuffle operation. For example,
16118 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
16119 if (BroadcastMask == Mask)
16120 return SDValue();
16121
16122 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
16123 BroadcastMask);
16124 }
16125 }
16126
16127 // Bail if the shuffle mask doesn't cross 128-bit lanes.
16128 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
16129 return SDValue();
16130
16131 // Bail if we already have a repeated lane shuffle mask.
16132 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
16133 return SDValue();
16134
16135 // Helper to look for a repeated mask in each split sublane, checking that
16136 // those sublanes can then be permuted into place.
16137 auto ShuffleSubLanes = [&](int SubLaneScale) {
16138 int NumSubLanes = NumLanes * SubLaneScale;
16139 int NumSubLaneElts = NumLaneElts / SubLaneScale;
16140
16141 // Check that all the sources are coming from the same lane and see if we
16142 // can form a repeating shuffle mask (local to each sub-lane). At the same
16143 // time, determine the source sub-lane for each destination sub-lane.
16144 int TopSrcSubLane = -1;
16145 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
16146 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
16147 SubLaneScale,
16148 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
16149
16150 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
16151 // Extract the sub-lane mask, check that it all comes from the same lane
16152 // and normalize the mask entries to come from the first lane.
16153 int SrcLane = -1;
16154 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
16155 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16156 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
16157 if (M < 0)
16158 continue;
16159 int Lane = (M % NumElts) / NumLaneElts;
16160 if ((0 <= SrcLane) && (SrcLane != Lane))
16161 return SDValue();
16162 SrcLane = Lane;
16163 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
16164 SubLaneMask[Elt] = LocalM;
16165 }
16166
16167 // Whole sub-lane is UNDEF.
16168 if (SrcLane < 0)
16169 continue;
16170
16171 // Attempt to match against the candidate repeated sub-lane masks.
16172 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
16173 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
16174 for (int i = 0; i != NumSubLaneElts; ++i) {
16175 if (M1[i] < 0 || M2[i] < 0)
16176 continue;
16177 if (M1[i] != M2[i])
16178 return false;
16179 }
16180 return true;
16181 };
16182
16183 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
16184 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
16185 continue;
16186
16187 // Merge the sub-lane mask into the matching repeated sub-lane mask.
16188 for (int i = 0; i != NumSubLaneElts; ++i) {
16189 int M = SubLaneMask[i];
16190 if (M < 0)
16191 continue;
16192 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
16193 "Unexpected mask element");
16194 RepeatedSubLaneMask[i] = M;
16195 }
16196
16197 // Track the top most source sub-lane - by setting the remaining to
16198 // UNDEF we can greatly simplify shuffle matching.
16199 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
16200 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
16201 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
16202 break;
16203 }
16204
16205 // Bail if we failed to find a matching repeated sub-lane mask.
16206 if (Dst2SrcSubLanes[DstSubLane] < 0)
16207 return SDValue();
16208 }
16209 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
16210 "Unexpected source lane");
16211
16212 // Create a repeating shuffle mask for the entire vector.
16213 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
16214 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
16215 int Lane = SubLane / SubLaneScale;
16216 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
16217 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
16218 int M = RepeatedSubLaneMask[Elt];
16219 if (M < 0)
16220 continue;
16221 int Idx = (SubLane * NumSubLaneElts) + Elt;
16222 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
16223 }
16224 }
16225
16226 // Shuffle each source sub-lane to its destination.
16227 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
16228 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
16229 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
16230 if (SrcSubLane < 0)
16231 continue;
16232 for (int j = 0; j != NumSubLaneElts; ++j)
16233 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
16234 }
16235
16236 // Avoid returning the same shuffle operation.
16237 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
16238 if (RepeatedMask == Mask || SubLaneMask == Mask)
16239 return SDValue();
16240
16241 SDValue RepeatedShuffle =
16242 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
16243
16244 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
16245 SubLaneMask);
16246 };
16247
16248 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
16249 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
16250 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
16251 // Otherwise we can only permute whole 128-bit lanes.
16252 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
16253 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
16254 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
16255 MinSubLaneScale = 2;
16256 MaxSubLaneScale =
16257 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
16258 }
16259 if (Subtarget.hasBWI() && VT == MVT::v64i8)
16260 MinSubLaneScale = MaxSubLaneScale = 4;
16261
16262 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
16263 if (SDValue Shuffle = ShuffleSubLanes(Scale))
16264 return Shuffle;
16265
16266 return SDValue();
16267}
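// Worked example of the AVX2 broadcast path at the top of this routine
// (illustrative only): the v8i32 mask <1,0,1,0,1,0,1,0> repeats a 64-bit
// pattern that only touches the lowest 128-bit lane, so it is rewritten as a
// shuffle of the two lowest elements <1,0,u,u,u,u,u,u> followed by the
// broadcast mask <0,1,0,1,0,1,0,1>; element 1 passes the lowest-lane check
// since (1 % 8) / 4 == 0.
static_assert((1 % 8) / 4 == 0, "broadcast source lies in the lowest lane");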
16268
16269 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
16270 bool &ForceV1Zero, bool &ForceV2Zero,
16271 unsigned &ShuffleImm, ArrayRef<int> Mask,
16272 const APInt &Zeroable) {
16273 int NumElts = VT.getVectorNumElements();
16274 assert(VT.getScalarSizeInBits() == 64 &&
16275 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
16276 "Unexpected data type for VSHUFPD");
16277 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
16278 "Illegal shuffle mask");
16279
16280 bool ZeroLane[2] = { true, true };
16281 for (int i = 0; i < NumElts; ++i)
16282 ZeroLane[i & 1] &= Zeroable[i];
16283
16284 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
16285 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
16286 bool IsSHUFPD = true;
16287 bool IsCommutable = true;
16288 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
16289 for (int i = 0; i < NumElts; ++i) {
16290 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
16291 continue;
16292 if (Mask[i] < 0)
16293 return false;
16294 int Val = (i & 6) + NumElts * (i & 1);
16295 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
16296 if (Mask[i] < Val || Mask[i] > Val + 1)
16297 IsSHUFPD = false;
16298 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
16299 IsCommutable = false;
16300 SHUFPDMask[i] = Mask[i] % 2;
16301 }
16302
16303 if (!IsSHUFPD && !IsCommutable)
16304 return false;
16305
16306 if (!IsSHUFPD && IsCommutable)
16307 std::swap(V1, V2);
16308
16309 ForceV1Zero = ZeroLane[0];
16310 ForceV2Zero = ZeroLane[1];
16311 ShuffleImm = getSHUFPDImm(SHUFPDMask);
16312 return true;
16313}
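// Worked example (illustrative only): for the v4f64 mask <0,5,2,7>, result
// slot i may only read the element pair starting at (i & 6) from V1 when i is
// even and the matching pair from V2 (offset by NumElts) when i is odd, i.e.
// the allowed starts are 0, 4, 2 and 6. All four elements land in range and
// their low bits pack into the SHUFPD immediate 0b1010.
static_assert(((0 & 6) + 4 * (0 & 1)) == 0 && ((1 & 6) + 4 * (1 & 1)) == 4 &&
                  ((2 & 6) + 4 * (2 & 1)) == 2 && ((3 & 6) + 4 * (3 & 1)) == 6,
              "per-slot element ranges for a v4f64 SHUFPD");
static_assert(((0 % 2) << 0 | (5 % 2) << 1 | (2 % 2) << 2 | (7 % 2) << 3) == 0xa,
              "SHUFPD immediate for mask <0,5,2,7>");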
16314
16315 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
16316 SDValue V2, ArrayRef<int> Mask,
16317 const APInt &Zeroable,
16318 const X86Subtarget &Subtarget,
16319 SelectionDAG &DAG) {
16320 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
16321 "Unexpected data type for VSHUFPD");
16322
16323 unsigned Immediate = 0;
16324 bool ForceV1Zero = false, ForceV2Zero = false;
16325 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
16326 Mask, Zeroable))
16327 return SDValue();
16328
16329 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
16330 if (ForceV1Zero)
16331 V1 = getZeroVector(VT, Subtarget, DAG, DL);
16332 if (ForceV2Zero)
16333 V2 = getZeroVector(VT, Subtarget, DAG, DL);
16334
16335 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
16336 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16337}
16338
16339// Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16340// by zeroable elements in the remaining 24 elements. Turn this into two
16341// vmovqb instructions shuffled together.
16342 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16343 SDValue V1, SDValue V2,
16344 ArrayRef<int> Mask,
16345 const APInt &Zeroable,
16346 SelectionDAG &DAG) {
16347 assert(VT == MVT::v32i8 && "Unexpected type!");
16348
16349 // The first 8 indices should be every 8th element.
16350 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16351 return SDValue();
16352
16353 // Remaining elements need to be zeroable.
16354 if (Zeroable.countl_one() < (Mask.size() - 8))
16355 return SDValue();
16356
16357 V1 = DAG.getBitcast(MVT::v4i64, V1);
16358 V2 = DAG.getBitcast(MVT::v4i64, V2);
16359
16360 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16361 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16362
16363 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16364 // the upper bits of the result using an unpckldq.
16365 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16366 { 0, 1, 2, 3, 16, 17, 18, 19,
16367 4, 5, 6, 7, 20, 21, 22, 23 });
16368 // Insert the unpckldq into a zero vector to widen to v32i8.
16369 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16370 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16371 DAG.getVectorIdxConstant(0, DL));
16372}
16373
16374// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16375// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16376// =>
16377// ul = unpckl v1, v2
16378// uh = unpckh v1, v2
16379// a = vperm ul, uh
16380// b = vperm ul, uh
16381//
16382// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16383// and permute. We cannot directly match v3 because it is split into two
16384// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16385// pair of 256-bit shuffles and makes sure the masks are consecutive.
16386//
16387// Once unpck and permute nodes are created, the permute corresponding to this
16388// shuffle is returned, while the other permute replaces the other half of the
16389// shuffle in the selection dag.
16390 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16391 SDValue V1, SDValue V2,
16392 ArrayRef<int> Mask,
16393 SelectionDAG &DAG) {
16394 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16395 VT != MVT::v32i8)
16396 return SDValue();
16397 // <B0, B1, B0+1, B1+1, ..., >
16398 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16399 unsigned Begin1) {
16400 size_t Size = Mask.size();
16401 assert(Size % 2 == 0 && "Expected even mask size");
16402 for (unsigned I = 0; I < Size; I += 2) {
16403 if (Mask[I] != (int)(Begin0 + I / 2) ||
16404 Mask[I + 1] != (int)(Begin1 + I / 2))
16405 return false;
16406 }
16407 return true;
16408 };
16409 // Check which half this shuffle node is.
16410 int NumElts = VT.getVectorNumElements();
16411 size_t FirstQtr = NumElts / 2;
16412 size_t ThirdQtr = NumElts + NumElts / 2;
16413 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16414 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16415 if (!IsFirstHalf && !IsSecondHalf)
16416 return SDValue();
16417
16418 // Find the intersection between shuffle users of V1 and V2.
16419 SmallVector<SDNode *, 2> Shuffles;
16420 for (SDNode *User : V1->users())
16421 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16422 User->getOperand(1) == V2)
16423 Shuffles.push_back(User);
16424 // Limit user size to two for now.
16425 if (Shuffles.size() != 2)
16426 return SDValue();
16427 // Find out which half of the 512-bit shuffle each smaller shuffle is.
16428 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16429 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16430 SDNode *FirstHalf;
16431 SDNode *SecondHalf;
16432 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16433 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16434 FirstHalf = Shuffles[0];
16435 SecondHalf = Shuffles[1];
16436 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16437 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16438 FirstHalf = Shuffles[1];
16439 SecondHalf = Shuffles[0];
16440 } else {
16441 return SDValue();
16442 }
16443 // Lower into unpck and perm. Return the perm of this shuffle and replace
16444 // the other.
16445 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16446 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16447 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16448 DAG.getTargetConstant(0x20, DL, MVT::i8));
16449 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16450 DAG.getTargetConstant(0x31, DL, MVT::i8));
16451 if (IsFirstHalf) {
16452 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16453 return Perm1;
16454 }
16455 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16456 return Perm2;
16457}
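// Standalone sketch of the interleave test used above (the constexpr helper
// and arrays below are illustrative only): one half of an interleave of two
// vectors looks like <B0, B1, B0+1, B1+1, ...>.
static constexpr bool sketchIsInterleaveHalf(const int *M, int Size, int B0,
                                             int B1) {
  for (int I = 0; I + 1 < Size; I += 2)
    if (M[I] != B0 + I / 2 || M[I + 1] != B1 + I / 2)
      return false;
  return true;
}
static constexpr int SketchLoHalf[8] = {0, 8, 1, 9, 2, 10, 3, 11};
static constexpr int SketchHiHalf[8] = {4, 12, 5, 13, 6, 14, 7, 15};
static_assert(sketchIsInterleaveHalf(SketchLoHalf, 8, 0, 8) &&
                  sketchIsInterleaveHalf(SketchHiHalf, 8, 4, 12),
              "the two v8i32 halves of interleave(V1, V2)");
// Such a pair is rewritten as UNPCKL/UNPCKH of V1 and V2 followed by two
// VPERM2X128 nodes with immediates 0x20 and 0x31.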
16458
16459/// Handle lowering of 4-lane 64-bit floating point shuffles.
16460///
16461/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16462/// isn't available.
16463 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16464 const APInt &Zeroable, SDValue V1, SDValue V2,
16465 const X86Subtarget &Subtarget,
16466 SelectionDAG &DAG) {
16467 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16468 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16469 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16470
16471 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16472 Subtarget, DAG))
16473 return V;
16474
16475 if (V2.isUndef()) {
16476 // Check for being able to broadcast a single element.
16477 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16478 Mask, Subtarget, DAG))
16479 return Broadcast;
16480
16481 // Use low duplicate instructions for masks that match their pattern.
16482 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16483 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16484
16485 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16486 // Non-half-crossing single input shuffles can be lowered with an
16487 // interleaved permutation.
16488 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16489 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16490 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16491 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16492 }
16493
16494 // With AVX2 we have direct support for this permutation.
16495 if (Subtarget.hasAVX2())
16496 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16497 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16498
16499 // Try to create an in-lane repeating shuffle mask and then shuffle the
16500 // results into the target lanes.
16501 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16502 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16503 return V;
16504
16505 // Try to permute the lanes and then use a per-lane permute.
16506 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16507 Mask, DAG, Subtarget))
16508 return V;
16509
16510 // Otherwise, fall back.
16511 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16512 DAG, Subtarget);
16513 }
16514
16515 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16516 Zeroable, Subtarget, DAG))
16517 return Blend;
16518
16519 // Use dedicated unpack instructions for masks that match their pattern.
16520 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16521 return V;
16522
16523 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16524 Zeroable, Subtarget, DAG))
16525 return Op;
16526
16527 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16528 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16529 bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
16530 bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
16531
16532 // If we have lane crossing shuffles AND they don't all come from the lower
16533 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16534 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16535 // canonicalizes to a blend of splats, which isn't necessary for this combine.
16536 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16537 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16538 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16539 (V2.getOpcode() != ISD::BUILD_VECTOR) &&
16540 (!Subtarget.hasAVX2() ||
16541 !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
16542 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16543
16544 // If we have one input in place, then we can permute the other input and
16545 // blend the result.
16546 if (V1IsInPlace || V2IsInPlace)
16547 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16548 Zeroable, Subtarget, DAG);
16549
16550 // Try to create an in-lane repeating shuffle mask and then shuffle the
16551 // results into the target lanes.
16552 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16553 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16554 return V;
16555
16556 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16557 // shuffle. However, if we have AVX2 and either input is already in place,
16558 // we will be able to shuffle the other input across lanes in a single
16559 // instruction, so skip this pattern.
16560 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16561 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16562 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16563 return V;
16564
16565 // If we have VLX support, we can use VEXPAND.
16566 if (Subtarget.hasVLX())
16567 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16568 Zeroable, Subtarget, DAG))
16569 return V;
16570
16571 // If we have AVX2 then we always want to lower with a blend because at v4 we
16572 // can fully permute the elements.
16573 if (Subtarget.hasAVX2())
16574 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16575 Zeroable, Subtarget, DAG);
16576
16577 // Otherwise fall back on generic lowering.
16578 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16579 Subtarget, DAG);
16580}
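// Worked example of the in-lane VPERMILPD path above (illustrative only): the
// non-crossing v4f64 mask <1,0,3,2> swaps the two elements inside each
// 128-bit lane, and the bit-per-element immediate computed above is 0b0101.
static_assert(((1 == 1) | ((0 == 1) << 1) | ((3 == 3) << 2) | ((2 == 3) << 3)) ==
                  0x5,
              "VPERMILPD immediate for mask <1,0,3,2>");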
16581
16582/// Handle lowering of 4-lane 64-bit integer shuffles.
16583///
16584/// This routine is only called when we have AVX2 and thus a reasonable
16585 /// instruction set for v4i64 shuffling.
16586 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16587 const APInt &Zeroable, SDValue V1, SDValue V2,
16588 const X86Subtarget &Subtarget,
16589 SelectionDAG &DAG) {
16590 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16591 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16592 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16593 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16594
16595 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16596 Subtarget, DAG))
16597 return V;
16598
16599 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16600 Zeroable, Subtarget, DAG))
16601 return Blend;
16602
16603 // Check for being able to broadcast a single element.
16604 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16605 Subtarget, DAG))
16606 return Broadcast;
16607
16608 // Try to use shift instructions if fast.
16609 if (Subtarget.preferLowerShuffleAsShift())
16610 if (SDValue Shift =
16611 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16612 Subtarget, DAG, /*BitwiseOnly*/ true))
16613 return Shift;
16614
16615 if (V2.isUndef()) {
16616 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
16617 // can use lower latency instructions that will operate on both lanes.
16618 SmallVector<int, 2> RepeatedMask;
16619 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16620 SmallVector<int, 4> PSHUFDMask;
16621 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16622 return DAG.getBitcast(
16623 MVT::v4i64,
16624 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16625 DAG.getBitcast(MVT::v8i32, V1),
16626 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16627 }
16628
16629 // AVX2 provides a direct instruction for permuting a single input across
16630 // lanes.
16631 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16632 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16633 }
16634
16635 // Try to use shift instructions.
16636 if (SDValue Shift =
16637 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16638 DAG, /*BitwiseOnly*/ false))
16639 return Shift;
16640
16641 // If we have VLX support, we can use VALIGN or VEXPAND.
16642 if (Subtarget.hasVLX()) {
16643 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16644 Zeroable, Subtarget, DAG))
16645 return Rotate;
16646
16647 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16648 Zeroable, Subtarget, DAG))
16649 return V;
16650 }
16651
16652 // Try to use PALIGNR.
16653 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16654 Subtarget, DAG))
16655 return Rotate;
16656
16657 // Use dedicated unpack instructions for masks that match their pattern.
16658 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16659 return V;
16660
16661 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16662 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16663
16664 // If we have one input in place, then we can permute the other input and
16665 // blend the result.
16666 if (V1IsInPlace || V2IsInPlace)
16667 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16668 Zeroable, Subtarget, DAG);
16669
16670 // Try to create an in-lane repeating shuffle mask and then shuffle the
16671 // results into the target lanes.
16672 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16673 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16674 return V;
16675
16676 // Try to lower to PERMQ(BLENDD(V1,V2)).
16677 if (SDValue V =
16678 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16679 return V;
16680
16681 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16682 // shuffle. However, if we have AVX2 and either input is already in place,
16683 // we will be able to shuffle the other input across lanes in a single
16684 // instruction, so skip this pattern.
16685 if (!V1IsInPlace && !V2IsInPlace)
16686 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16687 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16688 return Result;
16689
16690 // Otherwise fall back on generic blend lowering.
16691 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16692 Zeroable, Subtarget, DAG);
16693}
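// Worked example of the repeated-mask PSHUFD path above (illustrative only):
// the unary v4i64 mask <1,0,3,2> repeats {1,0} in both 128-bit lanes; widened
// to 32-bit elements that becomes the v8i32 pattern {2,3,0,1}, which packs
// into the classic PSHUFD immediate 0x4e.
static_assert((2 | (3 << 2) | (0 << 4) | (1 << 6)) == 0x4e,
              "PSHUFD immediate for the widened {1,0} repeat");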
16694
16695/// Handle lowering of 8-lane 32-bit floating point shuffles.
16696///
16697/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16698/// isn't available.
16699 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16700 const APInt &Zeroable, SDValue V1, SDValue V2,
16701 const X86Subtarget &Subtarget,
16702 SelectionDAG &DAG) {
16703 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16704 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16705 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16706
16707 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16708 Zeroable, Subtarget, DAG))
16709 return Blend;
16710
16711 // Check for being able to broadcast a single element.
16712 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16713 Subtarget, DAG))
16714 return Broadcast;
16715
16716 if (!Subtarget.hasAVX2()) {
16717 SmallVector<int> InLaneMask;
16718 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16719
16720 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16721 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16722 /*SimpleOnly*/ true))
16723 return R;
16724 }
16725 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16726 Zeroable, Subtarget, DAG))
16727 return DAG.getBitcast(MVT::v8f32, ZExt);
16728
16729 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16730 // options to efficiently lower the shuffle.
16731 SmallVector<int, 4> RepeatedMask;
16732 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16733 assert(RepeatedMask.size() == 4 &&
16734 "Repeated masks must be half the mask width!");
16735
16736 // Use even/odd duplicate instructions for masks that match their pattern.
16737 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16738 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16739 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16740 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16741
16742 if (V2.isUndef())
16743 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16744 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16745
16746 // Use dedicated unpack instructions for masks that match their pattern.
16747 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16748 return V;
16749
16750 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16751 // have already handled any direct blends.
16752 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16753 }
16754
16755 // Try to create an in-lane repeating shuffle mask and then shuffle the
16756 // results into the target lanes.
16757 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16758 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16759 return V;
16760
16761 // If we have a single input shuffle with different shuffle patterns in the
16762 // two 128-bit lanes, use a variable shuffle mask with VPERMILPS.
16763 if (V2.isUndef()) {
16764 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16765 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16766 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16767 }
16768 if (Subtarget.hasAVX2()) {
16769 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16770 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16771 }
16772 // Otherwise, fall back.
16773 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16774 DAG, Subtarget);
16775 }
16776
16777 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16778 // shuffle.
16779 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16780 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16781 return Result;
16782
16783 // If we have VLX support, we can use VEXPAND.
16784 if (Subtarget.hasVLX())
16785 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16786 Zeroable, Subtarget, DAG))
16787 return V;
16788
16789 // Try to match an interleave of two v8f32s and lower them as unpck and
16790 // permutes using ymms. This needs to go before we try to split the vectors.
16791 // Don't attempt on AVX1 if we're likely to split vectors anyway.
16792 if ((Subtarget.hasAVX2() ||
16795 !Subtarget.hasAVX512())
16796 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16797 Mask, DAG))
16798 return V;
16799
16800 // For non-AVX512, if the mask uses 16-bit elements within each lane then try
16801 // to split, since after splitting we get more efficient code using vpunpcklwd
16802 // and vpunpckhwd instructions than with vblend.
16803 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16804 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16805 Subtarget, DAG);
16806
16807 // If we have AVX2 then we always want to lower with a blend because at v8 we
16808 // can fully permute the elements.
16809 if (Subtarget.hasAVX2())
16810 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16811 Zeroable, Subtarget, DAG);
16812
16813 // Otherwise fall back on generic lowering.
16814 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16815 Subtarget, DAG);
16816}
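// Worked example of the repeated-mask path above (illustrative only): the
// unary v8f32 mask <3,2,1,0,7,6,5,4> repeats {3,2,1,0} in both 128-bit lanes,
// so it lowers to a single VPERMILPS whose two-bit-per-element immediate is
// the familiar reverse pattern 0x1b.
static_assert((3 | (2 << 2) | (1 << 4) | (0 << 6)) == 0x1b,
              "VPERMILPS immediate for the repeated {3,2,1,0} mask");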
16817
16818/// Handle lowering of 8-lane 32-bit integer shuffles.
16819///
16820/// This routine is only called when we have AVX2 and thus a reasonable
16821 /// instruction set for v8i32 shuffling.
16822 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16823 const APInt &Zeroable, SDValue V1, SDValue V2,
16824 const X86Subtarget &Subtarget,
16825 SelectionDAG &DAG) {
16826 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16827 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16828 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16829 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16830
16831 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16832
16833 // Whenever we can lower this as a zext, that instruction is strictly faster
16834 // than any alternative. It also allows us to fold memory operands into the
16835 // shuffle in many cases.
16836 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16837 Zeroable, Subtarget, DAG))
16838 return ZExt;
16839
16840 // Try to match an interleave of two v8i32s and lower them as unpck and
16841 // permutes using ymms. This needs to go before we try to split the vectors.
16842 if (!Subtarget.hasAVX512())
16843 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16844 Mask, DAG))
16845 return V;
16846
16847 // For non-AVX512, if the mask uses 16-bit elements within each lane then try
16848 // to split, since after splitting we get more efficient code than vblend by
16849 // using vpunpcklwd and vpunpckhwd instructions.
16850 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16851 !Subtarget.hasAVX512())
16852 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16853 Subtarget, DAG);
16854
16855 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16856 Zeroable, Subtarget, DAG))
16857 return Blend;
16858
16859 // Check for being able to broadcast a single element.
16860 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16861 Subtarget, DAG))
16862 return Broadcast;
16863
16864 // Try to use shift instructions if fast.
16865 if (Subtarget.preferLowerShuffleAsShift()) {
16866 if (SDValue Shift =
16867 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16868 Subtarget, DAG, /*BitwiseOnly*/ true))
16869 return Shift;
16870 if (NumV2Elements == 0)
16871 if (SDValue Rotate =
16872 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16873 return Rotate;
16874 }
16875
16876 // If the shuffle mask is repeated in each 128-bit lane we can use more
16877 // efficient instructions that mirror the shuffles across the two 128-bit
16878 // lanes.
16879 SmallVector<int, 4> RepeatedMask;
16880 bool Is128BitLaneRepeatedShuffle =
16881 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16882 if (Is128BitLaneRepeatedShuffle) {
16883 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16884 if (V2.isUndef())
16885 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16886 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16887
16888 // Use dedicated unpack instructions for masks that match their pattern.
16889 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16890 return V;
16891 }
16892
16893 // Try to use shift instructions.
16894 if (SDValue Shift =
16895 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16896 DAG, /*BitwiseOnly*/ false))
16897 return Shift;
16898
16899 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16900 if (SDValue Rotate =
16901 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16902 return Rotate;
16903
16904 // If we have VLX support, we can use VALIGN or EXPAND.
16905 if (Subtarget.hasVLX()) {
16906 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16907 Zeroable, Subtarget, DAG))
16908 return Rotate;
16909
16910 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16911 Zeroable, Subtarget, DAG))
16912 return V;
16913 }
16914
16915 // Try to use byte rotation instructions.
16916 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16917 Subtarget, DAG))
16918 return Rotate;
16919
16920 // Try to create an in-lane repeating shuffle mask and then shuffle the
16921 // results into the target lanes.
16922 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16923 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16924 return V;
16925
16926 if (V2.isUndef()) {
16927 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16928 // because that should be faster than the variable permute alternatives.
16929 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16930 return V;
16931
16932 // If the shuffle patterns aren't repeated but it's a single input, directly
16933 // generate a cross-lane VPERMD instruction.
16934 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16935 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16936 }
16937
16938 // Assume that a single SHUFPS is faster than an alternative sequence of
16939 // multiple instructions (even if the CPU has a domain penalty).
16940 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16941 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16942 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16943 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16944 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16945 CastV1, CastV2, DAG);
16946 return DAG.getBitcast(MVT::v8i32, ShufPS);
16947 }
16948
16949 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16950 // shuffle.
16951 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16952 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16953 return Result;
16954
16955 // Otherwise fall back on generic blend lowering.
16956 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16957 Zeroable, Subtarget, DAG);
16958}
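// Editor's illustrative sketch (not part of X86ISelLowering.cpp): the
// "repeated in each 128-bit lane" test that gates the PSHUFD path above,
// specialised to v8i32 (two lanes of four dwords) and written against plain
// int masks. A mask such as {1,0,3,2, 5,4,7,6} repeats {1,0,3,2} in both
// lanes; a mask whose lane 0 pulls an element from lane 1 does not qualify.
// -1 denotes an undef element, as in the lowering code above.
static bool isLaneRepeatedMaskV8I32(const int Mask[8], int RepeatedMask[4]) {
  for (int i = 0; i != 4; ++i)
    RepeatedMask[i] = -1;
  for (int i = 0; i != 8; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // Undef matches any repeated pattern.
    // The referenced element must live in the same 128-bit lane as slot i.
    if ((M % 8) / 4 != i / 4)
      return false;
    // Reduce to a lane-local index: 0..3 selects V1, 4..7 selects V2.
    int LocalM = (M % 4) + (M < 8 ? 0 : 4);
    if (RepeatedMask[i % 4] < 0)
      RepeatedMask[i % 4] = LocalM;
    else if (RepeatedMask[i % 4] != LocalM)
      return false; // The lanes disagree, so the mask does not repeat.
  }
  return true;
}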
16959
16960/// Handle lowering of 16-lane 16-bit integer shuffles.
16961///
16962/// This routine is only called when we have AVX2 and thus a reasonable
16963/// instruction set for v16i16 shuffling.
16964static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16965 const APInt &Zeroable, SDValue V1, SDValue V2,
16966 const X86Subtarget &Subtarget,
16967 SelectionDAG &DAG) {
16968 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16969 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16970 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16971 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16972
16973 // Whenever we can lower this as a zext, that instruction is strictly faster
16974 // than any alternative. It also allows us to fold memory operands into the
16975 // shuffle in many cases.
16976 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16977 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16978 return ZExt;
16979
16980 // Check for being able to broadcast a single element.
16981 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16982 Subtarget, DAG))
16983 return Broadcast;
16984
16985 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16986 Zeroable, Subtarget, DAG))
16987 return Blend;
16988
16989 // Use dedicated unpack instructions for masks that match their pattern.
16990 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16991 return V;
16992
16993 // Use dedicated pack instructions for masks that match their pattern.
16994 if (SDValue V =
16995 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16996 return V;
16997
16998 // Try to lower using a truncation.
16999 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17000 Subtarget, DAG))
17001 return V;
17002
17003 // Try to use shift instructions.
17004 if (SDValue Shift =
17005 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17006 Subtarget, DAG, /*BitwiseOnly*/ false))
17007 return Shift;
17008
17009 // Try to use byte rotation instructions.
17010 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
17011 Subtarget, DAG))
17012 return Rotate;
17013
17014 // Try to create an in-lane repeating shuffle mask and then shuffle the
17015 // results into the target lanes.
17016 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17017 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17018 return V;
17019
17020 if (V2.isUndef()) {
17021 // Try to use bit rotation instructions.
17022 if (SDValue Rotate =
17023 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
17024 return Rotate;
17025
17026 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17027 // because that should be faster than the variable permute alternatives.
17028 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
17029 return V;
17030
17031 // There are no generalized cross-lane shuffle operations available on i16
17032 // element types.
17033 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
17034 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17035 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17036 return V;
17037
17038 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
17039 DAG, Subtarget);
17040 }
17041
17042 SmallVector<int, 8> RepeatedMask;
17043 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
17044 // As this is a single-input shuffle, the repeated mask should be
17045 // a strictly valid v8i16 mask that we can pass through to the v8i16
17046 // lowering to handle even the v16 case.
17047 return lowerV8I16GeneralSingleInputShuffle(
17048 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
17049 }
17050 }
17051
17052 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
17053 Zeroable, Subtarget, DAG))
17054 return PSHUFB;
17055
17056 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
17057 if (Subtarget.hasBWI())
17058 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
17059
17060 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17061 // shuffle.
17062 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17063 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
17064 return Result;
17065
17066 // Try to permute the lanes and then use a per-lane permute.
17067 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17068 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
17069 return V;
17070
17071 // Try to match an interleave of two v16i16s and lower them as unpck and
17072 // permutes using ymms.
17073 if (!Subtarget.hasAVX512())
17074 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
17075 Mask, DAG))
17076 return V;
17077
17078 // Otherwise fall back on generic lowering.
17079 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
17080 Subtarget, DAG);
17081}
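// Editor's illustrative sketch (not part of X86ISelLowering.cpp): the
// simplest case the byte-rotation lowering above is after - a single-input
// shuffle of one 16-byte lane whose byte mask is a rotation
// {r, r+1, ..., 15, 0, ..., r-1}. Such a mask maps onto PALIGNR with
// immediate r. Undef entries (-1) may match any rotation; returns -1 when no
// non-trivial rotation fits. The helper name is the editor's own.
static int matchUnaryByteRotate16(const int ByteMask[16]) {
  int Rotation = -1;
  for (int i = 0; i != 16; ++i) {
    int M = ByteMask[i];
    if (M < 0)
      continue; // Undef bytes are compatible with any rotation.
    if (M >= 16)
      return -1; // Only the single-input, in-lane case is sketched here.
    int R = (M - i + 16) % 16; // Rotation amount implied by this byte.
    if (Rotation < 0)
      Rotation = R;
    else if (Rotation != R)
      return -1; // The bytes disagree on the rotation amount.
  }
  return Rotation > 0 ? Rotation : -1; // Rotation 0 is just the identity.
}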
17082
17083/// Handle lowering of 32-lane 8-bit integer shuffles.
17084///
17085/// This routine is only called when we have AVX2 and thus a reasonable
17086/// instruction set for v32i8 shuffling.
17087static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17088 const APInt &Zeroable, SDValue V1, SDValue V2,
17089 const X86Subtarget &Subtarget,
17090 SelectionDAG &DAG) {
17091 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17092 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
17093 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17094 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
17095
17096 // Whenever we can lower this as a zext, that instruction is strictly faster
17097 // than any alternative. It also allows us to fold memory operands into the
17098 // shuffle in many cases.
17099 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
17100 Zeroable, Subtarget, DAG))
17101 return ZExt;
17102
17103 // Check for being able to broadcast a single element.
17104 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
17105 Subtarget, DAG))
17106 return Broadcast;
17107
17108 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
17109 Zeroable, Subtarget, DAG))
17110 return Blend;
17111
17112 // Use dedicated unpack instructions for masks that match their pattern.
17113 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
17114 return V;
17115
17116 // Use dedicated pack instructions for masks that match their pattern.
17117 if (SDValue V =
17118 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17119 return V;
17120
17121 // Try to lower using a truncation.
17122 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17123 Subtarget, DAG))
17124 return V;
17125
17126 // Try to use shift instructions.
17127 if (SDValue Shift =
17128 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
17129 DAG, /*BitwiseOnly*/ false))
17130 return Shift;
17131
17132 // Try to use byte rotation instructions.
17133 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
17134 Subtarget, DAG))
17135 return Rotate;
17136
17137 // Try to use bit rotation instructions.
17138 if (V2.isUndef())
17139 if (SDValue Rotate =
17140 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
17141 return Rotate;
17142
17143 // Try to create an in-lane repeating shuffle mask and then shuffle the
17144 // results into the target lanes.
17145 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17146 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17147 return V;
17148
17149 // There are no generalized cross-lane shuffle operations available on i8
17150 // element types.
17151 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
17152 // Try to produce a fixed cross-128-bit lane permute followed by unpack
17153 // because that should be faster than the variable permute alternatives.
17154 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
17155 return V;
17156
17157 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17158 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17159 return V;
17160
17161 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
17162 DAG, Subtarget);
17163 }
17164
17165 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
17166 Zeroable, Subtarget, DAG))
17167 return PSHUFB;
17168
17169 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
17170 if (Subtarget.hasVBMI())
17171 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
17172
17173 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17174 // shuffle.
17175 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17176 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
17177 return Result;
17178
17179 // Try to permute the lanes and then use a per-lane permute.
17180 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
17181 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
17182 return V;
17183
17184 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17185 // by zeroable elements in the remaining 24 elements. Turn this into two
17186 // vmovqb instructions shuffled together.
17187 if (Subtarget.hasVLX())
17188 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
17189 Mask, Zeroable, DAG))
17190 return V;
17191
17192 // Try to match an interleave of two v32i8s and lower them as unpck and
17193 // permutes using ymms.
17194 if (!Subtarget.hasAVX512())
17195 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
17196 Mask, DAG))
17197 return V;
17198
17199 // Otherwise fall back on generic lowering.
17200 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
17201 Subtarget, DAG);
17202}
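// Editor's illustrative sketch (not part of X86ISelLowering.cpp): how a
// single-input v32i8 shuffle that never crosses a 128-bit lane maps onto a
// PSHUFB control vector. Within each 16-byte lane, control byte i selects
// source byte (Mask[i] % 16); setting bit 7 (0x80) zeroes the byte, which is
// how zeroable elements are realised. Returns false for masks PSHUFB cannot
// express. The helper name and the choice to zero undef bytes are the
// editor's own simplifications.
static bool buildPshufbControlV32I8(const int Mask[32], const bool Zeroable[32],
                                    unsigned char Control[32]) {
  for (int i = 0; i != 32; ++i) {
    if (Zeroable[i] || Mask[i] < 0) {
      Control[i] = 0x80; // Zero this byte (zeroing undef bytes is also safe).
      continue;
    }
    int M = Mask[i];
    if (M >= 32)
      return false; // Only the single-input case is sketched here.
    if (M / 16 != i / 16)
      return false; // Crosses a 128-bit lane: not expressible by PSHUFB.
    Control[i] = (unsigned char)(M % 16);
  }
  return true;
}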
17203
17204/// High-level routine to lower various 256-bit x86 vector shuffles.
17205///
17206/// This routine either breaks down the specific type of a 256-bit x86 vector
17207/// shuffle or splits it into two 128-bit shuffles and fuses the results back
17208/// together based on the available instructions.
17209static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
17210 SDValue V1, SDValue V2, const APInt &Zeroable,
17211 const X86Subtarget &Subtarget,
17212 SelectionDAG &DAG) {
17213 // If we have a single input to the zero element, insert that into V1 if we
17214 // can do so cheaply.
17215 int NumElts = VT.getVectorNumElements();
17216 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17217
17218 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17219 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17220 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17221 return Insertion;
17222
17223 // Handle special cases where the lower or upper half is UNDEF.
17224 if (SDValue V =
17225 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17226 return V;
17227
17228 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
17229 // can check for those subtargets here and avoid much of the subtarget
17230 // querying in the per-vector-type lowering routines. With AVX1 we have
17231 // essentially *zero* ability to manipulate a 256-bit vector with integer
17232 // types. Since we'll use floating point types there eventually, just
17233 // immediately cast everything to a float and operate entirely in that domain.
17234 if (VT.isInteger() && !Subtarget.hasAVX2()) {
17235 int ElementBits = VT.getScalarSizeInBits();
17236 if (ElementBits < 32) {
17237 // No floating point type available, if we can't use the bit operations
17238 // for masking/blending then decompose into 128-bit vectors.
17239 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17240 Subtarget, DAG))
17241 return V;
17242 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17243 return V;
17244 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17245 }
17246
17247 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
17248 VT.getVectorNumElements());
17249 V1 = DAG.getBitcast(FpVT, V1);
17250 V2 = DAG.getBitcast(FpVT, V2);
17251 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
17252 }
17253
17254 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
17255 V1 = DAG.getBitcast(MVT::v16i16, V1);
17256 V2 = DAG.getBitcast(MVT::v16i16, V2);
17257 return DAG.getBitcast(VT,
17258 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
17259 }
17260
17261 switch (VT.SimpleTy) {
17262 case MVT::v4f64:
17263 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17264 case MVT::v4i64:
17265 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17266 case MVT::v8f32:
17267 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17268 case MVT::v8i32:
17269 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17270 case MVT::v16i16:
17271 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17272 case MVT::v32i8:
17273 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17274
17275 default:
17276 llvm_unreachable("Not a valid 256-bit x86 vector type!");
17277 }
17278}
17279
17280/// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
17281static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
17282 const APInt &Zeroable, SDValue V1, SDValue V2,
17283 const X86Subtarget &Subtarget,
17284 SelectionDAG &DAG) {
17285 assert(VT.getScalarSizeInBits() == 64 &&
17286 "Unexpected element type size for 128bit shuffle.");
17287
17288 // Handling a 256-bit vector requires VLX, and the function
17289 // lowerV2X128VectorShuffle() is most probably the better solution there.
17290 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
17291
17292 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
17293 SmallVector<int, 4> Widened128Mask;
17294 if (!canWidenShuffleElements(Mask, Widened128Mask))
17295 return SDValue();
17296 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
17297
17298 // Try to use an insert into a zero vector.
17299 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17300 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17301 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17302 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
17303 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17304 DAG.getVectorIdxConstant(0, DL));
17305 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17306 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17307 DAG.getVectorIdxConstant(0, DL));
17308 }
17309
17310 // Check for patterns which can be matched with a single insert of a 256-bit
17311 // subvector.
17312 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17313 if (OnlyUsesV1 ||
17314 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17315 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
17316 SDValue SubVec =
17317 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
17318 DAG.getVectorIdxConstant(0, DL));
17319 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17320 DAG.getVectorIdxConstant(4, DL));
17321 }
17322
17323 // See if this is an insertion of the lower 128-bits of V2 into V1.
17324 bool IsInsert = true;
17325 int V2Index = -1;
17326 for (int i = 0; i < 4; ++i) {
17327 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17328 if (Widened128Mask[i] < 0)
17329 continue;
17330
17331 // Make sure all V1 subvectors are in place.
17332 if (Widened128Mask[i] < 4) {
17333 if (Widened128Mask[i] != i) {
17334 IsInsert = false;
17335 break;
17336 }
17337 } else {
17338 // Make sure we only have a single V2 index and it's the lowest 128 bits.
17339 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17340 IsInsert = false;
17341 break;
17342 }
17343 V2Index = i;
17344 }
17345 }
17346 if (IsInsert && V2Index >= 0) {
17347 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17348 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17349 DAG.getVectorIdxConstant(0, DL));
17350 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17351 }
17352
17353 // See if we can widen to a 256-bit lane shuffle; we're going to lose the
17354 // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
17355 // widening where possible we at least ensure the lanes stay sequential to
17356 // help later combines.
17357 SmallVector<int, 2> Widened256Mask;
17358 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17359 Widened128Mask.clear();
17360 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17361 }
17362
17363 // Try to lower to vshuf64x2/vshuf32x4.
17364 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17365 int PermMask[4] = {-1, -1, -1, -1};
17366 // Ensure elements came from the same Op.
17367 for (int i = 0; i < 4; ++i) {
17368 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17369 if (Widened128Mask[i] < 0)
17370 continue;
17371
17372 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17373 unsigned OpIndex = i / 2;
17374 if (Ops[OpIndex].isUndef())
17375 Ops[OpIndex] = Op;
17376 else if (Ops[OpIndex] != Op)
17377 return SDValue();
17378
17379 PermMask[i] = Widened128Mask[i] % 4;
17380 }
17381
17382 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17383 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17384}
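// Editor's illustrative sketch (not part of X86ISelLowering.cpp): how the
// final vshuf64x2/vshuf32x4 immediate is formed from the widened 128-bit-lane
// mask computed above. Each 2-bit field of the immediate selects one of the
// four 128-bit lanes of the operand feeding that half of the result. For
// example, Widened128Mask {0, 1, 4, 5} (low half from V1, high half from V2)
// gives PermMask {0, 1, 0, 1} and the immediate 0x44.
static unsigned encodeShuf128Imm(const int PermMask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = PermMask[i] < 0 ? 0 : PermMask[i]; // An undef lane may pick lane 0.
    Imm |= (unsigned)(M & 0x3) << (2 * i);
  }
  return Imm;
}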
17385
17386/// Handle lowering of 8-lane 64-bit floating point shuffles.
17387static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17388 const APInt &Zeroable, SDValue V1, SDValue V2,
17389 const X86Subtarget &Subtarget,
17390 SelectionDAG &DAG) {
17391 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17392 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17393 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17394
17395 if (V2.isUndef()) {
17396 // Use low duplicate instructions for masks that match their pattern.
17397 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17398 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17399
17400 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17401 // Non-half-crossing single input shuffles can be lowered with an
17402 // interleaved permutation.
17403 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17404 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17405 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17406 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17407 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17408 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17409 }
17410
17411 SmallVector<int, 4> RepeatedMask;
17412 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17413 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17414 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17415 }
17416
17417 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17418 V2, Subtarget, DAG))
17419 return Shuf128;
17420
17421 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17422 return Unpck;
17423
17424 // Check if the blend happens to exactly fit that of SHUFPD.
17425 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17426 Zeroable, Subtarget, DAG))
17427 return Op;
17428
17429 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17430 Subtarget, DAG))
17431 return V;
17432
17433 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17434 Zeroable, Subtarget, DAG))
17435 return Blend;
17436
17437 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17438}
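// Editor's illustrative sketch (not part of X86ISelLowering.cpp): the
// VPERMILPD immediate built above for a non-lane-crossing single-input v8f64
// shuffle. Result element i must come from its own 128-bit pair
// {i & ~1, i | 1}; bit i of the immediate is set exactly when the mask picks
// the odd (high) element of that pair. This generalises the explicit
// (Mask[k] == ...) terms written out in the code.
static unsigned encodeVPermilPDImmV8F64(const int Mask[8]) {
  unsigned Imm = 0;
  for (int i = 0; i != 8; ++i)
    if (Mask[i] == (i | 1)) // Undef (-1) and even picks leave the bit clear.
      Imm |= 1u << i;
  return Imm;
}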
17439
17440/// Handle lowering of 16-lane 32-bit floating point shuffles.
17441static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17442 const APInt &Zeroable, SDValue V1, SDValue V2,
17443 const X86Subtarget &Subtarget,
17444 SelectionDAG &DAG) {
17445 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17446 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17447 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17448
17449 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17450 // options to efficiently lower the shuffle.
17451 SmallVector<int, 4> RepeatedMask;
17452 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17453 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17454
17455 // Use even/odd duplicate instructions for masks that match their pattern.
17456 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17457 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17458 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17459 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17460
17461 if (V2.isUndef())
17462 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17463 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17464
17465 // Use dedicated unpack instructions for masks that match their pattern.
17466 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17467 return V;
17468
17469 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17470 Zeroable, Subtarget, DAG))
17471 return Blend;
17472
17473 // Otherwise, fall back to a SHUFPS sequence.
17474 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17475 }
17476
17477 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17478 Zeroable, Subtarget, DAG))
17479 return Blend;
17480
17481 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17482 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17483 return DAG.getBitcast(MVT::v16f32, ZExt);
17484
17485 // Try to create an in-lane repeating shuffle mask and then shuffle the
17486 // results into the target lanes.
17487 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17488 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17489 return V;
17490
17491 // If we have a single input shuffle with different shuffle patterns in the
17492 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17493 if (V2.isUndef() &&
17494 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17495 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17496 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17497 }
17498
17499 // If we have AVX512F support, we can use VEXPAND.
17500 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17501 Zeroable, Subtarget, DAG))
17502 return V;
17503
17504 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17505}
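// Editor's illustrative sketch (not part of X86ISelLowering.cpp): the
// straightforward shape a single SHUFPS can cover. SHUFPS takes its two low
// result elements from the first source and its two high result elements from
// the second, so a 4-element repeated mask of the form {a, b, 4+c, 4+d}
// (a..d in 0..3) maps directly to the immediate a | b<<2 | c<<4 | d<<6.
// Other arrangements need the operand commuting/duplication that the real
// lowering performs; the helper name is the editor's own.
static bool encodeSimpleShufpsImm(const int RepeatedMask[4], unsigned &Imm) {
  int M0 = RepeatedMask[0], M1 = RepeatedMask[1];
  int M2 = RepeatedMask[2], M3 = RepeatedMask[3];
  if (M0 < 0 || M0 >= 4 || M1 < 0 || M1 >= 4 || M2 < 4 || M2 >= 8 || M3 < 4 ||
      M3 >= 8)
    return false; // Not the simple "low from V1, high from V2" shape.
  Imm = (unsigned)M0 | (unsigned)M1 << 2 | (unsigned)(M2 - 4) << 4 |
        (unsigned)(M3 - 4) << 6;
  return true;
}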
17506
17507/// Handle lowering of 8-lane 64-bit integer shuffles.
17508static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17509 const APInt &Zeroable, SDValue V1, SDValue V2,
17510 const X86Subtarget &Subtarget,
17511 SelectionDAG &DAG) {
17512 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17513 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17514 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17515
17516 // Try to use shift instructions if fast.
17517 if (Subtarget.preferLowerShuffleAsShift())
17518 if (SDValue Shift =
17519 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17520 Subtarget, DAG, /*BitwiseOnly*/ true))
17521 return Shift;
17522
17523 if (V2.isUndef()) {
17524 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17525 // can use lower latency instructions that will operate on all four
17526 // 128-bit lanes.
17527 SmallVector<int, 2> Repeated128Mask;
17528 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17529 SmallVector<int, 4> PSHUFDMask;
17530 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17531 return DAG.getBitcast(
17532 MVT::v8i64,
17533 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17534 DAG.getBitcast(MVT::v16i32, V1),
17535 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17536 }
17537
17538 SmallVector<int, 4> Repeated256Mask;
17539 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17540 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17541 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17542 }
17543
17544 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17545 V2, Subtarget, DAG))
17546 return Shuf128;
17547
17548 // Try to use shift instructions.
17549 if (SDValue Shift =
17550 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17551 DAG, /*BitwiseOnly*/ false))
17552 return Shift;
17553
17554 // Try to use VALIGN.
17555 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17556 Zeroable, Subtarget, DAG))
17557 return Rotate;
17558
17559 // Try to use PALIGNR.
17560 if (Subtarget.hasBWI())
17561 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17562 Subtarget, DAG))
17563 return Rotate;
17564
17565 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17566 return Unpck;
17567
17568 // If we have AVX512F support, we can use VEXPAND.
17569 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17570 Subtarget, DAG))
17571 return V;
17572
17573 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17574 Zeroable, Subtarget, DAG))
17575 return Blend;
17576
17577 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17578}
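// Editor's illustrative sketch (not part of X86ISelLowering.cpp): the mask
// narrowing used above when a v8i64 shuffle repeats the same 2-element
// pattern in every 128-bit lane. Each qword index M expands to the dword pair
// {2*M, 2*M+1}, producing a 4-element mask that a single PSHUFD on v16i32 can
// realise. For example, the repeated qword mask {1, 0} becomes {2, 3, 0, 1}.
static void narrowQWordMaskToDWords(const int Repeated128Mask[2],
                                    int PSHUFDMask[4]) {
  for (int i = 0; i != 2; ++i) {
    int M = Repeated128Mask[i];
    for (int j = 0; j != 2; ++j)
      PSHUFDMask[2 * i + j] = M < 0 ? -1 : 2 * M + j;
  }
}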
17579
17580/// Handle lowering of 16-lane 32-bit integer shuffles.
17581static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17582 const APInt &Zeroable, SDValue V1, SDValue V2,
17583 const X86Subtarget &Subtarget,
17584 SelectionDAG &DAG) {
17585 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17586 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17587 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17588
17589 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17590
17591 // Whenever we can lower this as a zext, that instruction is strictly faster
17592 // than any alternative. It also allows us to fold memory operands into the
17593 // shuffle in many cases.
17594 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17595 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17596 return ZExt;
17597
17598 // Try to use shift instructions if fast.
17599 if (Subtarget.preferLowerShuffleAsShift()) {
17600 if (SDValue Shift =
17601 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17602 Subtarget, DAG, /*BitwiseOnly*/ true))
17603 return Shift;
17604 if (NumV2Elements == 0)
17605 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17606 Subtarget, DAG))
17607 return Rotate;
17608 }
17609
17610 // If the shuffle mask is repeated in each 128-bit lane we can use more
17611 // efficient instructions that mirror the shuffles across the four 128-bit
17612 // lanes.
17613 SmallVector<int, 4> RepeatedMask;
17614 bool Is128BitLaneRepeatedShuffle =
17615 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17616 if (Is128BitLaneRepeatedShuffle) {
17617 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17618 if (V2.isUndef())
17619 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17620 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17621
17622 // Use dedicated unpack instructions for masks that match their pattern.
17623 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17624 return V;
17625 }
17626
17627 // Try to use shift instructions.
17628 if (SDValue Shift =
17629 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17630 Subtarget, DAG, /*BitwiseOnly*/ false))
17631 return Shift;
17632
17633 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17634 if (SDValue Rotate =
17635 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17636 return Rotate;
17637
17638 // Try to use VALIGN.
17639 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17640 Zeroable, Subtarget, DAG))
17641 return Rotate;
17642
17643 // Try to use byte rotation instructions.
17644 if (Subtarget.hasBWI())
17645 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17646 Subtarget, DAG))
17647 return Rotate;
17648
17649 // Assume that a single SHUFPS is faster than using a permv shuffle.
17650 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17651 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17652 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17653 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17654 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17655 CastV1, CastV2, DAG);
17656 return DAG.getBitcast(MVT::v16i32, ShufPS);
17657 }
17658
17659 // Try to create an in-lane repeating shuffle mask and then shuffle the
17660 // results into the target lanes.
17661 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17662 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17663 return V;
17664
17665 // If we have AVX512F support, we can use VEXPAND.
17666 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17667 Zeroable, Subtarget, DAG))
17668 return V;
17669
17670 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17671 Zeroable, Subtarget, DAG))
17672 return Blend;
17673
17674 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17675}
17676
17677/// Handle lowering of 32-lane 16-bit integer shuffles.
17678static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17679 const APInt &Zeroable, SDValue V1, SDValue V2,
17680 const X86Subtarget &Subtarget,
17681 SelectionDAG &DAG) {
17682 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17683 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17684 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17685 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17686
17687 // Whenever we can lower this as a zext, that instruction is strictly faster
17688 // than any alternative. It also allows us to fold memory operands into the
17689 // shuffle in many cases.
17690 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17691 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17692 return ZExt;
17693
17694 // Use dedicated unpack instructions for masks that match their pattern.
17695 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17696 return V;
17697
17698 // Use dedicated pack instructions for masks that match their pattern.
17699 if (SDValue V =
17700 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17701 return V;
17702
17703 // Try to use shift instructions.
17704 if (SDValue Shift =
17705 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17706 Subtarget, DAG, /*BitwiseOnly*/ false))
17707 return Shift;
17708
17709 // Try to use byte rotation instructions.
17710 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17711 Subtarget, DAG))
17712 return Rotate;
17713
17714 if (V2.isUndef()) {
17715 // Try to use bit rotation instructions.
17716 if (SDValue Rotate =
17717 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17718 return Rotate;
17719
17720 SmallVector<int, 8> RepeatedMask;
17721 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17722 // As this is a single-input shuffle, the repeated mask should be
17723 // a strictly valid v8i16 mask that we can pass through to the v8i16
17724 // lowering to handle even the v32 case.
17725 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17726 RepeatedMask, Subtarget, DAG);
17727 }
17728 }
17729
17730 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17731 Zeroable, Subtarget, DAG))
17732 return Blend;
17733
17734 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17735 Zeroable, Subtarget, DAG))
17736 return PSHUFB;
17737
17738 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17739 // shuffle.
17740 if (!V2.isUndef())
17741 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17742 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17743 return Result;
17744
17745 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17746}
17747
17748/// Handle lowering of 64-lane 8-bit integer shuffles.
17749static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17750 const APInt &Zeroable, SDValue V1, SDValue V2,
17751 const X86Subtarget &Subtarget,
17752 SelectionDAG &DAG) {
17753 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17754 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17755 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17756 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17757
17758 // Whenever we can lower this as a zext, that instruction is strictly faster
17759 // than any alternative. It also allows us to fold memory operands into the
17760 // shuffle in many cases.
17761 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17762 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17763 return ZExt;
17764
17765 // Use dedicated unpack instructions for masks that match their pattern.
17766 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17767 return V;
17768
17769 // Use dedicated pack instructions for masks that match their pattern.
17770 if (SDValue V =
17771 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17772 return V;
17773
17774 // Try to use shift instructions.
17775 if (SDValue Shift =
17776 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17777 DAG, /*BitwiseOnly*/ false))
17778 return Shift;
17779
17780 // Try to use byte rotation instructions.
17781 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17782 Subtarget, DAG))
17783 return Rotate;
17784
17785 // Try to use bit rotation instructions.
17786 if (V2.isUndef())
17787 if (SDValue Rotate =
17788 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17789 return Rotate;
17790
17791 // Lower as AND if possible.
17792 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17793 Zeroable, Subtarget, DAG))
17794 return Masked;
17795
17796 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17797 Zeroable, Subtarget, DAG))
17798 return PSHUFB;
17799
17800 // Try to create an in-lane repeating shuffle mask and then shuffle the
17801 // results into the target lanes.
17802 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17803 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17804 return V;
17805
17806 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17807 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17808 return Result;
17809
17810 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17811 Zeroable, Subtarget, DAG))
17812 return Blend;
17813
17814 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17815 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17816 // PALIGNR will be cheaper than the second PSHUFB+OR.
17817 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17818 Mask, Subtarget, DAG))
17819 return V;
17820
17821 // If we can't directly blend but can use PSHUFB, that will be better as it
17822 // can both shuffle and set up the inefficient blend.
17823 bool V1InUse, V2InUse;
17824 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17825 DAG, V1InUse, V2InUse);
17826 }
17827
17828 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17829 // shuffle.
17830 if (!V2.isUndef())
17831 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17832 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17833 return Result;
17834
17835 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17836 if (Subtarget.hasVBMI())
17837 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17838
17839 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17840}
17841
17842/// High-level routine to lower various 512-bit x86 vector shuffles.
17843///
17844/// This routine either breaks down the specific type of a 512-bit x86 vector
17845/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17846/// together based on the available instructions.
17847static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17848 MVT VT, SDValue V1, SDValue V2,
17849 const APInt &Zeroable,
17850 const X86Subtarget &Subtarget,
17851 SelectionDAG &DAG) {
17852 assert(Subtarget.hasAVX512() &&
17853 "Cannot lower 512-bit vectors w/ basic ISA!");
17854
17855 // If we have a single input to the zero element, insert that into V1 if we
17856 // can do so cheaply.
17857 int NumElts = Mask.size();
17858 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17859
17860 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17861 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17862 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17863 return Insertion;
17864
17865 // Handle special cases where the lower or upper half is UNDEF.
17866 if (SDValue V =
17867 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17868 return V;
17869
17870 // Check for being able to broadcast a single element.
17871 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17872 Subtarget, DAG))
17873 return Broadcast;
17874
17875 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17876 // Try using bit ops for masking and blending before falling back to
17877 // splitting.
17878 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17879 Subtarget, DAG))
17880 return V;
17881 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17882 return V;
17883
17884 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17885 }
17886
17887 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17888 if (!Subtarget.hasBWI())
17889 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17890 /*SimpleOnly*/ false);
17891
17892 V1 = DAG.getBitcast(MVT::v32i16, V1);
17893 V2 = DAG.getBitcast(MVT::v32i16, V2);
17894 return DAG.getBitcast(VT,
17895 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17896 }
17897
17898 // Dispatch to each element type for lowering. If we don't have support for
17899 // specific element type shuffles at 512 bits, immediately split them and
17900 // lower them. Each lowering routine of a given type is allowed to assume that
17901 // the requisite ISA extensions for that element type are available.
17902 switch (VT.SimpleTy) {
17903 case MVT::v8f64:
17904 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17905 case MVT::v16f32:
17906 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17907 case MVT::v8i64:
17908 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17909 case MVT::v16i32:
17910 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17911 case MVT::v32i16:
17912 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17913 case MVT::v64i8:
17914 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17915
17916 default:
17917 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17918 }
17919}
17920
17921static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17922 MVT VT, SDValue V1, SDValue V2,
17923 const X86Subtarget &Subtarget,
17924 SelectionDAG &DAG) {
17925 // Shuffle should be unary.
17926 if (!V2.isUndef())
17927 return SDValue();
17928
17929 int ShiftAmt = -1;
17930 int NumElts = Mask.size();
17931 for (int i = 0; i != NumElts; ++i) {
17932 int M = Mask[i];
17933 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17934 "Unexpected mask index.");
17935 if (M < 0)
17936 continue;
17937
17938 // The first non-undef element determines our shift amount.
17939 if (ShiftAmt < 0) {
17940 ShiftAmt = M - i;
17941 // Need to be shifting right.
17942 if (ShiftAmt <= 0)
17943 return SDValue();
17944 }
17945 // All non-undef elements must shift by the same amount.
17946 if (ShiftAmt != M - i)
17947 return SDValue();
17948 }
17949 assert(ShiftAmt >= 0 && "All undef?");
17950
17951 // Great, we found a right shift.
17952 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17953 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17954 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17955 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17956 DAG.getVectorIdxConstant(0, DL));
17957}
17958
17959// Determine if this shuffle can be implemented with a KSHIFT instruction.
17960// Returns the shift amount if possible or -1 if not. This is a simplified
17961// version of matchShuffleAsShift.
17962static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17963 int MaskOffset, const APInt &Zeroable) {
17964 int Size = Mask.size();
17965
17966 auto CheckZeros = [&](int Shift, bool Left) {
17967 for (int j = 0; j < Shift; ++j)
17968 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17969 return false;
17970
17971 return true;
17972 };
17973
17974 auto MatchShift = [&](int Shift, bool Left) {
17975 unsigned Pos = Left ? Shift : 0;
17976 unsigned Low = Left ? 0 : Shift;
17977 unsigned Len = Size - Shift;
17978 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17979 };
17980
17981 for (int Shift = 1; Shift != Size; ++Shift)
17982 for (bool Left : {true, false})
17983 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17984 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17985 return Shift;
17986 }
17987
17988 return -1;
17989}
17990
17991
17992// Lower vXi1 vector shuffles.
17993// There is no dedicated instruction on AVX-512 that shuffles the masks.
17994// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17995// vector, shuffle it and then truncate it back.
17996static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17997 MVT VT, SDValue V1, SDValue V2,
17998 const APInt &Zeroable,
17999 const X86Subtarget &Subtarget,
18000 SelectionDAG &DAG) {
18001 assert(Subtarget.hasAVX512() &&
18002 "Cannot lower 512-bit vectors w/o basic ISA!");
18003
18004 int NumElts = Mask.size();
18005 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18006
18007 // Try to recognize shuffles that are just padding a subvector with zeros.
18008 int SubvecElts = 0;
18009 int Src = -1;
18010 for (int i = 0; i != NumElts; ++i) {
18011 if (Mask[i] >= 0) {
18012 // Grab the source from the first valid mask. All subsequent elements need
18013 // to use this same source.
18014 if (Src < 0)
18015 Src = Mask[i] / NumElts;
18016 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
18017 break;
18018 }
18019
18020 ++SubvecElts;
18021 }
18022 assert(SubvecElts != NumElts && "Identity shuffle?");
18023
18024 // Clip to a power of 2.
18025 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
18026
18027 // Make sure the number of zeroable bits in the top at least covers the bits
18028 // not covered by the subvector.
18029 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
18030 assert(Src >= 0 && "Expected a source!");
18031 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
18032 SDValue Extract =
18033 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
18034 DAG.getVectorIdxConstant(0, DL));
18035 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18036 DAG.getConstant(0, DL, VT), Extract,
18037 DAG.getVectorIdxConstant(0, DL));
18038 }
18039
18040 // Try a simple shift right with undef elements. Later we'll try with zeros.
18041 if (SDValue Shift =
18042 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
18043 return Shift;
18044
18045 // Try to match KSHIFTs.
18046 unsigned Offset = 0;
18047 for (SDValue V : {V1, V2}) {
18048 unsigned Opcode;
18049 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
18050 if (ShiftAmt >= 0) {
18051 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
18052 MVT WideVT = Res.getSimpleValueType();
18053 // Widened right shifts need two shifts to ensure we shift in zeroes.
18054 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
18055 int WideElts = WideVT.getVectorNumElements();
18056 // Shift left to put the original vector in the MSBs of the new size.
18057 Res =
18058 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
18059 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
18060 // Increase the shift amount to account for the left shift.
18061 ShiftAmt += WideElts - NumElts;
18062 }
18063
18064 Res = DAG.getNode(Opcode, DL, WideVT, Res,
18065 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
18066 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
18067 DAG.getVectorIdxConstant(0, DL));
18068 }
18069 Offset += NumElts; // Increment for next iteration.
18070 }
18071
18072 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
18073 // ops instead.
18074 // TODO: What other unary shuffles would benefit from this?
18075 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
18076 SDValue Op0 = V1.getOperand(0);
18077 SDValue Op1 = V1.getOperand(1);
18078 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
18079 EVT OpVT = Op0.getValueType();
18080 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
18081 return DAG.getSetCC(
18082 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
18083 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
18084 }
18085
18086 MVT ExtVT;
18087 switch (VT.SimpleTy) {
18088 default:
18089 llvm_unreachable("Expected a vector of i1 elements");
18090 case MVT::v2i1:
18091 ExtVT = MVT::v2i64;
18092 break;
18093 case MVT::v4i1:
18094 ExtVT = MVT::v4i32;
18095 break;
18096 case MVT::v8i1:
18097 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
18098 // shuffle.
18099 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
18100 break;
18101 case MVT::v16i1:
18102 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18103 // 256-bit operation available.
18104 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
18105 break;
18106 case MVT::v32i1:
18107 // Take 512-bit type, unless we are avoiding 512-bit types and have the
18108 // 256-bit operation available.
18109 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
18110 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
18111 break;
18112 case MVT::v64i1:
18113 // Fall back to scalarization. FIXME: We can do better if the shuffle
18114 // can be partitioned cleanly.
18115 if (!Subtarget.useBWIRegs())
18116 return SDValue();
18117 ExtVT = MVT::v64i8;
18118 break;
18119 }
18120
18121 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
18122 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
18123
18124 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
18125 // i1 was sign-extended, so convert back to a mask with a signed compare
18125 // against zero (the sign bit being set yields true).
18126 int NumElems = VT.getVectorNumElements();
18127 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
18128 (Subtarget.hasDQI() && (NumElems < 32)))
18129 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
18130 Shuffle, ISD::SETGT);
18131
18132 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
18133}
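// Editor's illustrative sketch (not part of X86ISelLowering.cpp): the shift
// amount adjustment used above when a narrow mask vector (say v8i1) must be
// widened to a legal width (say v16i1) before KSHIFTR can be used. Shifting
// the widened register right would pull the undefined upper bits into the
// result, so the value is first shifted left to park the original bits in the
// MSBs, and the right-shift amount grows by the same padding.
static void computeWidenedKShiftAmounts(int NumElts, int WideElts, int ShiftAmt,
                                        int &LeftShiftAmt, int &RightShiftAmt) {
  int Padding = WideElts - NumElts;   // e.g. 16 - 8 = 8 for v8i1 -> v16i1.
  LeftShiftAmt = Padding;             // KSHIFTL by the padding first...
  RightShiftAmt = ShiftAmt + Padding; // ...then KSHIFTR by the adjusted amount.
}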
18134
18135/// Helper function that returns true if the shuffle mask should be
18136/// commuted to improve canonicalization.
18137static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
18138 int NumElements = Mask.size();
18139
18140 int NumV1Elements = 0, NumV2Elements = 0;
18141 for (int M : Mask)
18142 if (M < 0)
18143 continue;
18144 else if (M < NumElements)
18145 ++NumV1Elements;
18146 else
18147 ++NumV2Elements;
18148
18149 // Commute the shuffle as needed such that more elements come from V1 than
18150 // V2. This allows us to match the shuffle pattern strictly on how many
18151 // elements come from V1 without handling the symmetric cases.
18152 if (NumV2Elements > NumV1Elements)
18153 return true;
18154
18155 assert(NumV1Elements > 0 && "No V1 indices");
18156
18157 if (NumV2Elements == 0)
18158 return false;
18159
18160 // When the number of V1 and V2 elements are the same, try to minimize the
18161 // number of uses of V2 in the low half of the vector. When that is tied,
18162 // ensure that the sum of indices for V1 is equal to or lower than the sum of
18163 // indices for V2. When those are equal, try to ensure that the number of odd
18164 // indices for V1 is lower than the number of odd indices for V2.
18165 if (NumV1Elements == NumV2Elements) {
18166 int LowV1Elements = 0, LowV2Elements = 0;
18167 for (int M : Mask.slice(0, NumElements / 2))
18168 if (M >= NumElements)
18169 ++LowV2Elements;
18170 else if (M >= 0)
18171 ++LowV1Elements;
18172 if (LowV2Elements > LowV1Elements)
18173 return true;
18174 if (LowV2Elements == LowV1Elements) {
18175 int SumV1Indices = 0, SumV2Indices = 0;
18176 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18177 if (Mask[i] >= NumElements)
18178 SumV2Indices += i;
18179 else if (Mask[i] >= 0)
18180 SumV1Indices += i;
18181 if (SumV2Indices < SumV1Indices)
18182 return true;
18183 if (SumV2Indices == SumV1Indices) {
18184 int NumV1OddIndices = 0, NumV2OddIndices = 0;
18185 for (int i = 0, Size = Mask.size(); i < Size; ++i)
18186 if (Mask[i] >= NumElements)
18187 NumV2OddIndices += i % 2;
18188 else if (Mask[i] >= 0)
18189 NumV1OddIndices += i % 2;
18190 if (NumV2OddIndices < NumV1OddIndices)
18191 return true;
18192 }
18193 }
18194 }
18195
18196 return false;
18197}
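// Editor's illustrative sketch (not part of X86ISelLowering.cpp): what
// commuting a shuffle does to its mask once the heuristic above asks for it.
// Swapping V1 and V2 means indices that referred to V1 (0..N-1) must now name
// the second operand (N..2N-1) and vice versa; undef entries stay undef.
// For N = 4, {4, 5, 0, -1} becomes {0, 1, 4, -1}.
static void commuteShuffleMask(int *Mask, int NumElts) {
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // Undef elements are unaffected.
    Mask[i] = M < NumElts ? M + NumElts : M - NumElts;
  }
}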
18198
18199static bool canCombineAsMaskOperation(SDValue V,
18200 const X86Subtarget &Subtarget) {
18201 if (!Subtarget.hasAVX512())
18202 return false;
18203
18204 if (!V.getValueType().isSimple())
18205 return false;
18206
18207 MVT VT = V.getSimpleValueType().getScalarType();
18208 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
18209 return false;
18210
18211 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
18212 // are preferable to blendw/blendvb/masked-mov.
18213 if ((VT == MVT::i16 || VT == MVT::i8) &&
18214 V.getSimpleValueType().getSizeInBits() < 512)
18215 return false;
18216
18217 auto HasMaskOperation = [&](SDValue V) {
18218 // TODO: Currently we only check a limited set of opcodes. We could probably
18219 // extend it to all binary operations by checking TLI.isBinOp().
18220 switch (V->getOpcode()) {
18221 default:
18222 return false;
18223 case ISD::ADD:
18224 case ISD::SUB:
18225 case ISD::AND:
18226 case ISD::XOR:
18227 case ISD::OR:
18228 case ISD::SMAX:
18229 case ISD::SMIN:
18230 case ISD::UMAX:
18231 case ISD::UMIN:
18232 case ISD::ABS:
18233 case ISD::SHL:
18234 case ISD::SRL:
18235 case ISD::SRA:
18236 case ISD::MUL:
18237 break;
18238 }
18239 if (!V->hasOneUse())
18240 return false;
18241
18242 return true;
18243 };
18244
18245 if (HasMaskOperation(V))
18246 return true;
18247
18248 return false;
18249}
18250
18251// Forward declaration.
18252static SDValue canonicalizeShuffleMaskWithHorizOp(
18253 MutableArrayRef<SDValue> Inputs, MutableArrayRef<int> Mask,
18254 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
18255 const X86Subtarget &Subtarget);
18256
18257 /// Top-level lowering for x86 vector shuffles.
18258///
18259/// This handles decomposition, canonicalization, and lowering of all x86
18260/// vector shuffles. Most of the specific lowering strategies are encapsulated
18261/// above in helper routines. The canonicalization attempts to widen shuffles
18262/// to involve fewer lanes of wider elements, consolidate symmetric patterns
18263/// s.t. only one of the two inputs needs to be tested, etc.
18264static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
18265 SelectionDAG &DAG) {
18266 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
18267 ArrayRef<int> OrigMask = SVOp->getMask();
18268 SDValue V1 = Op.getOperand(0);
18269 SDValue V2 = Op.getOperand(1);
18270 MVT VT = Op.getSimpleValueType();
18271 int NumElements = VT.getVectorNumElements();
18272 SDLoc DL(Op);
18273 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
18274
18275 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
18276 "Can't lower MMX shuffles");
18277
18278 bool V1IsUndef = V1.isUndef();
18279 bool V2IsUndef = V2.isUndef();
18280 if (V1IsUndef && V2IsUndef)
18281 return DAG.getUNDEF(VT);
18282
18283 // When we create a shuffle node we put the UNDEF node to second operand,
18284 // but in some cases the first operand may be transformed to UNDEF.
18285 // In this case we should just commute the node.
18286 if (V1IsUndef)
18287 return DAG.getCommutedVectorShuffle(*SVOp);
18288
18289 // Check for non-undef masks pointing at an undef vector and make the masks
18290 // undef as well. This makes it easier to match the shuffle based solely on
18291 // the mask.
18292 if (V2IsUndef &&
18293 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
18294 SmallVector<int, 8> NewMask(OrigMask);
18295 for (int &M : NewMask)
18296 if (M >= NumElements)
18297 M = -1;
18298 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
18299 }
18300
18301 // Check for illegal shuffle mask element index values.
18302 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
18303 (void)MaskUpperLimit;
18304 assert(llvm::all_of(OrigMask,
18305 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
18306 "Out of bounds shuffle index");
18307
18308 // We actually see shuffles that are entirely re-arrangements of a set of
18309 // zero inputs. This mostly happens while decomposing complex shuffles into
18310 // simple ones. Directly lower these as a buildvector of zeros.
18311 APInt KnownUndef, KnownZero;
18312 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
18313
18314 APInt Zeroable = KnownUndef | KnownZero;
18315 if (Zeroable.isAllOnes())
18316 return getZeroVector(VT, Subtarget, DAG, DL);
18317
18318 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
18319
18320 // Try to collapse shuffles into using a vector type with fewer elements but
18321 // wider element types. We cap this to not form integers or floating point
18322 // elements wider than 64 bits. It does not seem beneficial to form i128
18323 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
18324 SmallVector<int, 16> WidenedMask;
18325 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
18326 !canCombineAsMaskOperation(V1, Subtarget) &&
18327 !canCombineAsMaskOperation(V2, Subtarget) &&
18328 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
18329 // Shuffle mask widening should not interfere with a broadcast opportunity
18330 // by obfuscating the operands with bitcasts.
18331 // TODO: Avoid lowering directly from this top-level function: make this
18332 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
18333 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
18334 Subtarget, DAG))
18335 return Broadcast;
18336
18337 MVT NewEltVT = VT.isFloatingPoint()
18338 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
18339 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
18340 int NewNumElts = NumElements / 2;
18341 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18342 // Make sure that the new vector type is legal. For example, v2f64 isn't
18343 // legal on SSE1.
18344 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18345 if (V2IsZero) {
18346 // Modify the new Mask to take all zeros from the all-zero vector.
18347 // Choose indices that are blend-friendly.
18348 bool UsedZeroVector = false;
18349 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18350 "V2's non-undef elements are used?!");
18351 for (int i = 0; i != NewNumElts; ++i)
18352 if (WidenedMask[i] == SM_SentinelZero) {
18353 WidenedMask[i] = i + NewNumElts;
18354 UsedZeroVector = true;
18355 }
18356 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18357 // some elements to be undef.
18358 if (UsedZeroVector)
18359 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18360 }
18361 V1 = DAG.getBitcast(NewVT, V1);
18362 V2 = DAG.getBitcast(NewVT, V2);
18363 return DAG.getBitcast(
18364 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18365 }
18366 }
18367
18368 SmallVector<SDValue> Ops = {V1, V2};
18369 SmallVector<int> Mask(OrigMask);
18370
18371 // Canonicalize the shuffle with any horizontal ops inputs.
18372 // NOTE: This may update Ops and Mask.
18373 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18374 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18375 return DAG.getBitcast(VT, HOp);
18376
18377 V1 = DAG.getBitcast(VT, Ops[0]);
18378 V2 = DAG.getBitcast(VT, Ops[1]);
18379 assert(NumElements == (int)Mask.size() &&
18380 "canonicalizeShuffleMaskWithHorizOp "
18381 "shouldn't alter the shuffle mask size");
18382
18383 // Canonicalize zeros/ones/fp splat constants to ensure no undefs.
18384 // These will be materialized uniformly anyway, so make splat matching easier.
18385 // TODO: Allow all int constants?
18386 auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
18387 if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
18388 BitVector Undefs;
18389 if (SDValue Splat = BV->getSplatValue(&Undefs)) {
18390 if (Undefs.any() &&
18393 V = DAG.getBitcast(VT, DAG.getSplat(BV->getValueType(0), DL, Splat));
18394 }
18395 }
18396 }
18397 return V;
18398 };
18399 V1 = CanonicalizeConstant(V1);
18400 V2 = CanonicalizeConstant(V2);
18401
18402 // Commute the shuffle if it will improve canonicalization.
18403 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18404 ShuffleVectorSDNode::commuteMask(Mask);
18405 std::swap(V1, V2);
18406 }
18407
18408 // For each vector width, delegate to a specialized lowering routine.
18409 if (VT.is128BitVector())
18410 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18411
18412 if (VT.is256BitVector())
18413 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18414
18415 if (VT.is512BitVector())
18416 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18417
18418 if (Is1BitVector)
18419 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18420
18421 llvm_unreachable("Unimplemented!");
18422}
18423
18424// As legal vpcompress instructions depend on various AVX512 extensions, try to
18425// convert illegal vector sizes to legal ones to avoid expansion.
18426 static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18427 SelectionDAG &DAG) {
18428 assert(Subtarget.hasAVX512() &&
18429 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18430
18431 SDLoc DL(Op);
18432 SDValue Vec = Op.getOperand(0);
18433 SDValue Mask = Op.getOperand(1);
18434 SDValue Passthru = Op.getOperand(2);
18435
18436 EVT VecVT = Vec.getValueType();
18437 EVT ElementVT = VecVT.getVectorElementType();
18438 unsigned NumElements = VecVT.getVectorNumElements();
18439 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18440 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18441
18442 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18443 // compressed as 512-bit vectors in AVX512F.
18444 if (NumVecBits != 128 && NumVecBits != 256)
18445 return SDValue();
18446
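// For 32/64-bit elements, widen to a single 512-bit vector (zeroing the new
// mask lanes) so the compress maps onto the AVX512F VPCOMPRESSD/VPCOMPRESSQ
// patterns, then take the original width back from the low subvector.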
18447 if (NumElementBits == 32 || NumElementBits == 64) {
18448 unsigned NumLargeElements = 512 / NumElementBits;
18449 MVT LargeVecVT =
18450 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18451 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18452
18453 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18454 DAG, DL);
18455 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18456 Subtarget, DAG, DL);
18457 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18458 : widenSubVector(LargeVecVT, Passthru,
18459 /*ZeroNewElements=*/false,
18460 Subtarget, DAG, DL);
18461
18462 SDValue Compressed =
18463 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18464 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18465 DAG.getConstant(0, DL, MVT::i64));
18466 }
18467
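// For 8/16-bit element types, any-extend to 32/64-bit elements (512 bits in
// total) so the 512-bit compress is legal, then truncate the result back to
// the original element type.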
18468 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18469 VecVT == MVT::v16i16) {
18470 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18471 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18472
18473 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18474 Passthru = Passthru.isUndef()
18475 ? DAG.getUNDEF(LargeVecVT)
18476 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18477
18478 SDValue Compressed =
18479 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18480 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18481 }
18482
18483 return SDValue();
18484}
18485
18486/// Try to lower a VSELECT instruction to a vector shuffle.
18487 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18488 const X86Subtarget &Subtarget,
18489 SelectionDAG &DAG) {
18490 SDValue Cond = Op.getOperand(0);
18491 SDValue LHS = Op.getOperand(1);
18492 SDValue RHS = Op.getOperand(2);
18493 MVT VT = Op.getSimpleValueType();
18494
18495 // Only non-legal VSELECTs reach this lowering; convert those into generic
18496 // shuffles and re-use the shuffle lowering path for blends.
18497 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18498 SmallVector<int, 32> Mask;
18499 if (createShuffleMaskFromVSELECT(Mask, Cond))
18500 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18501 }
18502
18503 return SDValue();
18504}
18505
18506SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18507 SDValue Cond = Op.getOperand(0);
18508 SDValue LHS = Op.getOperand(1);
18509 SDValue RHS = Op.getOperand(2);
18510
18511 SDLoc dl(Op);
18512 MVT VT = Op.getSimpleValueType();
18513 if (isSoftF16(VT, Subtarget)) {
18514 MVT NVT = VT.changeVectorElementTypeToInteger();
18515 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18516 DAG.getBitcast(NVT, LHS),
18517 DAG.getBitcast(NVT, RHS)));
18518 }
18519
18520 // A vselect where all conditions and data are constants can be optimized into
18521 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18522 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18523 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18524 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18525 return SDValue();
18526
18527 // Try to lower this to a blend-style vector shuffle. This can handle all
18528 // constant condition cases.
18529 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18530 return BlendOp;
18531
18532 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18533 // with patterns on the mask registers on AVX-512.
18534 MVT CondVT = Cond.getSimpleValueType();
18535 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18536 if (CondEltSize == 1)
18537 return Op;
18538
18539 // Variable blends are only legal from SSE4.1 onward.
18540 if (!Subtarget.hasSSE41())
18541 return SDValue();
18542
18543 unsigned EltSize = VT.getScalarSizeInBits();
18544 unsigned NumElts = VT.getVectorNumElements();
18545
18546 // Expand v32i16/v64i8 without BWI.
18547 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18548 return SDValue();
18549
18550 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18551 // into an i1 condition so that we can use the mask-based 512-bit blend
18552 // instructions.
18553 if (VT.getSizeInBits() == 512) {
18554 // Build a mask by testing the condition against zero.
18555 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18556 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18557 DAG.getConstant(0, dl, CondVT),
18558 ISD::SETNE);
18559 // Now return a new VSELECT using the mask.
18560 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18561 }
18562
18563 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18564 if (CondEltSize != EltSize) {
18565 // If we don't have a sign splat, rely on the expansion.
18566 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18567 return SDValue();
18568
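// Each condition element is known to be a splat of its sign bit, so it is
// safe to re-size the condition to the data element width and retry as a
// regular VSELECT.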
18569 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18570 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18571 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18572 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18573 }
18574
18575 // v16i16/v32i8 selects without AVX2, if the condition and another operand
18576 // are free to split, then better to split before expanding the
18577 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18578 // TODO: This is very similar to narrowVectorSelect.
18579 // TODO: Add Load splitting to isFreeToSplitVector ?
18580 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18581 !Subtarget.hasXOP()) {
18582 bool FreeCond = isFreeToSplitVector(Cond, DAG);
18583 bool FreeLHS = isFreeToSplitVector(LHS, DAG) ||
18584 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18585 bool FreeRHS = isFreeToSplitVector(RHS, DAG) ||
18586 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18587 if (FreeCond && (FreeLHS || FreeRHS))
18588 return splitVectorOp(Op, DAG, dl);
18589 }
18590
18591 // Only some types will be legal on some subtargets. If we can emit a legal
18592 // VSELECT-matching blend, return Op, but if we need to expand, return
18593 // a null value.
18594 switch (VT.SimpleTy) {
18595 default:
18596 // Most of the vector types have blends past SSE4.1.
18597 return Op;
18598
18599 case MVT::v32i8:
18600 // The byte blends for AVX vectors were introduced only in AVX2.
18601 if (Subtarget.hasAVX2())
18602 return Op;
18603
18604 return SDValue();
18605
18606 case MVT::v8i16:
18607 case MVT::v16i16:
18608 case MVT::v8f16:
18609 case MVT::v16f16: {
18610 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18611 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18612 Cond = DAG.getBitcast(CastVT, Cond);
18613 LHS = DAG.getBitcast(CastVT, LHS);
18614 RHS = DAG.getBitcast(CastVT, RHS);
18615 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18616 return DAG.getBitcast(VT, Select);
18617 }
18618 }
18619}
18620
18621 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18622 MVT VT = Op.getSimpleValueType();
18623 SDValue Vec = Op.getOperand(0);
18624 SDValue Idx = Op.getOperand(1);
18625 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18626 SDLoc dl(Op);
18627
18628 if (!Vec.getSimpleValueType().is128BitVector())
18629 return SDValue();
18630
18631 if (VT.getSizeInBits() == 8) {
18632 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18633 // we're going to zero extend the register or fold the store.
18634 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18635 !X86::mayFoldIntoStore(Op))
18636 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18637 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18638 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18639
18640 unsigned IdxVal = Idx->getAsZExtVal();
18641 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18642 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18643 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18644 }
18645
18646 if (VT == MVT::f32) {
18647 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18648 // the result back to FR32 register. It's only worth matching if the
18649 // result has a single use which is a store or a bitcast to i32. And in
18650 // the case of a store, it's not worth it if the index is a constant 0,
18651 // because a MOVSSmr can be used instead, which is smaller and faster.
18652 if (!Op.hasOneUse())
18653 return SDValue();
18654 SDNode *User = *Op.getNode()->user_begin();
18655 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18656 (User->getOpcode() != ISD::BITCAST ||
18657 User->getValueType(0) != MVT::i32))
18658 return SDValue();
18659 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18660 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18661 return DAG.getBitcast(MVT::f32, Extract);
18662 }
18663
18664 if (VT == MVT::i32 || VT == MVT::i64)
18665 return Op;
18666
18667 return SDValue();
18668}
18669
18670/// Extract one bit from mask vector, like v16i1 or v8i1.
18671/// AVX-512 feature.
18672 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18673 const X86Subtarget &Subtarget) {
18674 SDValue Vec = Op.getOperand(0);
18675 SDLoc dl(Vec);
18676 MVT VecVT = Vec.getSimpleValueType();
18677 SDValue Idx = Op.getOperand(1);
18678 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18679 MVT EltVT = Op.getSimpleValueType();
18680
18681 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18682 "Unexpected vector type in ExtractBitFromMaskVector");
18683
18684 // A variable index can't be handled in mask registers,
18685 // so extend the vector to VR512/VR128.
18686 if (!IdxC) {
18687 unsigned NumElts = VecVT.getVectorNumElements();
18688 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18689 // than extending to 128/256-bit.
18690 if (NumElts == 1) {
18691 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18692 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18693 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18694 }
18695 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18696 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18697 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18698 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18699 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18700 }
18701
18702 unsigned IdxVal = IdxC->getZExtValue();
18703 if (IdxVal == 0) // the operation is legal
18704 return Op;
18705
18706 // Extend to natively supported kshift.
18707 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18708
18709 // Use kshiftr instruction to move to the lower element.
18710 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18711 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18712
18713 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18714 DAG.getVectorIdxConstant(0, dl));
18715}
18716
18717// Helper to find all the extracted elements from a vector.
18718 static APInt getExtractedDemandedElts(SDNode *N) {
18719 MVT VT = N->getSimpleValueType(0);
18720 unsigned NumElts = VT.getVectorNumElements();
18721 APInt DemandedElts = APInt::getZero(NumElts);
18722 for (SDNode *User : N->users()) {
18723 switch (User->getOpcode()) {
18724 case X86ISD::PEXTRB:
18725 case X86ISD::PEXTRW:
18726 case ISD::EXTRACT_VECTOR_ELT:
18727 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18728 DemandedElts.setAllBits();
18729 return DemandedElts;
18730 }
18731 DemandedElts.setBit(User->getConstantOperandVal(1));
18732 break;
18733 case ISD::BITCAST: {
18734 if (!User->getValueType(0).isSimple() ||
18735 !User->getValueType(0).isVector()) {
18736 DemandedElts.setAllBits();
18737 return DemandedElts;
18738 }
18739 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18740 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18741 break;
18742 }
18743 default:
18744 DemandedElts.setAllBits();
18745 return DemandedElts;
18746 }
18747 }
18748 return DemandedElts;
18749}
18750
18751SDValue
18752X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18753 SelectionDAG &DAG) const {
18754 SDLoc dl(Op);
18755 SDValue Vec = Op.getOperand(0);
18756 MVT VecVT = Vec.getSimpleValueType();
18757 SDValue Idx = Op.getOperand(1);
18758 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18759
18760 if (VecVT.getVectorElementType() == MVT::i1)
18761 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18762
18763 if (!IdxC) {
18764 // It's more profitable to go through memory (1 cycle throughput)
18765 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18766 // IACA tool was used to get performance estimation
18767 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18768 //
18769 // example : extractelement <16 x i8> %a, i32 %i
18770 //
18771 // Block Throughput: 3.00 Cycles
18772 // Throughput Bottleneck: Port5
18773 //
18774 // | Num Of | Ports pressure in cycles | |
18775 // | Uops | 0 - DV | 5 | 6 | 7 | |
18776 // ---------------------------------------------
18777 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18778 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18779 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18780 // Total Num Of Uops: 4
18781 //
18782 //
18783 // Block Throughput: 1.00 Cycles
18784 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18785 //
18786 // | | Ports pressure in cycles | |
18787 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18788 // ---------------------------------------------------------
18789 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18790 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18791 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18792 // Total Num Of Uops: 4
18793
18794 return SDValue();
18795 }
18796
18797 unsigned IdxVal = IdxC->getZExtValue();
18798
18799 // If this is a 256-bit vector result, first extract the 128-bit vector and
18800 // then extract the element from the 128-bit vector.
18801 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18802 // Get the 128-bit vector.
18803 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18804 MVT EltVT = VecVT.getVectorElementType();
18805
18806 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18807 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18808
18809 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18810 // this can be done with a mask.
18811 IdxVal &= ElemsPerChunk - 1;
18812 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18813 DAG.getVectorIdxConstant(IdxVal, dl));
18814 }
18815
18816 assert(VecVT.is128BitVector() && "Unexpected vector length");
18817
18818 MVT VT = Op.getSimpleValueType();
18819
18820 if (VT == MVT::i16) {
18821 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18822 // we're going to zero extend the register or fold the store (SSE41 only).
18823 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18824 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18825 if (Subtarget.hasFP16())
18826 return Op;
18827
18828 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18829 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18830 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18831 }
18832
18833 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18834 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18835 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18836 }
18837
18838 if (Subtarget.hasSSE41())
18839 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18840 return Res;
18841
18842 // Only extract a single element from a v16i8 source - determine the common
18843 // DWORD/WORD that all extractions share, and extract the sub-byte.
18844 // TODO: Add QWORD MOVQ extraction?
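// For example, if only bytes 4 and 5 of the source are ever extracted, an
// extract of byte 5 becomes an i16 extract of word 2 followed by a right
// shift of 8 bits and a truncate.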
18845 if (VT == MVT::i8) {
18846 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18847 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18848
18849 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18850 int DWordIdx = IdxVal / 4;
18851 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18852 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18853 DAG.getBitcast(MVT::v4i32, Vec),
18854 DAG.getVectorIdxConstant(DWordIdx, dl));
18855 int ShiftVal = (IdxVal % 4) * 8;
18856 if (ShiftVal != 0)
18857 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18858 DAG.getConstant(ShiftVal, dl, MVT::i8));
18859 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18860 }
18861
18862 int WordIdx = IdxVal / 2;
18863 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18864 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18865 DAG.getBitcast(MVT::v8i16, Vec),
18866 DAG.getVectorIdxConstant(WordIdx, dl));
18867 int ShiftVal = (IdxVal % 2) * 8;
18868 if (ShiftVal != 0)
18869 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18870 DAG.getConstant(ShiftVal, dl, MVT::i8));
18871 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18872 }
18873 }
18874
18875 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18876 if (IdxVal == 0)
18877 return Op;
18878
18879 // Shuffle the element to the lowest element, then movss or movsh.
18880 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18881 Mask[0] = static_cast<int>(IdxVal);
18882 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18883 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18884 DAG.getVectorIdxConstant(0, dl));
18885 }
18886
18887 if (VT.getSizeInBits() == 64) {
18888 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18889 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18890 // to match extract_elt for f64.
18891 if (IdxVal == 0)
18892 return Op;
18893
18894 // UNPCKHPD the element to the lowest double word, then movsd.
18895 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18896 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18897 int Mask[2] = { 1, -1 };
18898 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18899 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18900 DAG.getVectorIdxConstant(0, dl));
18901 }
18902
18903 return SDValue();
18904}
18905
18906/// Insert one bit to mask vector, like v16i1 or v8i1.
18907/// AVX-512 feature.
18908 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18909 const X86Subtarget &Subtarget) {
18910 SDLoc dl(Op);
18911 SDValue Vec = Op.getOperand(0);
18912 SDValue Elt = Op.getOperand(1);
18913 SDValue Idx = Op.getOperand(2);
18914 MVT VecVT = Vec.getSimpleValueType();
18915
18916 if (!isa<ConstantSDNode>(Idx)) {
18917 // Non-constant index. Extend source and destination,
18918 // insert the element and then truncate the result.
18919 unsigned NumElts = VecVT.getVectorNumElements();
18920 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18921 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18922 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18923 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18924 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18925 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18926 }
18927
18928 // Copy into a k-register, extract to v1i1 and insert_subvector.
18929 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18930 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18931}
18932
18933SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18934 SelectionDAG &DAG) const {
18935 MVT VT = Op.getSimpleValueType();
18936 MVT EltVT = VT.getVectorElementType();
18937 unsigned NumElts = VT.getVectorNumElements();
18938 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18939
18940 if (EltVT == MVT::i1)
18941 return InsertBitToMaskVector(Op, DAG, Subtarget);
18942
18943 SDLoc dl(Op);
18944 SDValue N0 = Op.getOperand(0);
18945 SDValue N1 = Op.getOperand(1);
18946 SDValue N2 = Op.getOperand(2);
18947 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18948
18949 if (EltVT == MVT::bf16) {
18950 MVT IVT = VT.changeVectorElementTypeToInteger();
18951 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18952 DAG.getBitcast(IVT, N0),
18953 DAG.getBitcast(MVT::i16, N1), N2);
18954 return DAG.getBitcast(VT, Res);
18955 }
18956
18957 if (!N2C) {
18958 // Variable insertion indices: usually we're better off spilling to stack,
18959 // but AVX512 can use a variable compare+select by comparing against all
18960 // possible vector indices, and FP insertion has less gpr->simd traffic.
18961 if (!(Subtarget.hasBWI() ||
18962 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18963 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18964 return SDValue();
18965
18966 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18967 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18968 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18969 return SDValue();
18970
18971 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18972 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18973 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18974
18975 SmallVector<SDValue, 16> RawIndices;
18976 for (unsigned I = 0; I != NumElts; ++I)
18977 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18978 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18979
18980 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18981 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18982 ISD::CondCode::SETEQ);
18983 }
18984
18985 if (N2C->getAPIntValue().uge(NumElts))
18986 return SDValue();
18987 uint64_t IdxVal = N2C->getZExtValue();
18988
18989 bool IsZeroElt = X86::isZeroNode(N1);
18990 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18991
18992 if (IsZeroElt || IsAllOnesElt) {
18993 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18994 // We don't deal with i8 0 since it appears to be handled elsewhere.
18995 if (IsAllOnesElt &&
18996 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18997 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18998 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18999 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
19000 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
19001 CstVectorElts[IdxVal] = OnesCst;
19002 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
19003 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
19004 }
19005 // See if we can do this more efficiently with a blend shuffle with a
19006 // rematerializable vector.
19007 if (Subtarget.hasSSE41() &&
19008 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
19009 SmallVector<int, 8> BlendMask;
19010 for (unsigned i = 0; i != NumElts; ++i)
19011 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19012 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
19013 : getOnesVector(VT, DAG, dl);
19014 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
19015 }
19016 }
19017
19018 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
19019 // into that, and then insert the subvector back into the result.
19020 if (VT.is256BitVector() || VT.is512BitVector()) {
19021 // With a 256-bit vector, we can insert into the zero element efficiently
19022 // using a blend if we have AVX or AVX2 and the right data type.
19023 if (VT.is256BitVector() && IdxVal == 0) {
19024 // TODO: It is worthwhile to cast integer to floating point and back
19025 // and incur a domain crossing penalty if that's what we'll end up
19026 // doing anyway after extracting to a 128-bit vector.
19027 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
19028 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
19029 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19030 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
19031 DAG.getTargetConstant(1, dl, MVT::i8));
19032 }
19033 }
19034
19035 unsigned NumEltsIn128 = 128 / EltSizeInBits;
19036 assert(isPowerOf2_32(NumEltsIn128) &&
19037 "Vectors will always have power-of-two number of elements.");
19038
19039 // If we are not inserting into the low 128-bit vector chunk,
19040 // then prefer the broadcast+blend sequence.
19041 // FIXME: relax the profitability check iff all N1 uses are insertions.
19042 if (IdxVal >= NumEltsIn128 &&
19043 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
19044 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
19045 X86::mayFoldLoad(N1, Subtarget)))) {
19046 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
19047 SmallVector<int, 8> BlendMask;
19048 for (unsigned i = 0; i != NumElts; ++i)
19049 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
19050 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
19051 }
19052
19053 // Get the desired 128-bit vector chunk.
19054 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
19055
19056 // Insert the element into the desired chunk.
19057 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
19058 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
19059
19060 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
19061 DAG.getVectorIdxConstant(IdxIn128, dl));
19062
19063 // Insert the changed part back into the bigger vector
19064 return insert128BitVector(N0, V, IdxVal, DAG, dl);
19065 }
19066 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
19067
19068 // This will be just movw/movd/movq/movsh/movss/movsd.
19069 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
19070 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
19071 EltVT == MVT::f16 || EltVT == MVT::i64) {
19072 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
19073 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19074 }
19075
19076 // We can't directly insert an i8 or i16 into a vector, so zero extend
19077 // it to i32 first.
19078 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
19079 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
19080 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
19081 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
19082 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
19083 return DAG.getBitcast(VT, N1);
19084 }
19085 }
19086
19087 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
19088 // argument. SSE41 is required for pinsrb.
19089 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
19090 unsigned Opc;
19091 if (VT == MVT::v8i16) {
19092 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
19093 Opc = X86ISD::PINSRW;
19094 } else {
19095 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
19096 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
19097 Opc = X86ISD::PINSRB;
19098 }
19099
19100 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
19101 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
19102 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
19103 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
19104 }
19105
19106 if (Subtarget.hasSSE41()) {
19107 if (EltVT == MVT::f32) {
19108 // Bits [7:6] of the constant are the source select. This will always be
19109 // zero here. The DAG Combiner may combine an extract_elt index into
19110 // these bits. For example (insert (extract, 3), 2) could be matched by
19111 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
19112 // Bits [5:4] of the constant are the destination select. This is the
19113 // value of the incoming immediate.
19114 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
19115 // combine either bitwise AND or insert of float 0.0 to set these bits.
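// For example, the (IdxVal << 4) immediate below places the destination
// lane in bits [5:4]; an immediate of 0x30 inserts source element 0 into
// destination element 3 with no zeroing.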
19116
19117 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
19118 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
19119 // If this is an insertion of 32-bits into the low 32-bits of
19120 // a vector, we prefer to generate a blend with immediate rather
19121 // than an insertps. Blends are simpler operations in hardware and so
19122 // will always have equal or better performance than insertps.
19123 // But if optimizing for size and there's a load folding opportunity,
19124 // generate insertps because blendps does not have a 32-bit memory
19125 // operand form.
19126 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19127 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
19128 DAG.getTargetConstant(1, dl, MVT::i8));
19129 }
19130 // Create this as a scalar to vector.
19131 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
19132 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
19133 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
19134 }
19135
19136 // PINSR* works with constant index.
19137 if (EltVT == MVT::i32 || EltVT == MVT::i64)
19138 return Op;
19139 }
19140
19141 return SDValue();
19142}
19143
19144 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
19145 SelectionDAG &DAG) {
19146 SDLoc dl(Op);
19147 MVT OpVT = Op.getSimpleValueType();
19148
19149 // It's always cheaper to replace a xor+movd with xorps and simplifies further
19150 // combines.
19151 if (X86::isZeroNode(Op.getOperand(0)))
19152 return getZeroVector(OpVT, Subtarget, DAG, dl);
19153
19154 // If this is a 256-bit vector result, first insert into a 128-bit
19155 // vector and then insert into the 256-bit vector.
19156 if (!OpVT.is128BitVector()) {
19157 // Insert into a 128-bit vector.
19158 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
19159 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
19160 OpVT.getVectorNumElements() / SizeFactor);
19161
19162 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
19163
19164 // Insert the 128-bit vector.
19165 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
19166 }
19167 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
19168 "Expected an SSE type!");
19169
19170 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
19171 // tblgen.
19172 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
19173 return Op;
19174
19175 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
19176 return DAG.getBitcast(
19177 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
19178}
19179
19180// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
19181// simple superregister reference or explicit instructions to insert
19182// the upper bits of a vector.
19183 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19184 SelectionDAG &DAG) {
19185 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
19186
19187 return insert1BitVector(Op, DAG, Subtarget);
19188}
19189
19190 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
19191 SelectionDAG &DAG) {
19192 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
19193 "Only vXi1 extract_subvectors need custom lowering");
19194
19195 SDLoc dl(Op);
19196 SDValue Vec = Op.getOperand(0);
19197 uint64_t IdxVal = Op.getConstantOperandVal(1);
19198
19199 if (IdxVal == 0) // the operation is legal
19200 return Op;
19201
19202 // Extend to natively supported kshift.
19203 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
19204
19205 // Shift to the LSB.
19206 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
19207 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
19208
19209 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
19210 DAG.getVectorIdxConstant(0, dl));
19211}
19212
19213// Returns the appropriate wrapper opcode for a global reference.
19214unsigned X86TargetLowering::getGlobalWrapperKind(
19215 const GlobalValue *GV, const unsigned char OpFlags) const {
19216 // References to absolute symbols are never PC-relative.
19217 if (GV && GV->isAbsoluteSymbolRef())
19218 return X86ISD::Wrapper;
19219
19220 // The following OpFlags under RIP-rel PIC use RIP.
19221 if (Subtarget.isPICStyleRIPRel() &&
19222 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
19223 OpFlags == X86II::MO_DLLIMPORT))
19224 return X86ISD::WrapperRIP;
19225
19226 // GOTPCREL references must always use RIP.
19227 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
19228 return X86ISD::WrapperRIP;
19229
19230 return X86ISD::Wrapper;
19231}
19232
19233// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
19234// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
19235 // one of the above-mentioned nodes. It has to be wrapped because otherwise
19236 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
19237 // be used to form addressing modes. These wrapped nodes will be selected
19238// into MOV32ri.
19239SDValue
19240X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
19241 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
19242
19243 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19244 // global base reg.
19245 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19246
19247 auto PtrVT = getPointerTy(DAG.getDataLayout());
19248 SDValue Result = DAG.getTargetConstantPool(
19249 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
19250 SDLoc DL(CP);
19251 Result =
19252 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19253 // With PIC, the address is actually $g + Offset.
19254 if (OpFlag) {
19255 Result =
19256 DAG.getNode(ISD::ADD, DL, PtrVT,
19257 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19258 }
19259
19260 return Result;
19261}
19262
19263SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
19264 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
19265
19266 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19267 // global base reg.
19268 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
19269
19270 EVT PtrVT = Op.getValueType();
19271 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
19272 SDLoc DL(JT);
19273 Result =
19274 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
19275
19276 // With PIC, the address is actually $g + Offset.
19277 if (OpFlag)
19278 Result =
19279 DAG.getNode(ISD::ADD, DL, PtrVT,
19280 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
19281
19282 return Result;
19283}
19284
19285SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
19286 SelectionDAG &DAG) const {
19287 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19288}
19289
19290SDValue
19291X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
19292 // Create the TargetBlockAddressAddress node.
19293 unsigned char OpFlags =
19294 Subtarget.classifyBlockAddressReference();
19295 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
19296 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
19297 SDLoc dl(Op);
19298 EVT PtrVT = Op.getValueType();
19299 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
19300 Result =
19301 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
19302
19303 // With PIC, the address is actually $g + Offset.
19304 if (isGlobalRelativeToPICBase(OpFlags)) {
19305 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19306 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19307 }
19308
19309 return Result;
19310}
19311
19312/// Creates target global address or external symbol nodes for calls or
19313/// other uses.
19314SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
19315 bool ForCall,
19316 bool *IsImpCall) const {
19317 // Unpack the global address or external symbol.
19318 SDLoc dl(Op);
19319 const GlobalValue *GV = nullptr;
19320 int64_t Offset = 0;
19321 const char *ExternalSym = nullptr;
19322 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
19323 GV = G->getGlobal();
19324 Offset = G->getOffset();
19325 } else {
19326 const auto *ES = cast<ExternalSymbolSDNode>(Op);
19327 ExternalSym = ES->getSymbol();
19328 }
19329
19330 // Calculate some flags for address lowering.
19331 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
19332 unsigned char OpFlags;
19333 if (ForCall)
19334 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
19335 else
19336 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
19337 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
19338 bool NeedsLoad = isGlobalStubReference(OpFlags);
19339
19340 CodeModel::Model M = DAG.getTarget().getCodeModel();
19341 EVT PtrVT = Op.getValueType();
19342 SDValue Result;
19343
19344 if (GV) {
19345 // Create a target global address if this is a global. If possible, fold the
19346 // offset into the global address reference. Otherwise, ADD it on later.
19347 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
19348 // allowed because if the address of foo is 0, the ELF R_X86_64_32
19349 // relocation will compute to a negative value, which is invalid.
19350 int64_t GlobalOffset = 0;
19351 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
19352 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
19353 std::swap(GlobalOffset, Offset);
19354 }
19355 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
19356 } else {
19357 // If this is not a global address, this must be an external symbol.
19358 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
19359 }
19360
19361 // If this is a direct call, avoid the wrapper if we don't need to do any
19362 // loads or adds. This allows SDAG ISel to match direct calls.
19363 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19364 return Result;
19365
19366 // If Import Call Optimization is enabled and this is an imported function
19367 // then make a note of it and return the global address without wrapping.
19368 if (IsImpCall && (OpFlags == X86II::MO_DLLIMPORT) &&
19369 Mod.getModuleFlag("import-call-optimization")) {
19370 assert(ForCall && "Should only enable import call optimization if we are "
19371 "lowering a call");
19372 *IsImpCall = true;
19373 return Result;
19374 }
19375
19376 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19377
19378 // With PIC, the address is actually $g + Offset.
19379 if (HasPICReg) {
19380 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19381 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19382 }
19383
19384 // For globals that require a load from a stub to get the address, emit the
19385 // load.
19386 if (NeedsLoad)
19387 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19388 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19389
19390 // If there was a non-zero offset that we didn't fold, create an explicit
19391 // addition for it.
19392 if (Offset != 0)
19393 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19394 DAG.getSignedConstant(Offset, dl, PtrVT));
19395
19396 return Result;
19397}
19398
19399SDValue
19400X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19401 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
19402}
19403
19404 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19405 const EVT PtrVT, unsigned ReturnReg,
19406 unsigned char OperandFlags,
19407 bool LoadGlobalBaseReg = false,
19408 bool LocalDynamic = false) {
19409 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19410 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19411 SDLoc dl(GA);
19412 SDValue TGA;
19413 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19414 SDValue Chain = DAG.getEntryNode();
19415 SDValue Ret;
19416 if (LocalDynamic && UseTLSDESC) {
19417 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19418 // Reuse existing GetTLSADDR node if we can find it.
19419 if (TGA->hasOneUse()) {
19420 // TLSDESC uses TGA.
19421 SDNode *TLSDescOp = *TGA->user_begin();
19422 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19423 "Unexpected TLSDESC DAG");
19424 // CALLSEQ_END uses TGA via a chain and glue.
19425 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19426 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19427 "Unexpected TLSDESC DAG");
19428 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19429 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19430 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19431 "Unexpected TLSDESC DAG");
19432 Ret = SDValue(CopyFromRegOp, 0);
19433 }
19434 } else {
19435 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19436 GA->getOffset(), OperandFlags);
19437 }
19438
19439 if (!Ret) {
19440 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19441 : LocalDynamic ? X86ISD::TLSBASEADDR
19442 : X86ISD::TLSADDR;
19443
19444 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19445 if (LoadGlobalBaseReg) {
19446 SDValue InGlue;
19447 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19448 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19449 InGlue);
19450 InGlue = Chain.getValue(1);
19451 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19452 } else {
19453 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19454 }
19455 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19456
19457 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19458 MFI.setHasCalls(true);
19459
19460 SDValue Glue = Chain.getValue(1);
19461 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19462 }
19463
19464 if (!UseTLSDESC)
19465 return Ret;
19466
19467 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19468 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19469
19470 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19471 SDValue Offset =
19472 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19473 MachinePointerInfo(Ptr));
19474 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19475}
19476
19477// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19478static SDValue
19479 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19480 const EVT PtrVT) {
19481 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19482 /*LoadGlobalBaseReg=*/true);
19483}
19484
19485// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19486static SDValue
19487 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19488 const EVT PtrVT) {
19489 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19490}
19491
19492// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19493static SDValue
19494 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19495 const EVT PtrVT) {
19496 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19497}
19498
19499 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19500 SelectionDAG &DAG, const EVT PtrVT,
19501 bool Is64Bit, bool Is64BitLP64) {
19502 SDLoc dl(GA);
19503
19504 // Get the start address of the TLS block for this module.
19505 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19506 .getInfo<X86MachineFunctionInfo>();
19507 MFI->incNumLocalDynamicTLSAccesses();
19508
19509 SDValue Base;
19510 if (Is64Bit) {
19511 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19512 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19513 /*LoadGlobalBaseReg=*/false,
19514 /*LocalDynamic=*/true);
19515 } else {
19516 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19517 /*LoadGlobalBaseReg=*/true,
19518 /*LocalDynamic=*/true);
19519 }
19520
19521 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19522 // of Base.
19523
19524 // Build x@dtpoff.
19525 unsigned char OperandFlags = X86II::MO_DTPOFF;
19526 unsigned WrapperKind = X86ISD::Wrapper;
19527 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19528 GA->getValueType(0),
19529 GA->getOffset(), OperandFlags);
19530 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19531
19532 // Add x@dtpoff with the base.
19533 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19534}
19535
19536// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19537 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19538 const EVT PtrVT, TLSModel::Model model,
19539 bool is64Bit, bool isPIC) {
19540 SDLoc dl(GA);
19541
19542 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19543 Value *Ptr = Constant::getNullValue(
19544 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19545
19546 SDValue ThreadPointer =
19547 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19548 MachinePointerInfo(Ptr));
19549
19550 unsigned char OperandFlags = 0;
19551 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19552 // initialexec.
19553 unsigned WrapperKind = X86ISD::Wrapper;
19554 if (model == TLSModel::LocalExec) {
19555 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19556 } else if (model == TLSModel::InitialExec) {
19557 if (is64Bit) {
19558 OperandFlags = X86II::MO_GOTTPOFF;
19559 WrapperKind = X86ISD::WrapperRIP;
19560 } else {
19561 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19562 }
19563 } else {
19564 llvm_unreachable("Unexpected model");
19565 }
19566
19567 // emit "addl x@ntpoff,%eax" (local exec)
19568 // or "addl x@indntpoff,%eax" (initial exec)
19569 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19570 SDValue TGA =
19571 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19572 GA->getOffset(), OperandFlags);
19573 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19574
19575 if (model == TLSModel::InitialExec) {
19576 if (isPIC && !is64Bit) {
19577 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19578 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19579 Offset);
19580 }
19581
19582 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19583 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19584 }
19585
19586 // The address of the thread local variable is the add of the thread
19587 // pointer with the offset of the variable.
19588 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19589}
19590
19591SDValue
19592X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19593
19594 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19595
19596 if (DAG.getTarget().useEmulatedTLS())
19597 return LowerToTLSEmulatedModel(GA, DAG);
19598
19599 const GlobalValue *GV = GA->getGlobal();
19600 EVT PtrVT = Op.getValueType();
19601 bool PositionIndependent = isPositionIndependent();
19602
19603 if (Subtarget.isTargetELF()) {
19604 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19605 switch (model) {
19606 case TLSModel::GeneralDynamic:
19607 if (Subtarget.is64Bit()) {
19608 if (Subtarget.isTarget64BitLP64())
19609 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19610 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19611 }
19612 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19613 case TLSModel::LocalDynamic:
19614 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19615 Subtarget.isTarget64BitLP64());
19616 case TLSModel::InitialExec:
19617 case TLSModel::LocalExec:
19618 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19619 PositionIndependent);
19620 }
19621 llvm_unreachable("Unknown TLS model.");
19622 }
19623
19624 if (Subtarget.isTargetDarwin()) {
19625 // Darwin only has one model of TLS. Lower to that.
19626 unsigned char OpFlag = 0;
19627 unsigned WrapperKind = 0;
19628
19629 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19630 // global base reg.
19631 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19632 if (PIC32) {
19633 OpFlag = X86II::MO_TLVP_PIC_BASE;
19634 WrapperKind = X86ISD::Wrapper;
19635 } else {
19636 OpFlag = X86II::MO_TLVP;
19637 WrapperKind = X86ISD::WrapperRIP;
19638 }
19639 SDLoc DL(Op);
19640 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19641 GA->getValueType(0),
19642 GA->getOffset(), OpFlag);
19643 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19644
19645 // With PIC32, the address is actually $g + Offset.
19646 if (PIC32)
19647 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19648 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19649 Offset);
19650
19651 // Lowering the machine isd will make sure everything is in the right
19652 // location.
19653 SDValue Chain = DAG.getEntryNode();
19654 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19655 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19656 SDValue Args[] = { Chain, Offset };
19657 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19658 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19659
19660 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19661 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19662 MFI.setAdjustsStack(true);
19663
19664 // And our return value (tls address) is in the standard call return value
19665 // location.
19666 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19667 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19668 }
19669
19670 if (Subtarget.isOSWindows()) {
19671 // Just use the implicit TLS architecture
19672 // Need to generate something similar to:
19673 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19674 // ; from TEB
19675 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19676 // mov rcx, qword [rdx+rcx*8]
19677 // mov eax, .tls$:tlsvar
19678 // [rax+rcx] contains the address
19679 // Windows 64bit: gs:0x58
19680 // Windows 32bit: fs:__tls_array
19681
19682 SDLoc dl(GA);
19683 SDValue Chain = DAG.getEntryNode();
19684
19685 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19686 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19687 // use its literal value of 0x2C.
19688 Value *Ptr = Constant::getNullValue(
19689 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19690 : PointerType::get(*DAG.getContext(), X86AS::FS));
19691
19692 SDValue TlsArray = Subtarget.is64Bit()
19693 ? DAG.getIntPtrConstant(0x58, dl)
19694 : (Subtarget.isTargetWindowsGNU()
19695 ? DAG.getIntPtrConstant(0x2C, dl)
19696 : DAG.getExternalSymbol("_tls_array", PtrVT));
19697
19698 SDValue ThreadPointer =
19699 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19700
19701 SDValue res;
19702 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19703 res = ThreadPointer;
19704 } else {
19705 // Load the _tls_index variable
19706 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19707 if (Subtarget.is64Bit())
19708 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19709 MachinePointerInfo(), MVT::i32);
19710 else
19711 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19712
19713 const DataLayout &DL = DAG.getDataLayout();
19714 SDValue Scale =
19715 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19716 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19717
19718 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19719 }
19720
19721 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19722
19723 // Get the offset of start of .tls section
19724 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19725 GA->getValueType(0),
19726 GA->getOffset(), X86II::MO_SECREL);
19727 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19728
19729 // The address of the thread local variable is the add of the thread
19730 // pointer with the offset of the variable.
19731 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19732 }
19733
19734 llvm_unreachable("TLS not implemented for this target.");
19735}
19736
19737 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19738 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19739 const TargetMachine &TM = getTargetMachine();
19740 TLSModel::Model Model = TM.getTLSModel(&GV);
19741 switch (Model) {
19742 case TLSModel::LocalExec:
19743 case TLSModel::InitialExec:
19744 // We can include the %fs segment register in addressing modes.
19745 return true;
19746 case TLSModel::GeneralDynamic:
19747 case TLSModel::LocalDynamic:
19748 // These models do not result in %fs relative addresses unless
19749 // TLS descriptors are used.
19750 //
19751 // Even in the case of TLS descriptors we currently have no way to model
19752 // the difference between %fs access and the computations needed for the
19753 // offset, and returning `true` for TLS-desc currently duplicates both,
19754 // which is detrimental :-/
19755 return false;
19756 }
19757 }
19758 return false;
19759}
19760
19761/// Lower SRA_PARTS and friends, which return two i32 values
19762/// and take a 2 x i32 value to shift plus a shift amount.
19763/// TODO: Can this be moved to general expansion code?
19764 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19765 SDValue Lo, Hi;
19766 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19767 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19768}
19769
19770// Try to use a packed vector operation to handle i64 on 32-bit targets when
19771// AVX512DQ is enabled.
19772 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19773 SelectionDAG &DAG,
19774 const X86Subtarget &Subtarget) {
19775 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19776 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19777 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19778 Op.getOpcode() == ISD::UINT_TO_FP) &&
19779 "Unexpected opcode!");
19780 bool IsStrict = Op->isStrictFPOpcode();
19781 unsigned OpNo = IsStrict ? 1 : 0;
19782 SDValue Src = Op.getOperand(OpNo);
19783 MVT SrcVT = Src.getSimpleValueType();
19784 MVT VT = Op.getSimpleValueType();
19785
19786 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19787 (VT != MVT::f32 && VT != MVT::f64))
19788 return SDValue();
19789
19790 // Pack the i64 into a vector, do the operation and extract.
19791
19792 // Using 256-bit to ensure result is 128-bits for f32 case.
19793 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19794 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19795 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19796
19797 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19798 if (IsStrict) {
19799 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19800 {Op.getOperand(0), InVec});
19801 SDValue Chain = CvtVec.getValue(1);
19802 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19803 DAG.getVectorIdxConstant(0, dl));
19804 return DAG.getMergeValues({Value, Chain}, dl);
19805 }
19806
19807 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19808
19809 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19810 DAG.getVectorIdxConstant(0, dl));
19811}
19812
19813// Try to use a packed vector operation to handle i64 on 32-bit targets.
19814 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19815 const X86Subtarget &Subtarget) {
19816 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19817 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19818 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19819 Op.getOpcode() == ISD::UINT_TO_FP) &&
19820 "Unexpected opcode!");
19821 bool IsStrict = Op->isStrictFPOpcode();
19822 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19823 MVT SrcVT = Src.getSimpleValueType();
19824 MVT VT = Op.getSimpleValueType();
19825
19826 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19827 return SDValue();
19828
19829 // Pack the i64 into a vector, do the operation and extract.
19830
19831 assert(Subtarget.hasFP16() && "Expected FP16");
19832
19833 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19834 if (IsStrict) {
19835 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19836 {Op.getOperand(0), InVec});
19837 SDValue Chain = CvtVec.getValue(1);
19838 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19839 DAG.getVectorIdxConstant(0, dl));
19840 return DAG.getMergeValues({Value, Chain}, dl);
19841 }
19842
19843 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19844
19845 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19846 DAG.getVectorIdxConstant(0, dl));
19847}
19848
19849static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19850 const X86Subtarget &Subtarget) {
19851 switch (Opcode) {
19852 case ISD::SINT_TO_FP:
19853 // TODO: Handle wider types with AVX/AVX512.
19854 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19855 return false;
19856 // CVTDQ2PS or (V)CVTDQ2PD
19857 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19858
19859 case ISD::UINT_TO_FP:
19860 // TODO: Handle wider types and i64 elements.
19861 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19862 return false;
19863 // VCVTUDQ2PS or VCVTUDQ2PD
19864 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19865
19866 default:
19867 return false;
19868 }
19869}
19870
19871/// Given a scalar cast operation that is extracted from a vector, try to
19872/// vectorize the cast op followed by extraction. This will avoid an expensive
19873/// round-trip between XMM and GPR.
19874 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19875 SelectionDAG &DAG,
19876 const X86Subtarget &Subtarget) {
19877 // TODO: This could be enhanced to handle smaller integer types by peeking
19878 // through an extend.
19879 SDValue Extract = Cast.getOperand(0);
19880 MVT DestVT = Cast.getSimpleValueType();
19881 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19882 !isa<ConstantSDNode>(Extract.getOperand(1)))
19883 return SDValue();
19884
19885 // See if we have a 128-bit vector cast op for this type of cast.
19886 SDValue VecOp = Extract.getOperand(0);
19887 MVT FromVT = VecOp.getSimpleValueType();
19888 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19889 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19890 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19891 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19892 return SDValue();
19893
19894 // If we are extracting from a non-zero element, first shuffle the source
19895 // vector to allow extracting from element zero.
19896 if (!isNullConstant(Extract.getOperand(1))) {
19897 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19898 Mask[0] = Extract.getConstantOperandVal(1);
19899 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19900 }
19901 // If the source vector is wider than 128-bits, extract the low part. Do not
19902 // create an unnecessarily wide vector cast op.
19903 if (FromVT != Vec128VT)
19904 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19905
19906 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19907 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19908 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19909 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19910 DAG.getVectorIdxConstant(0, DL));
19911}
19912
19913/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19914/// try to vectorize the cast ops. This will avoid an expensive round-trip
19915/// between XMM and GPR.
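/// A typical source pattern (illustrative) is `y = (double)(int)x;`, which
/// would otherwise bounce through a GPR via cvttsd2si followed by cvtsi2sd.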
19916static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19917 SelectionDAG &DAG,
19918 const X86Subtarget &Subtarget) {
19919 // TODO: Allow FP_TO_UINT.
19920 SDValue CastToInt = CastToFP.getOperand(0);
19921 MVT VT = CastToFP.getSimpleValueType();
19922 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19923 return SDValue();
19924
19925 MVT IntVT = CastToInt.getSimpleValueType();
19926 SDValue X = CastToInt.getOperand(0);
19927 MVT SrcVT = X.getSimpleValueType();
19928 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19929 return SDValue();
19930
19931 // See if we have 128-bit vector cast instructions for this type of cast.
19932 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19933 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19934 IntVT != MVT::i32)
19935 return SDValue();
19936
19937 unsigned SrcSize = SrcVT.getSizeInBits();
19938 unsigned IntSize = IntVT.getSizeInBits();
19939 unsigned VTSize = VT.getSizeInBits();
19940 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19941 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19942 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19943
19944 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19945 unsigned ToIntOpcode =
19946 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19947 unsigned ToFPOpcode =
19948 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19949
19950 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19951 //
19952 // We are not defining the high elements (for example, by zeroing them) because
19953 // that could nullify any performance advantage that we hoped to gain from
19954 // this vector op hack. We do not expect any adverse effects (like denorm
19955 // penalties) with cast ops.
19956 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19957 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19958 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19959 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19960 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19961}
19962
19963static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19964 SelectionDAG &DAG,
19965 const X86Subtarget &Subtarget) {
19966 bool IsStrict = Op->isStrictFPOpcode();
19967 MVT VT = Op->getSimpleValueType(0);
19968 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19969
19970 if (Subtarget.hasDQI()) {
19971 assert(!Subtarget.hasVLX() && "Unexpected features");
19972
19973 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19974 Src.getSimpleValueType() == MVT::v4i64) &&
19975 "Unsupported custom type");
19976
19977 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19978 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19979 "Unexpected VT!");
19980 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19981
19982 // Need to concat with zero vector for strict fp to avoid spurious
19983 // exceptions.
19984 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19985 : DAG.getUNDEF(MVT::v8i64);
19986 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19987 DAG.getVectorIdxConstant(0, DL));
19988 SDValue Res, Chain;
19989 if (IsStrict) {
19990 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19991 {Op->getOperand(0), Src});
19992 Chain = Res.getValue(1);
19993 } else {
19994 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19995 }
19996
19997 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19998 DAG.getVectorIdxConstant(0, DL));
19999
20000 if (IsStrict)
20001 return DAG.getMergeValues({Res, Chain}, DL);
20002 return Res;
20003 }
20004
20005 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
20006 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
20007 if (VT != MVT::v4f32 || IsSigned)
20008 return SDValue();
20009
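// A sketch of the idea used below for unsigned v4i64 -> v4f32 (illustrative):
// lanes with the sign bit set are halved as (Src >> 1) | (Src & 1) so they fit
// in a signed i64, converted with sint_to_fp, and then doubled with an fadd;
// OR-ing the low bit back in keeps the final rounding correct. Non-negative
// lanes are converted directly.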
20010 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
20011 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
20012 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
20013 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
20014 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
20015 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
20016 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
20017 SmallVector<SDValue, 4> SignCvts(4);
20018 SmallVector<SDValue, 4> Chains(4);
20019 for (int i = 0; i != 4; ++i) {
20020 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
20021 DAG.getVectorIdxConstant(i, DL));
20022 if (IsStrict) {
20023 SignCvts[i] =
20024 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
20025 {Op.getOperand(0), Elt});
20026 Chains[i] = SignCvts[i].getValue(1);
20027 } else {
20028 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
20029 }
20030 }
20031 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
20032
20033 SDValue Slow, Chain;
20034 if (IsStrict) {
20035 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
20036 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
20037 {Chain, SignCvt, SignCvt});
20038 Chain = Slow.getValue(1);
20039 } else {
20040 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
20041 }
20042
20043 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
20044 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
20045
20046 if (IsStrict)
20047 return DAG.getMergeValues({Cvt, Chain}, DL);
20048
20049 return Cvt;
20050}
20051
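// When the FP result type is a "soft" f16 (no usable FP16 support), the helper
// below performs the conversion at f32 (or vXf32) precision and rounds the
// result back down, roughly: fp_round (xint_to_fp x to f32), 0 (illustrative).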
20052static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
20053 SelectionDAG &DAG) {
20054 bool IsStrict = Op->isStrictFPOpcode();
20055 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20056 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20057 MVT VT = Op.getSimpleValueType();
20058 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20059
20060 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
20061 if (IsStrict)
20062 return DAG.getNode(
20063 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
20064 {Chain,
20065 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
20066 Rnd});
20067 return DAG.getNode(ISD::FP_ROUND, dl, VT,
20068 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
20069}
20070
20071static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned,
20072 const X86Subtarget &Subtarget) {
20073 if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
20074 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
20075 return true;
20076 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
20077 return true;
20078 }
20079 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
20080 return true;
20081 if (Subtarget.useAVX512Regs()) {
20082 if (VT == MVT::v16i32)
20083 return true;
20084 if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
20085 return true;
20086 if (VT == MVT::v8i64 && Subtarget.hasDQI())
20087 return true;
20088 }
20089 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
20090 (VT == MVT::v2i64 || VT == MVT::v4i64))
20091 return true;
20092 return false;
20093}
20094
20095SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
20096 SelectionDAG &DAG) const {
20097 bool IsStrict = Op->isStrictFPOpcode();
20098 unsigned OpNo = IsStrict ? 1 : 0;
20099 SDValue Src = Op.getOperand(OpNo);
20100 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
20101 MVT SrcVT = Src.getSimpleValueType();
20102 MVT VT = Op.getSimpleValueType();
20103 SDLoc dl(Op);
20104
20105 if (isSoftF16(VT, Subtarget))
20106 return promoteXINT_TO_FP(Op, dl, DAG);
20107 else if (isLegalConversion(SrcVT, VT, true, Subtarget))
20108 return Op;
20109
20110 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20111 return LowerWin64_INT128_TO_FP(Op, DAG);
20112
20113 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20114 return Extract;
20115
20116 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
20117 return R;
20118
20119 if (SrcVT.isVector()) {
20120 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
20121 // Note: Since v2f64 is a legal type, we don't need to zero extend the
20122 // source for strict FP.
20123 if (IsStrict)
20124 return DAG.getNode(
20125 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
20126 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20127 DAG.getUNDEF(SrcVT))});
20128 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
20129 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
20130 DAG.getUNDEF(SrcVT)));
20131 }
20132 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
20133 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20134
20135 return SDValue();
20136 }
20137
20138 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
20139 "Unknown SINT_TO_FP to lower!");
20140
20141 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
20142
20143 // These are really Legal; return the operand so the caller accepts it as
20144 // Legal.
20145 if (SrcVT == MVT::i32 && UseSSEReg)
20146 return Op;
20147 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
20148 return Op;
20149
20150 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20151 return V;
20152 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20153 return V;
20154
20155 // SSE doesn't have an i16 conversion so we need to promote.
20156 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
20157 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
20158 if (IsStrict)
20159 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
20160 {Chain, Ext});
20161
20162 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
20163 }
20164
20165 if (VT == MVT::f128 || !Subtarget.hasX87())
20166 return SDValue();
20167
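// At this point no SSE path applies, so go through memory and x87: spill the
// integer to a stack slot and let BuildFILD load and convert it (and, for SSE
// result types, bounce it back out through another stack slot).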
20168 SDValue ValueToStore = Src;
20169 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
20170 // Bitcasting to f64 here allows us to do a single 64-bit store from
20171 // an SSE register, avoiding the store forwarding penalty that would come
20172 // with two 32-bit stores.
20173 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20174
20175 unsigned Size = SrcVT.getStoreSize();
20176 Align Alignment(Size);
20177 MachineFunction &MF = DAG.getMachineFunction();
20178 auto PtrVT = getPointerTy(MF.getDataLayout());
20179 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
20180 MachinePointerInfo MPI =
20181 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20182 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20183 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
20184 std::pair<SDValue, SDValue> Tmp =
20185 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
20186
20187 if (IsStrict)
20188 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20189
20190 return Tmp.first;
20191}
20192
20193std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
20194 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
20195 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
20196 // Build the FILD
20197 SDVTList Tys;
20198 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
20199 if (useSSE)
20200 Tys = DAG.getVTList(MVT::f80, MVT::Other);
20201 else
20202 Tys = DAG.getVTList(DstVT, MVT::Other);
20203
20204 SDValue FILDOps[] = {Chain, Pointer};
20205 SDValue Result =
20206 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
20207 Alignment, MachineMemOperand::MOLoad);
20208 Chain = Result.getValue(1);
20209
20210 if (useSSE) {
20211 MachineFunction &MF = DAG.getMachineFunction();
20212 unsigned SSFISize = DstVT.getStoreSize();
20213 int SSFI =
20214 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
20215 auto PtrVT = getPointerTy(MF.getDataLayout());
20216 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20217 Tys = DAG.getVTList(MVT::Other);
20218 SDValue FSTOps[] = {Chain, Result, StackSlot};
20219 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
20220 MachinePointerInfo::getFixedStack(MF, SSFI),
20221 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
20222
20223 Chain =
20224 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
20225 Result = DAG.getLoad(
20226 DstVT, DL, Chain, StackSlot,
20227 MachinePointerInfo::getFixedStack(MF, SSFI));
20228 Chain = Result.getValue(1);
20229 }
20230
20231 return { Result, Chain };
20232}
20233
20234/// Horizontal vector math instructions may be slower than normal math with
20235/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
20236/// implementation, and likely shuffle complexity of the alternate sequence.
20237static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
20238 const X86Subtarget &Subtarget) {
20239 bool IsOptimizingSize = DAG.shouldOptForSize();
20240 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
20241 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
20242}
20243
20244/// 64-bit unsigned integer to double expansion.
20245static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
20246 SelectionDAG &DAG,
20247 const X86Subtarget &Subtarget) {
20248 // We can't use this algorithm for strict fp: it produces -0.0 instead of +0.0
20249 // when converting 0 while rounding toward negative infinity. The caller will
20250 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
20251 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
20252 // This algorithm is not obvious. Here is what we're trying to output:
20253 /*
20254 movq %rax, %xmm0
20255 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
20256 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
20257 #ifdef __SSE3__
20258 haddpd %xmm0, %xmm0
20259 #else
20260 pshufd $0x4e, %xmm0, %xmm1
20261 addpd %xmm1, %xmm0
20262 #endif
20263 */
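// Equivalent scalar sketch (illustrative, assuming IEEE-754 doubles):
//   lo = bitcast<double>(0x4330000000000000 | lo32) - 0x1.0p52;  // (double)lo32
//   hi = bitcast<double>(0x4530000000000000 | hi32) - 0x1.0p84;  // (double)hi32 * 2^32
//   result = lo + hi;   // the only rounding happens in this final add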
20264
20265 LLVMContext *Context = DAG.getContext();
20266
20267 // Build some magic constants.
20268 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
20269 Constant *C0 = ConstantDataVector::get(*Context, CV0);
20270 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20271 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
20272
20273 SmallVector<Constant*,2> CV1;
20274 CV1.push_back(
20275 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20276 APInt(64, 0x4330000000000000ULL))));
20277 CV1.push_back(
20278 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
20279 APInt(64, 0x4530000000000000ULL))));
20280 Constant *C1 = ConstantVector::get(CV1);
20281 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
20282
20283 // Load the 64-bit value into an XMM register.
20284 SDValue XR1 =
20285 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
20286 SDValue CLod0 = DAG.getLoad(
20287 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
20288 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20289 SDValue Unpck1 =
20290 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
20291
20292 SDValue CLod1 = DAG.getLoad(
20293 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
20294 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
20295 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
20296 // TODO: Are there any fast-math-flags to propagate here?
20297 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
20298 SDValue Result;
20299
20300 if (Subtarget.hasSSE3() &&
20301 shouldUseHorizontalOp(true, DAG, Subtarget)) {
20302 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
20303 } else {
20304 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
20305 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
20306 }
20307 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
20308 DAG.getVectorIdxConstant(0, dl));
20309 return Result;
20310}
20311
20312/// 32-bit unsigned integer to float expansion.
20313static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
20314 SelectionDAG &DAG,
20315 const X86Subtarget &Subtarget) {
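// The trick below (illustrative sketch): OR the 32-bit value into the low bits
// of the double whose bit pattern is 0x4330000000000000 (i.e. 2^52), producing
// 2^52 + x exactly, then subtract the 2^52 bias; the difference is (double)x
// for any unsigned 32-bit x, rounded to the destination type at the end.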
20316 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20317 // FP constant to bias correct the final result.
20318 SDValue Bias = DAG.getConstantFP(
20319 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
20320
20321 // Load the 32-bit value into an XMM register.
20322 SDValue Load =
20323 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
20324
20325 // Zero out the upper parts of the register.
20326 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
20327
20328 // Or the load with the bias.
20329 SDValue Or = DAG.getNode(
20330 ISD::OR, dl, MVT::v2i64,
20331 DAG.getBitcast(MVT::v2i64, Load),
20332 DAG.getBitcast(MVT::v2i64,
20333 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
20334 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
20335 DAG.getBitcast(MVT::v2f64, Or),
20336 DAG.getVectorIdxConstant(0, dl));
20337
20338 if (Op.getNode()->isStrictFPOpcode()) {
20339 // Subtract the bias.
20340 // TODO: Are there any fast-math-flags to propagate here?
20341 SDValue Chain = Op.getOperand(0);
20342 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
20343 {Chain, Or, Bias});
20344
20345 if (Op.getValueType() == Sub.getValueType())
20346 return Sub;
20347
20348 // Handle final rounding.
20349 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
20350 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
20351
20352 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
20353 }
20354
20355 // Subtract the bias.
20356 // TODO: Are there any fast-math-flags to propagate here?
20357 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
20358
20359 // Handle final rounding.
20360 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
20361}
20362
20363static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
20364 SelectionDAG &DAG,
20365 const X86Subtarget &Subtarget) {
20366 if (Op.getSimpleValueType() != MVT::v2f64)
20367 return SDValue();
20368
20369 bool IsStrict = Op->isStrictFPOpcode();
20370
20371 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
20372 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
20373
20374 if (Subtarget.hasAVX512()) {
20375 if (!Subtarget.hasVLX()) {
20376 // Let generic type legalization widen this.
20377 if (!IsStrict)
20378 return SDValue();
20379 // Otherwise pad the integer input with 0s and widen the operation.
20380 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20381 DAG.getConstant(0, DL, MVT::v2i32));
20382 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20383 {Op.getOperand(0), N0});
20384 SDValue Chain = Res.getValue(1);
20385 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20386 DAG.getVectorIdxConstant(0, DL));
20387 return DAG.getMergeValues({Res, Chain}, DL);
20388 }
20389
20390 // Legalize to v4i32 type.
20391 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20392 DAG.getUNDEF(MVT::v2i32));
20393 if (IsStrict)
20394 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20395 {Op.getOperand(0), N0});
20396 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20397 }
20398
20399 // Zero extend to 2i64, OR with the floating point representation of 2^52.
20400 // This gives us the floating point equivalent of 2^52 + the i32 integer
20401 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
20402 // point leaving just our i32 integers in double format.
20403 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20404 SDValue VBias = DAG.getConstantFP(
20405 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20406 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20407 DAG.getBitcast(MVT::v2i64, VBias));
20408 Or = DAG.getBitcast(MVT::v2f64, Or);
20409
20410 if (IsStrict)
20411 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20412 {Op.getOperand(0), Or, VBias});
20413 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20414}
20415
20416static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20417 SelectionDAG &DAG,
20418 const X86Subtarget &Subtarget) {
20419 bool IsStrict = Op->isStrictFPOpcode();
20420 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20421 MVT VecIntVT = V.getSimpleValueType();
20422 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20423 "Unsupported custom type");
20424
20425 if (Subtarget.hasAVX512()) {
20426 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20427 assert(!Subtarget.hasVLX() && "Unexpected features");
20428 MVT VT = Op->getSimpleValueType(0);
20429
20430 // v8i32->v8f64 is legal with AVX512 so just return it.
20431 if (VT == MVT::v8f64)
20432 return Op;
20433
20434 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
20435 VT == MVT::v8f16) &&
20436 "Unexpected VT!");
20437 MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
20438 MVT WideIntVT = MVT::v16i32;
20439 if (VT == MVT::v4f64) {
20440 WideVT = MVT::v8f64;
20441 WideIntVT = MVT::v8i32;
20442 }
20443
20444 // Need to concat with zero vector for strict fp to avoid spurious
20445 // exceptions.
20446 SDValue Tmp =
20447 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20448 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20449 DAG.getVectorIdxConstant(0, DL));
20450 SDValue Res, Chain;
20451 if (IsStrict) {
20452 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20453 {Op->getOperand(0), V});
20454 Chain = Res.getValue(1);
20455 } else {
20456 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20457 }
20458
20459 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20460 DAG.getVectorIdxConstant(0, DL));
20461
20462 if (IsStrict)
20463 return DAG.getMergeValues({Res, Chain}, DL);
20464 return Res;
20465 }
20466
20467 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20468 Op->getSimpleValueType(0) == MVT::v4f64) {
20469 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20470 Constant *Bias = ConstantFP::get(
20471 *DAG.getContext(),
20472 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20473 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20474 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20475 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20476 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20477 SDValue VBias = DAG.getMemIntrinsicNode(
20478 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20479 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20480 MachineMemOperand::MOLoad);
20481
20482 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20483 DAG.getBitcast(MVT::v4i64, VBias));
20484 Or = DAG.getBitcast(MVT::v4f64, Or);
20485
20486 if (IsStrict)
20487 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20488 {Op.getOperand(0), Or, VBias});
20489 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20490 }
20491
20492 // The algorithm is the following:
20493 // #ifdef __SSE4_1__
20494 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20495 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20496 // (uint4) 0x53000000, 0xaa);
20497 // #else
20498 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20499 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20500 // #endif
20501 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20502 // return (float4) lo + fhi;
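// Why this works (illustrative): as float bit patterns, lo encodes
// 2^23 + (v & 0xffff) and hi encodes 2^39 + (v >> 16) * 2^16, so subtracting
// (0x1.0p39f + 0x1.0p23f) from hi and adding lo cancels both biases and
// reconstructs v with a single rounding step.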
20503
20504 bool Is128 = VecIntVT == MVT::v4i32;
20505 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20506 // If we are converting to something other than the supported type, e.g., to v4f64,
20507 // abort early.
20508 if (VecFloatVT != Op->getSimpleValueType(0))
20509 return SDValue();
20510
20511 // In the #ifdef/#else code, we have in common:
20512 // - The vector of constants:
20513 // -- 0x4b000000
20514 // -- 0x53000000
20515 // - A shift:
20516 // -- v >> 16
20517
20518 // Create the splat vector for 0x4b000000.
20519 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20520 // Create the splat vector for 0x53000000.
20521 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20522
20523 // Create the right shift.
20524 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20525 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20526
20527 SDValue Low, High;
20528 if (Subtarget.hasSSE41()) {
20529 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20530 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20531 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20532 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20533 // Low will be bitcasted right away, so do not bother bitcasting back to its
20534 // original type.
20535 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20536 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20537 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20538 // (uint4) 0x53000000, 0xaa);
20539 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20540 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20541 // High will be bitcasted right away, so do not bother bitcasting back to
20542 // its original type.
20543 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20544 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20545 } else {
20546 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20547 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20548 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20549 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20550
20551 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20552 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20553 }
20554
20555 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20556 SDValue VecCstFSub = DAG.getConstantFP(
20557 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20558
20559 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20560 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20561 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20562 // enabled. See PR24512.
20563 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20564 // TODO: Are there any fast-math-flags to propagate here?
20565 // (float4) lo;
20566 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20567 // return (float4) lo + fhi;
20568 if (IsStrict) {
20569 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20570 {Op.getOperand(0), HighBitcast, VecCstFSub});
20571 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20572 {FHigh.getValue(1), LowBitcast, FHigh});
20573 }
20574
20575 SDValue FHigh =
20576 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20577 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20578}
20579
20580static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20581 const X86Subtarget &Subtarget) {
20582 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20583 SDValue N0 = Op.getOperand(OpNo);
20584 MVT SrcVT = N0.getSimpleValueType();
20585
20586 switch (SrcVT.SimpleTy) {
20587 default:
20588 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20589 case MVT::v2i32:
20590 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20591 case MVT::v4i32:
20592 case MVT::v8i32:
20593 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20594 case MVT::v2i64:
20595 case MVT::v4i64:
20596 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20597 }
20598}
20599
20600SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20601 SelectionDAG &DAG) const {
20602 bool IsStrict = Op->isStrictFPOpcode();
20603 unsigned OpNo = IsStrict ? 1 : 0;
20604 SDValue Src = Op.getOperand(OpNo);
20605 SDLoc dl(Op);
20606 auto PtrVT = getPointerTy(DAG.getDataLayout());
20607 MVT SrcVT = Src.getSimpleValueType();
20608 MVT DstVT = Op->getSimpleValueType(0);
20609 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20610
20611 // Bail out when we don't have native conversion instructions.
20612 if (DstVT == MVT::f128)
20613 return SDValue();
20614
20615 if (isSoftF16(DstVT, Subtarget))
20616 return promoteXINT_TO_FP(Op, dl, DAG);
20617 else if (isLegalConversion(SrcVT, DstVT, false, Subtarget))
20618 return Op;
20619
20620 if (DstVT.isVector())
20621 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20622
20623 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20624 return LowerWin64_INT128_TO_FP(Op, DAG);
20625
20626 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20627 return Extract;
20628
20629 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20630 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20631 // Conversions from unsigned i32 to f32/f64 are legal,
20632 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20633 return Op;
20634 }
20635
20636 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20637 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20638 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20639 if (IsStrict)
20640 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20641 {Chain, Src});
20642 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20643 }
20644
20645 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20646 return V;
20647 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20648 return V;
20649
20650 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20651 // infinity. It produces -0.0, so disable under strictfp.
20652 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20653 !IsStrict)
20654 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20655 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20656 // negative infinity, so disable it under strictfp and use FILD instead.
20657 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20658 !IsStrict)
20659 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20660 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20661 (DstVT == MVT::f32 || DstVT == MVT::f64))
20662 return SDValue();
20663
20664 // Make a 64-bit buffer, and use it to build an FILD.
20665 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20666 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20667 Align SlotAlign(8);
20668 MachinePointerInfo MPI =
20669 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20670 if (SrcVT == MVT::i32) {
20671 SDValue OffsetSlot =
20672 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20673 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20674 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20675 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20676 std::pair<SDValue, SDValue> Tmp =
20677 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20678 if (IsStrict)
20679 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20680
20681 return Tmp.first;
20682 }
20683
20684 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20685 SDValue ValueToStore = Src;
20686 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20687 // Bitcasting to f64 here allows us to do a single 64-bit store from
20688 // an SSE register, avoiding the store forwarding penalty that would come
20689 // with two 32-bit stores.
20690 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20691 }
20692 SDValue Store =
20693 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20694 // For i64 source, we need to add the appropriate power of 2 if the input
20695 // was negative. We must be careful to do the computation in x87 extended
20696 // precision, not in SSE.
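// Sketch of the fixup (illustrative): FILD reads the 64-bit slot as signed, so
// if the original unsigned value had its top bit set the result is off by
// exactly 2^64. The code below selects a fudge constant of 2^64 (0x5f800000 as
// an f32, extended to f80) or 0.0 based on the sign test and adds it back at
// f80 precision.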
20697 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20698 SDValue Ops[] = {Store, StackSlot};
20699 SDValue Fild =
20700 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20701 SlotAlign, MachineMemOperand::MOLoad);
20702 Chain = Fild.getValue(1);
20703
20704 // Check whether the sign bit is set.
20705 SDValue SignSet = DAG.getSetCC(
20706 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20707 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20708
20709 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20710 APInt FF(64, 0x5F80000000000000ULL);
20711 SDValue FudgePtr =
20712 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20713 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20714
20715 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20716 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20717 SDValue Four = DAG.getIntPtrConstant(4, dl);
20718 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20719 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20720
20721 // Load the value out, extending it from f32 to f80.
20722 SDValue Fudge = DAG.getExtLoad(
20723 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20724 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20725 CPAlignment);
20726 Chain = Fudge.getValue(1);
20727 // Extend everything to 80 bits to force it to be done on x87.
20728 // TODO: Are there any fast-math-flags to propagate here?
20729 if (IsStrict) {
20730 unsigned Opc = ISD::STRICT_FADD;
20731 // Windows needs the precision control changed to 80bits around this add.
20732 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20733 Opc = X86ISD::STRICT_FP80_ADD;
20734
20735 SDValue Add =
20736 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20737 // STRICT_FP_ROUND can't handle equal types.
20738 if (DstVT == MVT::f80)
20739 return Add;
20740 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20741 {Add.getValue(1), Add,
20742 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20743 }
20744 unsigned Opc = ISD::FADD;
20745 // Windows needs the precision control changed to 80bits around this add.
20746 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20747 Opc = X86ISD::FP80_ADD;
20748
20749 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20750 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20751 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20752}
20753
20754// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20755// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20756// just return an SDValue().
20757// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20758// to i16, i32 or i64, and we lower it to a legal sequence and return the
20759// result.
20760SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20761 bool IsSigned,
20762 SDValue &Chain) const {
20763 bool IsStrict = Op->isStrictFPOpcode();
20764 SDLoc DL(Op);
20765
20766 EVT DstTy = Op.getValueType();
20767 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20768 EVT TheVT = Value.getValueType();
20769 auto PtrVT = getPointerTy(DAG.getDataLayout());
20770
20771 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20772 // f16 must be promoted before using the lowering in this routine.
20773 // fp128 does not use this lowering.
20774 return SDValue();
20775 }
20776
20777 // If using FIST to compute an unsigned i64, we'll need some fixup
20778 // to handle values above the maximum signed i64. A FIST is always
20779 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20780 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20781
20782 // FIXME: This does not generate an invalid exception if the input does not
20783 // fit in i32. PR44019
20784 if (!IsSigned && DstTy != MVT::i64) {
20785 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20786 // The low 32 bits of the fist result will have the correct uint32 result.
20787 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20788 DstTy = MVT::i64;
20789 }
20790
20791 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20792 DstTy.getSimpleVT() >= MVT::i16 &&
20793 "Unknown FP_TO_INT to lower!");
20794
20795 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20796 // stack slot.
20797 MachineFunction &MF = DAG.getMachineFunction();
20798 unsigned MemSize = DstTy.getStoreSize();
20799 int SSFI =
20800 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20801 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20802
20803 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20804
20805 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20806
20807 if (UnsignedFixup) {
20808 //
20809 // Conversion to unsigned i64 is implemented with a select,
20810 // depending on whether the source value fits in the range
20811 // of a signed i64. Let Thresh be the FP equivalent of
20812 // 0x8000000000000000ULL.
20813 //
20814 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
20815 // FltOfs = (Value >= Thresh) ? Thresh : 0;
20816 // FistSrc = (Value - FltOfs);
20817 // Fist-to-mem64 FistSrc
20818 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20819 // to XOR'ing the high 32 bits with Adjust.
20820 //
20821 // Being a power of 2, Thresh is exactly representable in all FP formats.
20822 // For X87 we'd like to use the smallest FP type for this constant, but
20823 // for DAG type consistency we have to match the FP operand type.
20824
20825 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20826 [[maybe_unused]] APFloat::opStatus Status = APFloat::opOK;
20827 bool LosesInfo = false;
20828 if (TheVT == MVT::f64)
20829 // The rounding mode is irrelevant as the conversion should be exact.
20830 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20831 &LosesInfo);
20832 else if (TheVT == MVT::f80)
20833 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20834 APFloat::rmNearestTiesToEven, &LosesInfo);
20835
20836 assert(Status == APFloat::opOK && !LosesInfo &&
20837 "FP conversion should have been exact");
20838
20839 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20840
20841 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20842 *DAG.getContext(), TheVT);
20843 SDValue Cmp;
20844 if (IsStrict) {
20845 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20846 /*IsSignaling*/ true);
20847 Chain = Cmp.getValue(1);
20848 } else {
20849 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20850 }
20851
20852 // Our preferred lowering of
20853 //
20854 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20855 //
20856 // is
20857 //
20858 // (Value >= Thresh) << 63
20859 //
20860 // but since we can get here after LegalOperations, DAGCombine might do the
20861 // wrong thing if we create a select. So, directly create the preferred
20862 // version.
20863 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20864 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20865 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20866
20867 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20868 DAG.getConstantFP(0.0, DL, TheVT));
20869
20870 if (IsStrict) {
20871 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20872 { Chain, Value, FltOfs });
20873 Chain = Value.getValue(1);
20874 } else
20875 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20876 }
20877
20878 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20879
20880 // FIXME This causes a redundant load/store if the SSE-class value is already
20881 // in memory, such as if it is on the callstack.
20882 if (isScalarFPTypeInSSEReg(TheVT)) {
20883 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20884 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20885 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20886 SDValue Ops[] = { Chain, StackSlot };
20887
20888 unsigned FLDSize = TheVT.getStoreSize();
20889 assert(FLDSize <= MemSize && "Stack slot not big enough");
20890 MachineMemOperand *MMO = MF.getMachineMemOperand(
20891 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20892 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20893 Chain = Value.getValue(1);
20894 }
20895
20896 // Build the FP_TO_INT*_IN_MEM
20897 MachineMemOperand *MMO = MF.getMachineMemOperand(
20898 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20899 SDValue Ops[] = { Chain, Value, StackSlot };
20900 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20901 DAG.getVTList(MVT::Other),
20902 Ops, DstTy, MMO);
20903
20904 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20905 Chain = Res.getValue(1);
20906
20907 // If we need an unsigned fixup, XOR the result with adjust.
20908 if (UnsignedFixup)
20909 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20910
20911 return Res;
20912}
20913
20914static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20915 const X86Subtarget &Subtarget) {
20916 MVT VT = Op.getSimpleValueType();
20917 SDValue In = Op.getOperand(0);
20918 MVT InVT = In.getSimpleValueType();
20919 unsigned Opc = Op.getOpcode();
20920
20921 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20922 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20923 "Unexpected extension opcode");
20924 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20925 "Expected same number of elements");
20926 assert((VT.getVectorElementType() == MVT::i16 ||
20927 VT.getVectorElementType() == MVT::i32 ||
20928 VT.getVectorElementType() == MVT::i64) &&
20929 "Unexpected element type");
20930 assert((InVT.getVectorElementType() == MVT::i8 ||
20931 InVT.getVectorElementType() == MVT::i16 ||
20932 InVT.getVectorElementType() == MVT::i32) &&
20933 "Unexpected element type");
20934
20935 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20936
20937 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20938 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20939 return splitVectorIntUnary(Op, DAG, dl);
20940 }
20941
20942 if (Subtarget.hasInt256())
20943 return Op;
20944
20945 // Optimize vectors in AVX mode:
20946 //
20947 // v8i16 -> v8i32
20948 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20949 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20950 // Concat upper and lower parts.
20951 //
20952 // v4i32 -> v4i64
20953 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20954 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20955 // Concat upper and lower parts.
20956 //
20957 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20958 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20959
20960 // Short-circuit if we can determine that each 128-bit half is the same value.
20961 // Otherwise, this is difficult to match and optimize.
20962 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20963 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20964 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20965
20966 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20967 SDValue Undef = DAG.getUNDEF(InVT);
20968 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20969 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20970 OpHi = DAG.getBitcast(HalfVT, OpHi);
20971
20972 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20973}
20974
20975// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20976static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20977 const SDLoc &dl, SelectionDAG &DAG) {
20978 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20979 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20980 DAG.getVectorIdxConstant(0, dl));
20981 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20982 DAG.getVectorIdxConstant(8, dl));
20983 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20984 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20985 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20986 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20987}
20988
20989static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20990 const X86Subtarget &Subtarget,
20991 SelectionDAG &DAG) {
20992 MVT VT = Op->getSimpleValueType(0);
20993 SDValue In = Op->getOperand(0);
20994 MVT InVT = In.getSimpleValueType();
20995 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20996 unsigned NumElts = VT.getVectorNumElements();
20997
20998 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20999 // avoids a constant pool load.
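// e.g. (illustrative) zext <8 x i1> %k to <8 x i32> becomes
//   srl (sext %k to <8 x i32>), 31
// yielding 0/1 lanes without materializing a constant vector of ones.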
21000 if (VT.getVectorElementType() != MVT::i8) {
21001 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
21002 return DAG.getNode(ISD::SRL, DL, VT, Extend,
21003 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
21004 }
21005
21006 // Extend VT if BWI is not supported.
21007 MVT ExtVT = VT;
21008 if (!Subtarget.hasBWI()) {
21009 // If v16i32 is to be avoided, we'll need to split and concatenate.
21010 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
21011 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
21012
21013 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
21014 }
21015
21016 // Widen to 512-bits if VLX is not supported.
21017 MVT WideVT = ExtVT;
21018 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
21019 NumElts *= 512 / ExtVT.getSizeInBits();
21020 InVT = MVT::getVectorVT(MVT::i1, NumElts);
21021 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
21022 DAG.getVectorIdxConstant(0, DL));
21023 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
21024 }
21025
21026 SDValue One = DAG.getConstant(1, DL, WideVT);
21027 SDValue Zero = DAG.getConstant(0, DL, WideVT);
21028
21029 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
21030
21031 // Truncate if we had to extend above.
21032 if (VT != ExtVT) {
21033 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
21034 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
21035 }
21036
21037 // Extract back to 128/256-bit if we widened.
21038 if (WideVT != VT)
21039 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
21040 DAG.getVectorIdxConstant(0, DL));
21041
21042 return SelectedVal;
21043}
21044
21045static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
21046 SelectionDAG &DAG) {
21047 SDValue In = Op.getOperand(0);
21048 MVT SVT = In.getSimpleValueType();
21049 SDLoc DL(Op);
21050
21051 if (SVT.getVectorElementType() == MVT::i1)
21052 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
21053
21054 assert(Subtarget.hasAVX() && "Expected AVX support");
21055 return LowerAVXExtend(Op, DL, DAG, Subtarget);
21056}
21057
21058/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
21059/// It makes use of the fact that vectors with enough leading sign/zero bits
21060/// prevent the PACKSS/PACKUS from saturating the results.
21061/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
21062/// within each 128-bit lane.
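/// For example (illustrative), v8i32 -> v8i16 with sufficient sign bits is a
/// single PACKSSDW of the two 128-bit halves, while v8i32 -> v8i8 recurses:
/// PACKSSDW first, then PACKSSWB.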
21063static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
21064 const SDLoc &DL, SelectionDAG &DAG,
21065 const X86Subtarget &Subtarget) {
21066 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
21067 "Unexpected PACK opcode");
21068 assert(DstVT.isVector() && "VT not a vector?");
21069
21070 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
21071 if (!Subtarget.hasSSE2())
21072 return SDValue();
21073
21074 EVT SrcVT = In.getValueType();
21075
21076 // No truncation required; we might get here due to recursive calls.
21077 if (SrcVT == DstVT)
21078 return In;
21079
21080 unsigned NumElems = SrcVT.getVectorNumElements();
21081 if (NumElems < 2 || !isPowerOf2_32(NumElems))
21082 return SDValue();
21083
21084 unsigned DstSizeInBits = DstVT.getSizeInBits();
21085 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
21086 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
21087 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
21088
21089 LLVMContext &Ctx = *DAG.getContext();
21090 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
21091 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
21092
21093 // Pack to the largest type possible:
21094 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
21095 EVT InVT = MVT::i16, OutVT = MVT::i8;
21096 if (SrcVT.getScalarSizeInBits() > 16 &&
21097 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
21098 InVT = MVT::i32;
21099 OutVT = MVT::i16;
21100 }
21101
21102 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
21103 // On pre-AVX512, pack the src in both halves to help value tracking.
21104 if (SrcSizeInBits <= 128) {
21105 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
21106 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
21107 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
21108 SDValue LHS = DAG.getBitcast(InVT, In);
21109 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
21110 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
21111 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
21112 Res = DAG.getBitcast(PackedVT, Res);
21113 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21114 }
21115
21116 // Split lower/upper subvectors.
21117 SDValue Lo, Hi;
21118 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
21119
21120 // If Hi is undef, then don't bother packing it and widen the result instead.
21121 if (Hi.isUndef()) {
21122 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
21123 if (SDValue Res =
21124 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
21125 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
21126 }
21127
21128 unsigned SubSizeInBits = SrcSizeInBits / 2;
21129 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
21130 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
21131
21132 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
21133 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
21134 Lo = DAG.getBitcast(InVT, Lo);
21135 Hi = DAG.getBitcast(InVT, Hi);
21136 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21137 return DAG.getBitcast(DstVT, Res);
21138 }
21139
21140 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
21141 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
21142 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
21143 Lo = DAG.getBitcast(InVT, Lo);
21144 Hi = DAG.getBitcast(InVT, Hi);
21145 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
21146
21147 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
21148 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
21149 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
21150 SmallVector<int, 64> Mask;
21151 int Scale = 64 / OutVT.getScalarSizeInBits();
21152 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
21153 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
21154
21155 if (DstVT.is256BitVector())
21156 return DAG.getBitcast(DstVT, Res);
21157
21158 // If 512bit -> 128bit truncate another stage.
21159 Res = DAG.getBitcast(PackedVT, Res);
21160 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21161 }
21162
21163 // Recursively pack lower/upper subvectors, concat result and pack again.
21164 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
21165
21166 if (PackedVT.is128BitVector()) {
21167 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
21168 // type legalization.
21169 SDValue Res =
21170 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
21171 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21172 }
21173
21174 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
21175 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
21176 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
21177 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
21178 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
21179}
21180
21181/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
21182/// e.g. trunc <8 x i32> X to <8 x i16> -->
21183/// MaskX = X & 0xffff (clear high bits to prevent saturation)
21184/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
21185static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
21186 const X86Subtarget &Subtarget,
21187 SelectionDAG &DAG) {
21188 In = DAG.getZeroExtendInReg(In, DL, DstVT);
21189 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
21190}
21191
21192/// Truncate using inreg sign extension and X86ISD::PACKSS.
21193static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
21194 const X86Subtarget &Subtarget,
21195 SelectionDAG &DAG) {
21196 EVT SrcVT = In.getValueType();
21197 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
21198 DAG.getValueType(DstVT));
21199 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
21200}
21201
21202/// Helper to determine if \p In truncated to \p DstVT has the necessary
21203/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
21204/// possibly by converting a SRL node to SRA for sign extension.
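/// For example (illustrative), a vector of comparison results (all sign bits)
/// can use PACKSS, an (and X, 255)-style input with enough known leading zeros
/// can use PACKUS, and an SRL that only produces sign bits the truncation will
/// discard is converted to SRA so that PACKSS applies.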
21205static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
21206 SDValue In, const SDLoc &DL,
21207 SelectionDAG &DAG,
21208 const X86Subtarget &Subtarget,
21209 const SDNodeFlags Flags = SDNodeFlags()) {
21210 // Requires SSE2.
21211 if (!Subtarget.hasSSE2())
21212 return SDValue();
21213
21214 EVT SrcVT = In.getValueType();
21215 EVT DstSVT = DstVT.getVectorElementType();
21216 EVT SrcSVT = SrcVT.getVectorElementType();
21217 unsigned NumDstEltBits = DstSVT.getSizeInBits();
21218 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
21219
21220 // Check we have a truncation suited for PACKSS/PACKUS.
21221 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21222 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21223 return SDValue();
21224
21225 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
21226 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
21227
21228 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
21229 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
21230 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
21231 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
21232 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
21233 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
21234 return SDValue();
21235
21236 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
21237 // split this for packing.
21238 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
21239 !isFreeToSplitVector(In, DAG) &&
21240 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
21241 return SDValue();
21242
21243 // Don't truncate on AVX512 targets with multiple stages of PACK nodes.
21244 if (Subtarget.hasAVX512() && NumStages > 1)
21245 return SDValue();
21246
21247 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
21248 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
21249
21250 // Truncate with PACKUS if we are truncating a vector with leading zero
21251 // bits that extend all the way to the packed/truncated value.
21252 // e.g. Masks, zext_in_reg, etc.
21253 // Pre-SSE41 we can only use PACKUSWB.
21254 KnownBits Known = DAG.computeKnownBits(In);
21255 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
21256 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
21257 PackOpcode = X86ISD::PACKUS;
21258 return In;
21259 }
21260
21261 // Truncate with PACKSS if we are truncating a vector with sign-bits
21262 // that extend all the way to the packed/truncated value.
21263 // e.g. Comparison result, sext_in_reg, etc.
21264 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
21265
21266 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
21267 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
21268 // see through BITCASTs later on and combines/simplifications can't then use
21269 // it.
21270 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
21271 !Subtarget.hasAVX512())
21272 return SDValue();
21273
21274 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
21275 if ((Flags.hasNoSignedWrap() && DstSVT != MVT::i32) ||
21276 MinSignBits < NumSignBits) {
21277 PackOpcode = X86ISD::PACKSS;
21278 return In;
21279 }
21280
21281 // If we have a srl that only generates signbits that we will discard in
21282 // the truncation then we can use PACKSS by converting the srl to a sra.
21283 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
21284 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
21285 if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
21286 if (*ShAmt == MinSignBits) {
21287 PackOpcode = X86ISD::PACKSS;
21288 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
21289 }
21290 }
21291
21292 return SDValue();
21293}
21294
21295/// This function lowers a vector truncation of 'extended sign-bits' or
21296/// 'extended zero-bits' values.
21297/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
21298static SDValue LowerTruncateVecPackWithSignBits(
21299 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
21300 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
21301 MVT SrcVT = In.getSimpleValueType();
21302 MVT DstSVT = DstVT.getVectorElementType();
21303 MVT SrcSVT = SrcVT.getVectorElementType();
21304 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21305 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
21306 return SDValue();
21307
21308 // If the upper half of the source is undef, then attempt to split and
21309 // only truncate the lower half.
21310 if (DstVT.getSizeInBits() >= 128) {
21311 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21312 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21313 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
21314 Subtarget, DAG))
21315 return widenSubVector(Res, false, Subtarget, DAG, DL,
21316 DstVT.getSizeInBits());
21317 }
21318 }
21319
21320 unsigned PackOpcode;
21321 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
21322 Subtarget, Flags))
21323 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
21324
21325 return SDValue();
21326}
21327
21328/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
21329/// X86ISD::PACKUS/X86ISD::PACKSS operations.
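/// Rough sketch: e.g. a v8i32 -> v8i8 truncation is emitted as two packing
/// stages (v8i32 -> v8i16 -> v8i8), after the helpers above zero- or
/// sign-fill the upper bits so that PACK saturation cannot change the values.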
21330static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
21331 const X86Subtarget &Subtarget,
21332 SelectionDAG &DAG) {
21333 MVT SrcVT = In.getSimpleValueType();
21334 MVT DstSVT = DstVT.getVectorElementType();
21335 MVT SrcSVT = SrcVT.getVectorElementType();
21336 unsigned NumElems = DstVT.getVectorNumElements();
21337 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
21338 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
21339 NumElems >= 8))
21340 return SDValue();
21341
21342 // SSSE3's pshufb results in fewer instructions in the cases below.
21343 if (Subtarget.hasSSSE3() && NumElems == 8) {
21344 if (SrcSVT == MVT::i16)
21345 return SDValue();
21346 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
21347 return SDValue();
21348 }
21349
21350 // If the upper half of the source is undef, then attempt to split and
21351 // only truncate the lower half.
21352 if (DstVT.getSizeInBits() >= 128) {
21353 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
21354 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
21355 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
21356 return widenSubVector(Res, false, Subtarget, DAG, DL,
21357 DstVT.getSizeInBits());
21358 }
21359 }
21360
21361 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
21362 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
21363 // truncate 2 x v4i32 to v8i16.
21364 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
21365 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
21366
21367 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
21368 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
21369
21370 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
21371 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
21372 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
21373 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
21374 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
21375 }
21376
21377 return SDValue();
21378}
21379
21380static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
21381 SelectionDAG &DAG,
21382 const X86Subtarget &Subtarget) {
21383 MVT VT = Op.getSimpleValueType();
21384 SDValue In = Op.getOperand(0);
21385 MVT InVT = In.getSimpleValueType();
21386 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21387
21388 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
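 // Illustrative example: for v16i8 -> v16i1 with BWI, each byte is (logically)
 // shifted left by 7 so the bit being truncated becomes the sign bit, and the
 // mask is then produced by a signed compare against zero (vpmovb2m).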
21389 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21390 if (InVT.getScalarSizeInBits() <= 16) {
21391 if (Subtarget.hasBWI()) {
21392 // legal, will go to VPMOVB2M, VPMOVW2M
21393 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21394 // We need to shift to get the lsb into sign position.
21395 // Shift packed bytes not supported natively, bitcast to word
21396 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21397 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21398 DAG.getBitcast(ExtVT, In),
21399 DAG.getConstant(ShiftInx, DL, ExtVT));
21400 In = DAG.getBitcast(InVT, In);
21401 }
21402 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21403 In, ISD::SETGT);
21404 }
21405 // Use TESTD/Q, extended vector to packed dword/qword.
21406 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21407 "Unexpected vector type.");
21408 unsigned NumElts = InVT.getVectorNumElements();
21409 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21410 // We need to change to a wider element type that we have support for.
21411 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21412 // For 16 element vectors we extend to v16i32 unless we are explicitly
21413 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21414 // we need to split into two 8 element vectors which we can extend to v8i32,
21415 // truncate and concat the results. There's an additional complication if
21416 // the original type is v16i8. In that case we can't split the v16i8
21417 // directly, so we need to shuffle high elements to low and use
21418 // sign_extend_vector_inreg.
21419 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21420 SDValue Lo, Hi;
21421 if (InVT == MVT::v16i8) {
21422 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21423 Hi = DAG.getVectorShuffle(
21424 InVT, DL, In, In,
21425 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21426 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21427 } else {
21428 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21429 Lo = extract128BitVector(In, 0, DAG, DL);
21430 Hi = extract128BitVector(In, 8, DAG, DL);
21431 }
21432 // We're split now, just emit two truncates and a concat. The two
21433 // truncates will trigger legalization to come back to this function.
21434 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21435 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21436 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21437 }
21438 // We either have 8 elements or we're allowed to use 512-bit vectors.
21439 // If we have VLX, we want to use the narrowest vector that can get the
21440 // job done so we use vXi32.
21441 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21442 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21443 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21444 InVT = ExtVT;
21445 ShiftInx = InVT.getScalarSizeInBits() - 1;
21446 }
21447
21448 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21449 // We need to shift to get the lsb into sign position.
21450 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21451 DAG.getConstant(ShiftInx, DL, InVT));
21452 }
21453 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21454 if (Subtarget.hasDQI())
21455 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21456 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21457}
21458
21459SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21460 SDLoc DL(Op);
21461 MVT VT = Op.getSimpleValueType();
21462 SDValue In = Op.getOperand(0);
21463 MVT InVT = In.getSimpleValueType();
21465 "Invalid TRUNCATE operation");
21466
21467 // If we're called by the type legalizer, handle a few cases.
21468 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21469 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21470 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21471 VT.is128BitVector() && Subtarget.hasAVX512()) {
21472 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21473 "Unexpected subtarget!");
21474 // The default behavior is to truncate one step, concatenate, and then
21475 // truncate the remainder. We'd rather produce two 64-bit results and
21476 // concatenate those.
21477 SDValue Lo, Hi;
21478 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21479
21480 EVT LoVT, HiVT;
21481 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21482
21483 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21484 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21485 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21486 }
21487
21488 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21489 if (!Subtarget.hasAVX512() ||
21490 (InVT.is512BitVector() && VT.is256BitVector()))
21491 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21492 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21493 return SignPack;
21494
21495 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21496 if (!Subtarget.hasAVX512())
21497 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21498
21499 // Otherwise let default legalization handle it.
21500 return SDValue();
21501 }
21502
21503 if (VT.getVectorElementType() == MVT::i1)
21504 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21505
21506 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21507 // concat from subvectors to use VPTRUNC etc.
21508 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In, DAG))
21509 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21510 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21511 return SignPack;
21512
21513 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21514 if (Subtarget.hasAVX512()) {
21515 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21516 assert(VT == MVT::v32i8 && "Unexpected VT!");
21517 return splitVectorIntUnary(Op, DAG, DL);
21518 }
21519
21520 // Word to byte only under BWI. Otherwise we have to promote to v16i32
21521 // and then truncate that. But we should only do that if we haven't been
21522 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21523 // handled by isel patterns.
21524 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21525 Subtarget.canExtendTo512DQ())
21526 return Op;
21527 }
21528
21529 // Handle truncation of V256 to V128 using shuffles.
21530 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21531
21532 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21533 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21534 if (Subtarget.hasInt256()) {
21535 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21536 In = DAG.getBitcast(MVT::v8i32, In);
21537 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21538 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21539 DAG.getVectorIdxConstant(0, DL));
21540 }
21541
21542 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21543 DAG.getVectorIdxConstant(0, DL));
21544 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21545 DAG.getVectorIdxConstant(2, DL));
21546 static const int ShufMask[] = {0, 2, 4, 6};
21547 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21548 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21549 }
21550
21551 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21552 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21553 if (Subtarget.hasInt256()) {
21554 // The PSHUFB mask:
21555 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21556 -1, -1, -1, -1, -1, -1, -1, -1,
21557 16, 17, 20, 21, 24, 25, 28, 29,
21558 -1, -1, -1, -1, -1, -1, -1, -1 };
21559 In = DAG.getBitcast(MVT::v32i8, In);
21560 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21561 In = DAG.getBitcast(MVT::v4i64, In);
21562
21563 static const int ShufMask2[] = {0, 2, -1, -1};
21564 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21565 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21566 DAG.getVectorIdxConstant(0, DL));
21567 return DAG.getBitcast(MVT::v8i16, In);
21568 }
21569
21570 return Subtarget.hasSSE41()
21571 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21572 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21573 }
21574
21575 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21576 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21577
21578 llvm_unreachable("All 256->128 cases should have been handled above!");
21579}
21580
21581// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21582// behaves on out of range inputs to generate optimized conversions.
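// Worked example for a lane x = 3e9f:
//   Small = cvttps2dq(x)        = 0x80000000 (out of range, sign bit set)
//   Big   = cvttps2dq(x - 2^31) = 852516352
//   IsOverflown = Small >> 31 (arithmetic) = all-ones
//   Small | (Big & IsOverflown) = 0x80000000 | 852516352 = 3000000000.
// For x < 2^31 the sign bit of Small is clear, so the result is just Small.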
21583static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21584 SelectionDAG &DAG,
21585 const X86Subtarget &Subtarget) {
21586 MVT SrcVT = Src.getSimpleValueType();
21587 unsigned DstBits = VT.getScalarSizeInBits();
21588 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21589
21590 // Calculate the converted result for values in the range 0 to
21591 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21592 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21593 SDValue Big =
21594 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21595 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21596 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21597
21598 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21599 // and only if the value was out of range. So we can use that
21600 // as our indicator that we'd rather use "Big" instead of "Small".
21601 //
21602 // Use "Small" if "IsOverflown" has all bits cleared
21603 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21604
21605 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21606 // use the slightly slower blendv select instead.
21607 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21608 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21609 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21610 }
21611
21612 SDValue IsOverflown =
21613 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21614 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21615 return DAG.getNode(ISD::OR, dl, VT, Small,
21616 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21617}
21618
21619SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21620 bool IsStrict = Op->isStrictFPOpcode();
21621 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21622 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21623 bool HasVLX = Subtarget.hasVLX();
21624 MVT VT = Op->getSimpleValueType(0);
21625 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21626 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21627 MVT SrcVT = Src.getSimpleValueType();
21628 SDLoc dl(Op);
21629
21630 SDValue Res;
21631 if (isSoftF16(SrcVT, Subtarget)) {
21632 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21633 if (IsStrict)
21634 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21635 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21636 {NVT, MVT::Other}, {Chain, Src})});
21637 return DAG.getNode(Op.getOpcode(), dl, VT,
21638 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21639 } else if (isTypeLegal(SrcVT) &&
21640 isLegalConversion(VT, SrcVT, IsSigned, Subtarget)) {
21641 return Op;
21642 }
21643
21644 if (VT.isVector()) {
21645 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21646 MVT ResVT = MVT::v4i32;
21647 MVT TruncVT = MVT::v4i1;
21648 unsigned Opc;
21649 if (IsStrict)
21650 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21651 else
21652 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21653
21654 if (!IsSigned && !HasVLX) {
21655 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21656 // Widen to 512-bits.
21657 ResVT = MVT::v8i32;
21658 TruncVT = MVT::v8i1;
21659 Opc = Op.getOpcode();
21660 // Need to concat with zero vector for strict fp to avoid spurious
21661 // exceptions.
21662 // TODO: Should we just do this for non-strict as well?
21663 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21664 : DAG.getUNDEF(MVT::v8f64);
21665 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21666 DAG.getVectorIdxConstant(0, dl));
21667 }
21668 if (IsStrict) {
21669 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21670 Chain = Res.getValue(1);
21671 } else {
21672 Res = DAG.getNode(Opc, dl, ResVT, Src);
21673 }
21674
21675 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21676 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21677 DAG.getVectorIdxConstant(0, dl));
21678 if (IsStrict)
21679 return DAG.getMergeValues({Res, Chain}, dl);
21680 return Res;
21681 }
21682
21683 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21684 if ((HasVLX && (VT == MVT::v8i16 || VT == MVT::v16i16)) ||
21685 VT == MVT::v32i16)
21686 return Op;
21687
21688 MVT ResVT = VT;
21689 MVT EleVT = VT.getVectorElementType();
21690 if (EleVT != MVT::i64)
21691 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21692
21693 if (SrcVT == MVT::v2f16 || SrcVT == MVT::v4f16) {
21694 SDValue Tmp =
21695 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21696 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21697 Ops[0] = Src;
21698 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21699 }
21700
21701 if (!HasVLX) {
21702 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21703 // Widen to 512-bits.
21704 unsigned IntSize = EleVT.getSizeInBits();
21705 unsigned Num = IntSize > 16 ? 512 / IntSize : 32;
21706 ResVT = MVT::getVectorVT(EleVT, Num);
21707 Src = widenSubVector(MVT::getVectorVT(MVT::f16, Num), Src, IsStrict,
21708 Subtarget, DAG, dl);
21709 }
21710
21711 if (IsStrict) {
21712 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21713 : X86ISD::STRICT_CVTTP2UI,
21714 dl, {ResVT, MVT::Other}, {Chain, Src});
21715 Chain = Res.getValue(1);
21716 } else {
21717 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21718 ResVT, Src);
21719 }
21720
21721 // TODO: Need to add exception check code for strict FP.
21722 if (EleVT.getSizeInBits() < 16) {
21723 if (HasVLX)
21724 ResVT = MVT::getVectorVT(EleVT, 8);
21725 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21726 }
21727
21728 if (ResVT != VT)
21729 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21730 DAG.getVectorIdxConstant(0, dl));
21731
21732 if (IsStrict)
21733 return DAG.getMergeValues({Res, Chain}, dl);
21734 return Res;
21735 }
21736
21737 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21738 if (VT.getVectorElementType() == MVT::i16) {
21739 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21740 SrcVT.getVectorElementType() == MVT::f64) &&
21741 "Expected f32/f64 vector!");
21742 MVT NVT = VT.changeVectorElementType(MVT::i32);
21743 if (IsStrict) {
21744 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21745 : ISD::STRICT_FP_TO_UINT,
21746 dl, {NVT, MVT::Other}, {Chain, Src});
21747 Chain = Res.getValue(1);
21748 } else {
21749 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21750 NVT, Src);
21751 }
21752
21753 // TODO: Need to add exception check code for strict FP.
21754 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21755
21756 if (IsStrict)
21757 return DAG.getMergeValues({Res, Chain}, dl);
21758 return Res;
21759 }
21760
21761 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21762 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21763 assert(!IsSigned && "Expected unsigned conversion!");
21764 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21765 return Op;
21766 }
21767
21768 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21769 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21770 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21771 Subtarget.useAVX512Regs()) {
21772 assert(!IsSigned && "Expected unsigned conversion!");
21773 assert(!Subtarget.hasVLX() && "Unexpected features!");
21774 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21775 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21776 // Need to concat with zero vector for strict fp to avoid spurious
21777 // exceptions.
21778 // TODO: Should we just do this for non-strict as well?
21779 SDValue Tmp =
21780 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21781 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21782 DAG.getVectorIdxConstant(0, dl));
21783
21784 if (IsStrict) {
21785 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21786 {Chain, Src});
21787 Chain = Res.getValue(1);
21788 } else {
21789 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21790 }
21791
21792 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21793 DAG.getVectorIdxConstant(0, dl));
21794
21795 if (IsStrict)
21796 return DAG.getMergeValues({Res, Chain}, dl);
21797 return Res;
21798 }
21799
21800 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21801 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21802 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21803 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21804 assert(!Subtarget.hasVLX() && "Unexpected features!");
21805 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21806 // Need to concat with zero vector for strict fp to avoid spurious
21807 // exceptions.
21808 // TODO: Should we just do this for non-strict as well?
21809 SDValue Tmp =
21810 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21811 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21812 DAG.getVectorIdxConstant(0, dl));
21813
21814 if (IsStrict) {
21815 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21816 {Chain, Src});
21817 Chain = Res.getValue(1);
21818 } else {
21819 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21820 }
21821
21822 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21823 DAG.getVectorIdxConstant(0, dl));
21824
21825 if (IsStrict)
21826 return DAG.getMergeValues({Res, Chain}, dl);
21827 return Res;
21828 }
21829
21830 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21831 if (!Subtarget.hasVLX()) {
21832 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
21833 // legalizer and then widened again by vector op legalization.
21834 if (!IsStrict)
21835 return SDValue();
21836
21837 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21838 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21839 {Src, Zero, Zero, Zero});
21840 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21841 {Chain, Tmp});
21842 SDValue Chain = Tmp.getValue(1);
21843 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21844 DAG.getVectorIdxConstant(0, dl));
21845 return DAG.getMergeValues({Tmp, Chain}, dl);
21846 }
21847
21848 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21849 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21850 DAG.getUNDEF(MVT::v2f32));
21851 if (IsStrict) {
21852 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21853 : X86ISD::STRICT_CVTTP2UI;
21854 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21855 }
21856 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21857 return DAG.getNode(Opc, dl, VT, Tmp);
21858 }
21859
21860 // Generate optimized instructions for pre-AVX512 unsigned conversions from
21861 // vXf32/vXf64 to vXi32.
21862 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21863 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21864 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21865 assert(!IsSigned && "Expected unsigned conversion!");
21866 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21867 }
21868
21869 return SDValue();
21870 }
21871
21872 assert(!VT.isVector());
21873
21874 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21875
21876 if (!IsSigned && UseSSEReg) {
21877 // Conversions from f32/f64 with AVX512 should be legal.
21878 if (Subtarget.hasAVX512())
21879 return Op;
21880
21881 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21882 // behaves on out of range inputs to generate optimized conversions.
21883 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21884 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21885 unsigned DstBits = VT.getScalarSizeInBits();
21886 APInt UIntLimit = APInt::getSignMask(DstBits);
21887 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21888 DAG.getConstant(UIntLimit, dl, VT));
21889 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21890
21891 // Calculate the converted result for values in the range:
21892 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21893 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21894 SDValue Small =
21895 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21896 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21897 SDValue Big = DAG.getNode(
21898 X86ISD::CVTTS2SI, dl, VT,
21899 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21900 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21901
21902 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21903 // and only if the value was out of range. So we can use that
21904 // as our indicator that we'd rather use "Big" instead of "Small".
21905 //
21906 // Use "Small" if "IsOverflown" has all bits cleared
21907 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21908 SDValue IsOverflown = DAG.getNode(
21909 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21910 return DAG.getNode(ISD::OR, dl, VT, Small,
21911 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21912 }
21913
21914 // Use default expansion for i64.
21915 if (VT == MVT::i64)
21916 return SDValue();
21917
21918 assert(VT == MVT::i32 && "Unexpected VT!");
21919
21920 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21921 // FIXME: This does not generate an invalid exception if the input does not
21922 // fit in i32. PR44019
21923 if (Subtarget.is64Bit()) {
21924 if (IsStrict) {
21925 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21926 {Chain, Src});
21927 Chain = Res.getValue(1);
21928 } else
21929 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21930
21931 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21932 if (IsStrict)
21933 return DAG.getMergeValues({Res, Chain}, dl);
21934 return Res;
21935 }
21936
21937 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21938 // use fisttp which will be handled later.
21939 if (!Subtarget.hasSSE3())
21940 return SDValue();
21941 }
21942
21943 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21944 // FIXME: This does not generate an invalid exception if the input does not
21945 // fit in i16. PR44019
21946 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21947 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21948 if (IsStrict) {
21949 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21950 {Chain, Src});
21951 Chain = Res.getValue(1);
21952 } else
21953 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21954
21955 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21956 if (IsStrict)
21957 return DAG.getMergeValues({Res, Chain}, dl);
21958 return Res;
21959 }
21960
21961 // If this is a FP_TO_SINT using SSEReg we're done.
21962 if (UseSSEReg && IsSigned)
21963 return Op;
21964
21965 // fp128 needs to use a libcall.
21966 if (SrcVT == MVT::f128) {
21967 RTLIB::Libcall LC;
21968 if (IsSigned)
21969 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21970 else
21971 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21972
21973 MakeLibCallOptions CallOptions;
21974 std::pair<SDValue, SDValue> Tmp =
21975 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21976
21977 if (IsStrict)
21978 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21979
21980 return Tmp.first;
21981 }
21982
21983 // Fall back to X87.
21984 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21985 if (IsStrict)
21986 return DAG.getMergeValues({V, Chain}, dl);
21987 return V;
21988 }
21989
21990 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21991}
21992
21993SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21994 SelectionDAG &DAG) const {
21995 SDValue Src = Op.getOperand(0);
21996 EVT DstVT = Op.getSimpleValueType();
21997 MVT SrcVT = Src.getSimpleValueType();
21998
21999 if (SrcVT.isVector())
22000 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
22001
22002 if (SrcVT == MVT::f16)
22003 return SDValue();
22004
22005 // If the source is in an SSE register, the node is Legal.
22006 if (isScalarFPTypeInSSEReg(SrcVT))
22007 return Op;
22008
22009 return LRINT_LLRINTHelper(Op.getNode(), DAG);
22010}
22011
22012SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
22013 SelectionDAG &DAG) const {
22014 EVT DstVT = N->getValueType(0);
22015 SDValue Src = N->getOperand(0);
22016 EVT SrcVT = Src.getValueType();
22017
22018 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
22019 // f16 must be promoted before using the lowering in this routine.
22020 // fp128 does not use this lowering.
22021 return SDValue();
22022 }
22023
22024 SDLoc DL(N);
22025 SDValue Chain = DAG.getEntryNode();
22026
22027 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
22028
22029 // If we're converting from SSE, the stack slot needs to hold both types.
22030 // Otherwise it only needs to hold the DstVT.
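 // i.e. for an SSE f64 source: store the f64 to the slot, reload it onto the
 // x87 stack with X86ISD::FLD, store it back to the same slot as an integer
 // with X86ISD::FIST, then load the i64 result.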
22031 EVT OtherVT = UseSSE ? SrcVT : DstVT;
22032 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
22033 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
22034 MachinePointerInfo MPI =
22035 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
22036
22037 if (UseSSE) {
22038 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
22039 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
22040 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22041 SDValue Ops[] = { Chain, StackPtr };
22042
22043 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
22044 /*Align*/ std::nullopt,
22045 MachineMemOperand::MOLoad);
22046 Chain = Src.getValue(1);
22047 }
22048
22049 SDValue StoreOps[] = { Chain, Src, StackPtr };
22050 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
22051 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
22052 MachineMemOperand::MOStore);
22053
22054 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
22055}
22056
22057SDValue
22058X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
22059 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
22060 // but making use of X86 specifics to produce better instruction sequences.
22061 SDNode *Node = Op.getNode();
22062 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
22063 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
22064 SDLoc dl(SDValue(Node, 0));
22065 SDValue Src = Node->getOperand(0);
22066
22067 // There are three types involved here: SrcVT is the source floating point
22068 // type, DstVT is the type of the result, and TmpVT is the result of the
22069 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
22070 // DstVT).
22071 EVT SrcVT = Src.getValueType();
22072 EVT DstVT = Node->getValueType(0);
22073 EVT TmpVT = DstVT;
22074
22075 // This code is only for floats and doubles. Fall back to generic code for
22076 // anything else.
22077 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
22078 return SDValue();
22079
22080 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
22081 unsigned SatWidth = SatVT.getScalarSizeInBits();
22082 unsigned DstWidth = DstVT.getScalarSizeInBits();
22083 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
22084 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
22085 "Expected saturation width smaller than result width");
22086
22087 // Promote result of FP_TO_*INT to at least 32 bits.
22088 if (TmpWidth < 32) {
22089 TmpVT = MVT::i32;
22090 TmpWidth = 32;
22091 }
22092
22093 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
22094 // us to use a native signed conversion instead.
22095 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
22096 TmpVT = MVT::i64;
22097 TmpWidth = 64;
22098 }
22099
22100 // If the saturation width is smaller than the size of the temporary result,
22101 // we can always use signed conversion, which is native.
22102 if (SatWidth < TmpWidth)
22103 FpToIntOpcode = ISD::FP_TO_SINT;
22104
22105 // Determine minimum and maximum integer values and their corresponding
22106 // floating-point values.
22107 APInt MinInt, MaxInt;
22108 if (IsSigned) {
22109 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
22110 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
22111 } else {
22112 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
22113 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
22114 }
22115
22116 const fltSemantics &Sem = SrcVT.getFltSemantics();
22117 APFloat MinFloat(Sem);
22118 APFloat MaxFloat(Sem);
22119
22120 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
22121 MinInt, IsSigned, APFloat::rmTowardZero);
22122 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
22123 MaxInt, IsSigned, APFloat::rmTowardZero);
22124 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
22125 && !(MaxStatus & APFloat::opStatus::opInexact);
22126
22127 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
22128 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
22129
22130 // If the integer bounds are exactly representable as floats, emit a
22131 // min+max+fptoi sequence. Otherwise use comparisons and selects.
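 // Worked example: for signed f64 -> i32 saturation both bounds are exact as
 // doubles (-2147483648.0 and 2147483647.0), so this path emits an fmax/fmin
 // clamp followed by cvttsd2si and a final NaN -> 0 select. For f32 -> i32 the
 // upper bound is not exactly representable, so the compare/select path
 // further below is used instead.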
22132 if (AreExactFloatBounds) {
22133 if (DstVT != TmpVT) {
22134 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
22135 SDValue MinClamped = DAG.getNode(
22136 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
22137 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
22138 SDValue BothClamped = DAG.getNode(
22139 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
22140 // Convert clamped value to integer.
22141 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
22142
22143 // NaN will become INDVAL, with the top bit set and the rest zero.
22144 // Truncation will discard the top bit, resulting in zero.
22145 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22146 }
22147
22148 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
22149 SDValue MinClamped = DAG.getNode(
22150 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
22151 // Clamp by MaxFloat from above. NaN cannot occur.
22152 SDValue BothClamped = DAG.getNode(
22153 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
22154 // Convert clamped value to integer.
22155 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
22156
22157 if (!IsSigned) {
22158 // In the unsigned case we're done, because we mapped NaN to MinFloat,
22159 // which is zero.
22160 return FpToInt;
22161 }
22162
22163 // Otherwise, select zero if Src is NaN.
22164 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22165 return DAG.getSelectCC(
22166 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
22167 }
22168
22169 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
22170 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
22171
22172 // Result of direct conversion, which may be selected away.
22173 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
22174
22175 if (DstVT != TmpVT) {
22176 // NaN will become INDVAL, with the top bit set and the rest zero.
22177 // Truncation will discard the top bit, resulting in zero.
22178 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
22179 }
22180
22181 SDValue Select = FpToInt;
22182 // For signed conversions where we saturate to the same size as the
22183 // result type of the fptoi instructions, INDVAL coincides with integer
22184 // minimum, so we don't need to explicitly check it.
22185 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
22186 // If Src ULT MinFloat, select MinInt. In particular, this also selects
22187 // MinInt if Src is NaN.
22188 Select = DAG.getSelectCC(
22189 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
22190 }
22191
22192 // If Src OGT MaxFloat, select MaxInt.
22193 Select = DAG.getSelectCC(
22194 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
22195
22196 // In the unsigned case we are done, because we mapped NaN to MinInt, which
22197 // is already zero. The promoted case was already handled above.
22198 if (!IsSigned || DstVT != TmpVT) {
22199 return Select;
22200 }
22201
22202 // Otherwise, select 0 if Src is NaN.
22203 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
22204 return DAG.getSelectCC(
22205 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
22206}
22207
22208SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
22209 bool IsStrict = Op->isStrictFPOpcode();
22210
22211 SDLoc DL(Op);
22212 MVT VT = Op.getSimpleValueType();
22213 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22214 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22215 MVT SVT = In.getSimpleValueType();
22216
22217 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
22218 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
22219 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
22220 !Subtarget.getTargetTriple().isOSDarwin()))
22221 return SDValue();
22222
22223 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
22224 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
22225 return Op;
22226
22227 if (SVT == MVT::f16) {
22228 if (Subtarget.hasFP16())
22229 return Op;
22230
22231 if (VT != MVT::f32) {
22232 if (IsStrict)
22233 return DAG.getNode(
22234 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
22235 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
22236 {MVT::f32, MVT::Other}, {Chain, In})});
22237
22238 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
22239 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
22240 }
22241
22242 if (!Subtarget.hasF16C()) {
22243 if (!Subtarget.getTargetTriple().isOSDarwin())
22244 return SDValue();
22245
22246 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
22247
22248 // Need a libcall, but ABI for f16 is soft-float on MacOS.
22249 TargetLowering::CallLoweringInfo CLI(DAG);
22250 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22251
22252 In = DAG.getBitcast(MVT::i16, In);
22253 TargetLowering::ArgListTy Args;
22254 TargetLowering::ArgListEntry Entry(
22255 In, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()));
22256 Entry.IsSExt = false;
22257 Entry.IsZExt = true;
22258 Args.push_back(Entry);
22259
22260 SDValue Callee = DAG.getExternalSymbol(
22261 getLibcallName(RTLIB::FPEXT_F16_F32),
22262 getPointerTy(DAG.getDataLayout()));
22263 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22264 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
22265 std::move(Args));
22266
22267 SDValue Res;
22268 std::tie(Res,Chain) = LowerCallTo(CLI);
22269 if (IsStrict)
22270 Res = DAG.getMergeValues({Res, Chain}, DL);
22271
22272 return Res;
22273 }
22274
22275 In = DAG.getBitcast(MVT::i16, In);
22276 SDValue Res;
22277 if (IsStrict) {
22278 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
22279 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
22280 DAG.getVectorIdxConstant(0, DL));
22281 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
22282 {Chain, In});
22283 Chain = Res.getValue(1);
22284 } else {
22285 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
22286 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
22287 DAG.getUNDEF(MVT::v4i32), In,
22288 DAG.getVectorIdxConstant(0, DL));
22289 In = DAG.getBitcast(MVT::v8i16, In);
22290 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
22291 DAG.getTargetConstant(4, DL, MVT::i32));
22292 }
22293 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
22294 DAG.getVectorIdxConstant(0, DL));
22295 if (IsStrict)
22296 return DAG.getMergeValues({Res, Chain}, DL);
22297 return Res;
22298 }
22299
22300 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
22301 return Op;
22302
22303 if (SVT.getVectorElementType() == MVT::f16) {
22304 if (Subtarget.hasFP16() && isTypeLegal(SVT))
22305 return Op;
22306 assert(Subtarget.hasF16C() && "Unexpected features!");
22307 if (SVT == MVT::v2f16)
22308 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
22309 DAG.getUNDEF(MVT::v2f16));
22310 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
22311 DAG.getUNDEF(MVT::v4f16));
22312 if (IsStrict)
22313 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22314 {Op->getOperand(0), Res});
22315 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22316 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
22317 return Op;
22318 }
22319
22320 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
22321
22322 SDValue Res =
22323 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
22324 if (IsStrict)
22325 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
22326 {Op->getOperand(0), Res});
22327 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
22328}
22329
22330SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
22331 bool IsStrict = Op->isStrictFPOpcode();
22332
22333 SDLoc DL(Op);
22334 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
22335 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
22336 MVT VT = Op.getSimpleValueType();
22337 MVT SVT = In.getSimpleValueType();
22338
22339 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
22340 return SDValue();
22341
22342 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
22343 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
22344 if (!Subtarget.getTargetTriple().isOSDarwin())
22345 return SDValue();
22346
22347 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
22348 TargetLowering::CallLoweringInfo CLI(DAG);
22349 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22350
22351 TargetLowering::ArgListTy Args;
22352 TargetLowering::ArgListEntry Entry(
22353 In, EVT(SVT).getTypeForEVT(*DAG.getContext()));
22354 Entry.IsSExt = false;
22355 Entry.IsZExt = true;
22356 Args.push_back(Entry);
22357
22358 SDValue Callee = DAG.getExternalSymbol(
22359 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
22360 : RTLIB::FPROUND_F32_F16),
22361 getPointerTy(DAG.getDataLayout()));
22362 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
22363 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
22364 std::move(Args));
22365
22366 SDValue Res;
22367 std::tie(Res, Chain) = LowerCallTo(CLI);
22368
22369 Res = DAG.getBitcast(MVT::f16, Res);
22370
22371 if (IsStrict)
22372 Res = DAG.getMergeValues({Res, Chain}, DL);
22373
22374 return Res;
22375 }
22376
22377 if (VT.getScalarType() == MVT::bf16) {
22378 if (SVT.getScalarType() == MVT::f32 &&
22379 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22380 Subtarget.hasAVXNECONVERT()))
22381 return Op;
22382 return SDValue();
22383 }
22384
22385 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
22386 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
22387 return SDValue();
22388
22389 if (VT.isVector())
22390 return Op;
22391
22392 SDValue Res;
22393 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
22394 MVT::i32);
22395 if (IsStrict) {
22396 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
22397 DAG.getConstantFP(0, DL, MVT::v4f32), In,
22398 DAG.getVectorIdxConstant(0, DL));
22399 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
22400 {Chain, Res, Rnd});
22401 Chain = Res.getValue(1);
22402 } else {
22403 // FIXME: Should we use zeros for upper elements for non-strict?
22404 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
22405 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
22406 }
22407
22408 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22409 DAG.getVectorIdxConstant(0, DL));
22410 Res = DAG.getBitcast(MVT::f16, Res);
22411
22412 if (IsStrict)
22413 return DAG.getMergeValues({Res, Chain}, DL);
22414
22415 return Res;
22416 }
22417
22418 return Op;
22419}
22420
22421static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
22422 bool IsStrict = Op->isStrictFPOpcode();
22423 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22424 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
22425 "Unexpected VT!");
22426
22427 SDLoc dl(Op);
22428 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
22429 DAG.getConstant(0, dl, MVT::v8i16), Src,
22430 DAG.getVectorIdxConstant(0, dl));
22431
22432 SDValue Chain;
22433 if (IsStrict) {
22434 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
22435 {Op.getOperand(0), Res});
22436 Chain = Res.getValue(1);
22437 } else {
22438 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
22439 }
22440
22441 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
22442 DAG.getVectorIdxConstant(0, dl));
22443
22444 if (IsStrict)
22445 return DAG.getMergeValues({Res, Chain}, dl);
22446
22447 return Res;
22448}
22449
22450static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
22451 bool IsStrict = Op->isStrictFPOpcode();
22452 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22453 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
22454 "Unexpected VT!");
22455
22456 SDLoc dl(Op);
22457 SDValue Res, Chain;
22458 if (IsStrict) {
22459 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
22460 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
22461 DAG.getVectorIdxConstant(0, dl));
22462 Res = DAG.getNode(
22463 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
22464 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
22465 Chain = Res.getValue(1);
22466 } else {
22467 // FIXME: Should we use zeros for upper elements for non-strict?
22468 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
22469 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
22470 DAG.getTargetConstant(4, dl, MVT::i32));
22471 }
22472
22473 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
22474 DAG.getVectorIdxConstant(0, dl));
22475
22476 if (IsStrict)
22477 return DAG.getMergeValues({Res, Chain}, dl);
22478
22479 return Res;
22480}
22481
22482SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
22483 SelectionDAG &DAG) const {
22484 SDLoc DL(Op);
22485
22486 MVT SVT = Op.getOperand(0).getSimpleValueType();
22487 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
22488 Subtarget.hasAVXNECONVERT())) {
22489 SDValue Res;
22490 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
22491 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
22492 Res = DAG.getBitcast(MVT::v8i16, Res);
22493 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
22494 DAG.getVectorIdxConstant(0, DL));
22495 }
22496
22497 MakeLibCallOptions CallOptions;
22498 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
22499 SDValue Res =
22500 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
22501 return DAG.getBitcast(MVT::i16, Res);
22502}
22503
22504/// Depending on uarch and/or optimizing for size, we might prefer to use a
22505/// vector operation in place of the typical scalar operation.
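/// e.g. (fadd (extractelt v4f32 X, 0), (extractelt v4f32 X, 1)) can become
/// (extractelt (X86ISD::FHADD X, X), 0), i.e. a single HADDPS, when the
/// shouldUseHorizontalOp heuristic considers it profitable.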
22506static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
22507 SelectionDAG &DAG,
22508 const X86Subtarget &Subtarget) {
22509 // If both operands have other uses, this is probably not profitable.
22510 SDValue LHS = Op.getOperand(0);
22511 SDValue RHS = Op.getOperand(1);
22512 if (!LHS.hasOneUse() && !RHS.hasOneUse())
22513 return Op;
22514
22515 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
22516 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
22517 if (IsFP && !Subtarget.hasSSE3())
22518 return Op;
22519 if (!IsFP && !Subtarget.hasSSSE3())
22520 return Op;
22521
22522 // Extract from a common vector.
22523 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22524 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22525 LHS.getOperand(0) != RHS.getOperand(0) ||
22526 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
22527 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
22528 !shouldUseHorizontalOp(true, DAG, Subtarget))
22529 return Op;
22530
22531 // Allow commuted 'hadd' ops.
22532 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
22533 unsigned HOpcode;
22534 switch (Op.getOpcode()) {
22535 // clang-format off
22536 case ISD::ADD: HOpcode = X86ISD::HADD; break;
22537 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
22538 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
22539 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
22540 default:
22541 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
22542 // clang-format on
22543 }
22544 unsigned LExtIndex = LHS.getConstantOperandVal(1);
22545 unsigned RExtIndex = RHS.getConstantOperandVal(1);
22546 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
22547 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
22548 std::swap(LExtIndex, RExtIndex);
22549
22550 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
22551 return Op;
22552
22553 SDValue X = LHS.getOperand(0);
22554 EVT VecVT = X.getValueType();
22555 unsigned BitWidth = VecVT.getSizeInBits();
22556 unsigned NumLanes = BitWidth / 128;
22557 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
22558 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
22559 "Not expecting illegal vector widths here");
22560
22561 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
22562 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
22563 if (BitWidth == 256 || BitWidth == 512) {
22564 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
22565 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
22566 LExtIndex %= NumEltsPerLane;
22567 }
22568
22569 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
22570 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
22571 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
22572 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
22573 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
22574 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
22575 DAG.getVectorIdxConstant(LExtIndex / 2, DL));
22576}
22577
22578/// Depending on uarch and/or optimizing for size, we might prefer to use a
22579/// vector operation in place of the typical scalar operation.
22580SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
22581 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
22582 "Only expecting float/double");
22583 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
22584}
22585
22586/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
22587/// This mode isn't supported in hardware on X86. But as long as we aren't
22588/// compiling with trapping math, we can emulate this with
22589/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
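/// Why nextafter(0.5, 0.0) rather than 0.5: for the largest f32 below 0.5
/// (0.5 - 2^-25), adding 0.5 rounds the sum up to 1.0 and the trunc would
/// return 1.0 instead of 0.0; adding 0.5 - 2^-25 instead gives 1.0 - 2^-24,
/// which truncates to 0.0 as required.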
22590static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
22591 SDValue N0 = Op.getOperand(0);
22592 SDLoc dl(Op);
22593 MVT VT = Op.getSimpleValueType();
22594
22595 // N0 += copysign(nextafter(0.5, 0.0), N0)
22596 const fltSemantics &Sem = VT.getFltSemantics();
22597 bool Ignored;
22598 APFloat Point5Pred = APFloat(0.5f);
22599 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
22600 Point5Pred.next(/*nextDown*/true);
22601
22602 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
22603 DAG.getConstantFP(Point5Pred, dl, VT), N0);
22604 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
22605
22606 // Truncate the result to remove fraction.
22607 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
22608}
22609
22610/// The only differences between FABS and FNEG are the mask and the logic op.
22611/// FNEG also has a folding opportunity for FNEG(FABS(x)).
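/// In bit terms (f32 shown): fabs(x) clears the sign bit (x & 0x7fffffff),
/// fneg(x) flips it (x ^ 0x80000000), and fneg(fabs(x)) sets it
/// (x | 0x80000000), i.e. FNABS.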
22612static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
22613 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
22614 "Wrong opcode for lowering FABS or FNEG.");
22615
22616 bool IsFABS = (Op.getOpcode() == ISD::FABS);
22617
22618 // If this is a FABS and it has an FNEG user, bail out to fold the combination
22619 // into an FNABS. We'll lower the FABS after that if it is still in use.
22620 if (IsFABS)
22621 for (SDNode *User : Op->users())
22622 if (User->getOpcode() == ISD::FNEG)
22623 return Op;
22624
22625 SDLoc dl(Op);
22626 MVT VT = Op.getSimpleValueType();
22627
22628 bool IsF128 = (VT == MVT::f128);
22629 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22631 "Unexpected type in LowerFABSorFNEG");
22632
22633 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
22634 // decide if we should generate a 16-byte constant mask when we only need 4 or
22635 // 8 bytes for the scalar case.
22636
22637 // There are no scalar bitwise logical SSE/AVX instructions, so we
22638 // generate a 16-byte vector constant and logic op even for the scalar case.
22639 // Using a 16-byte mask allows folding the load of the mask with
22640 // the logic op, so it can save (~4 bytes) on code size.
22641 bool IsFakeVector = !VT.isVector() && !IsF128;
22642 MVT LogicVT = VT;
22643 if (IsFakeVector)
22644 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22645 : (VT == MVT::f32) ? MVT::v4f32
22646 : MVT::v8f16;
22647
22648 unsigned EltBits = VT.getScalarSizeInBits();
22649 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
22650 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
22651 APInt::getSignMask(EltBits);
22652 const fltSemantics &Sem = VT.getFltSemantics();
22653 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
22654
22655 SDValue Op0 = Op.getOperand(0);
22656 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
22657 unsigned LogicOp = IsFABS ? X86ISD::FAND :
22658 IsFNABS ? X86ISD::FOR :
22659 X86ISD::FXOR;
22660 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
22661
22662 if (VT.isVector() || IsF128)
22663 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22664
22665 // For the scalar case extend to a 128-bit vector, perform the logic op,
22666 // and extract the scalar result back out.
22667 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
22668 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
22669 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
22670 DAG.getVectorIdxConstant(0, dl));
22671}
22672
22673static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
22674 SDValue Mag = Op.getOperand(0);
22675 SDValue Sign = Op.getOperand(1);
22676 SDLoc dl(Op);
22677
22678 // If the sign operand is smaller, extend it first.
22679 MVT VT = Op.getSimpleValueType();
22680 if (Sign.getSimpleValueType().bitsLT(VT))
22681 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
22682
22683 // And if it is bigger, shrink it first.
22684 if (Sign.getSimpleValueType().bitsGT(VT))
22685 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
22686 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22687
22688 // At this point the operands and the result should have the same
22689 // type, and that won't be f80 since that is not custom lowered.
22690 bool IsF128 = (VT == MVT::f128);
22691 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
22693 "Unexpected type in LowerFCOPYSIGN");
22694
22695 const fltSemantics &Sem = VT.getFltSemantics();
22696
22697 // Perform all scalar logic operations as 16-byte vectors because there are no
22698 // scalar FP logic instructions in SSE.
22699 // TODO: This isn't necessary. If we used scalar types, we might avoid some
22700 // unnecessary splats, but we might miss load folding opportunities. Should
22701 // this decision be based on OptimizeForSize?
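 // In bit terms (f32 shown): copysign(mag, sgn) =
 //   (mag & 0x7fffffff) | (sgn & 0x80000000),
 // which is exactly the FAND/FAND/FOR sequence built below.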
22702 bool IsFakeVector = !VT.isVector() && !IsF128;
22703 MVT LogicVT = VT;
22704 if (IsFakeVector)
22705 LogicVT = (VT == MVT::f64) ? MVT::v2f64
22706 : (VT == MVT::f32) ? MVT::v4f32
22707 : MVT::v8f16;
22708
22709 // The mask constants are automatically splatted for vector types.
22710 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22711 SDValue SignMask = DAG.getConstantFP(
22712 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
22713 SDValue MagMask = DAG.getConstantFP(
22714 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
22715
22716 // First, clear all bits but the sign bit from the second operand (sign).
22717 if (IsFakeVector)
22718 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
22719 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
22720
22721 // Next, clear the sign bit from the first operand (magnitude).
22722 // TODO: If we had general constant folding for FP logic ops, this check
22723 // wouldn't be necessary.
22724 SDValue MagBits;
22725 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
22726 APFloat APF = Op0CN->getValueAPF();
22727 APF.clearSign();
22728 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
22729 } else {
22730 // If the magnitude operand wasn't a constant, we need to AND out the sign.
22731 if (IsFakeVector)
22732 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
22733 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
22734 }
22735
22736 // OR the magnitude value with the sign bit.
22737 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
22738 return !IsFakeVector ? Or
22739 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
22740 DAG.getVectorIdxConstant(0, dl));
22741}
22742
22743static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
22744 SDValue N0 = Op.getOperand(0);
22745 SDLoc dl(Op);
22746 MVT VT = Op.getSimpleValueType();
22747
22748 MVT OpVT = N0.getSimpleValueType();
22749 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
22750 "Unexpected type for FGETSIGN");
22751
22752 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
22753 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
22754 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
22755 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
22756 Res = DAG.getZExtOrTrunc(Res, dl, VT);
22757 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
22758 return Res;
22759}
22760
22761/// Helper for attempting to create a X86ISD::BT node.
22762static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
22763 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
22764 // instruction. Since the shift amount is in-range-or-undefined, we know
22765 // that doing a bittest on the i32 value is ok. We extend to i32 because
22766 // the encoding for the i16 version is larger than the i32 version.
22767 // Also promote i16 to i32 for performance / code size reason.
22768 if (Src.getValueType().getScalarSizeInBits() < 32)
22769 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
22770
22771 // No legal type found, give up.
22772 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
22773 return SDValue();
22774
22775 // See if we can use the 32-bit instruction instead of the 64-bit one for a
22776 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
22777 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
22778 // known to be zero.
22779 if (Src.getValueType() == MVT::i64 &&
22780 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
22781 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
22782
22783 // If the operand types disagree, extend the shift amount to match. Since
22784 // BT ignores high bits (like shifts) we can use anyextend.
22785 if (Src.getValueType() != BitNo.getValueType()) {
22786 // Peek through a mask/modulo operation.
22787 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
22788 // we probably need a better IsDesirableToPromoteOp to handle this as well.
22789 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
22790 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
22791 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22792 BitNo.getOperand(0)),
22793 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
22794 BitNo.getOperand(1)));
22795 else
22796 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
22797 }
22798
22799 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
22800}
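// Illustrative sketch (editorial addition, not part of this file): the scalar
// semantics getBT() relies on. Register-form BT masks the bit index modulo the
// operand width, which is why any-extending or truncating the index is safe.
#include <cassert>
#include <cstdint>
static bool bitTest32(uint32_t Src, uint32_t BitNo) {
  // Hardware BT r32, r32 uses BitNo % 32; the extra high bits are ignored.
  return (Src >> (BitNo & 31)) & 1u;
}
int main() {
  assert(bitTest32(0b1010, 1));
  assert(bitTest32(0b1010, 33)); // 33 % 32 == 1, same bit as above
  assert(!bitTest32(0b1010, 2));
}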
22801
22802/// Helper for creating a X86ISD::SETCC node.
22803static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
22804 SelectionDAG &DAG) {
22805 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22806 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22807}
22808
22809/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22810/// recognizable memcmp expansion.
22811static bool isOrXorXorTree(SDValue X, bool Root = true) {
22812 if (X.getOpcode() == ISD::OR)
22813 return isOrXorXorTree(X.getOperand(0), false) &&
22814 isOrXorXorTree(X.getOperand(1), false);
22815 if (Root)
22816 return false;
22817 return X.getOpcode() == ISD::XOR;
22818}
22819
22820/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22821/// expansion.
22822template <typename F>
22823static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22824 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22825 SDValue Op0 = X.getOperand(0);
22826 SDValue Op1 = X.getOperand(1);
22827 if (X.getOpcode() == ISD::OR) {
22828 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22829 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22830 if (VecVT != CmpVT)
22831 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22832 if (HasPT)
22833 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22834 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22835 }
22836 if (X.getOpcode() == ISD::XOR) {
22837 SDValue A = SToV(Op0);
22838 SDValue B = SToV(Op1);
22839 if (VecVT != CmpVT)
22840 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22841 if (HasPT)
22842 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22843 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22844 }
22845 llvm_unreachable("Impossible");
22846}
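// Illustrative sketch (editorial addition, not from this file): the scalar
// identity behind isOrXorXorTree/emitOrXorXorTree. An OR of XORs is zero
// exactly when every XOR'd pair is equal, which is the pattern the memcmp
// expansion emits for oversized integer equality compares.
#include <cassert>
#include <cstdint>
int main() {
  uint64_t A = 1, B = 1, C = 42, D = 42;
  bool TreeIsZero = ((A ^ B) | (C ^ D)) == 0;
  bool PairwiseEqual = (A == B) && (C == D);
  assert(TreeIsZero == PairwiseEqual);
}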
22847
22848/// Try to map a 128-bit or larger integer comparison to vector instructions
22849/// before type legalization splits it up into chunks.
22850static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22851 ISD::CondCode CC,
22852 const SDLoc &DL,
22853 SelectionDAG &DAG,
22854 const X86Subtarget &Subtarget) {
22855 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22856
22857 // We're looking for an oversized integer equality comparison.
22858 EVT OpVT = X.getValueType();
22859 unsigned OpSize = OpVT.getSizeInBits();
22860 if (!OpVT.isScalarInteger() || OpSize < 128)
22861 return SDValue();
22862
22863 // Ignore a comparison with zero because that gets special treatment in
22864 // EmitTest(). But make an exception for the special case of a pair of
22865 // logically-combined vector-sized operands compared to zero. This pattern may
22866 // be generated by the memcmp expansion pass with oversized integer compares
22867 // (see PR33325).
22868 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22869 if (isNullConstant(Y) && OpSize == 128 && !IsOrXorXorTreeCCZero)
22870 return SDValue();
22871
22872 // Don't perform this combine if constructing the vector will be expensive.
22873  auto IsVectorBitCastCheap = [](SDValue X) {
22874    X = peekThroughBitcasts(X);
22875 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22876 X.getOpcode() == ISD::LOAD;
22877 };
22878 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22879 !IsOrXorXorTreeCCZero)
22880 return SDValue();
22881
22882 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22883 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22884 // Otherwise use PCMPEQ (plus AND) and mask testing.
22885  bool NoImplicitFloatOps =
22886      DAG.getMachineFunction().getFunction().hasFnAttribute(
22887 Attribute::NoImplicitFloat);
22888 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22889 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22890 (OpSize == 256 && Subtarget.hasAVX()) ||
22891 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22892 bool HasPT = Subtarget.hasSSE41();
22893
22894 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22895 // vector registers are essentially free. (Technically, widening registers
22896 // prevents load folding, but the tradeoff is worth it.)
22897 bool PreferKOT = Subtarget.preferMaskRegisters();
22898 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22899
22900 EVT VecVT = MVT::v16i8;
22901 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22902 if (OpSize == 256) {
22903 VecVT = MVT::v32i8;
22904 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22905 }
22906 EVT CastVT = VecVT;
22907 bool NeedsAVX512FCast = false;
22908 if (OpSize == 512 || NeedZExt) {
22909 if (Subtarget.hasBWI()) {
22910 VecVT = MVT::v64i8;
22911 CmpVT = MVT::v64i1;
22912 if (OpSize == 512)
22913 CastVT = VecVT;
22914 } else {
22915 VecVT = MVT::v16i32;
22916 CmpVT = MVT::v16i1;
22917 CastVT = OpSize == 512 ? VecVT
22918 : OpSize == 256 ? MVT::v8i32
22919 : MVT::v4i32;
22920 NeedsAVX512FCast = true;
22921 }
22922 }
22923
22924 auto ScalarToVector = [&](SDValue X) -> SDValue {
22925 bool TmpZext = false;
22926 EVT TmpCastVT = CastVT;
22927 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22928 SDValue OrigX = X.getOperand(0);
22929 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22930 if (OrigSize < OpSize) {
22931 if (OrigSize == 128) {
22932 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22933 X = OrigX;
22934 TmpZext = true;
22935 } else if (OrigSize == 256) {
22936 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22937 X = OrigX;
22938 TmpZext = true;
22939 }
22940 }
22941 }
22942 X = DAG.getBitcast(TmpCastVT, X);
22943 if (!NeedZExt && !TmpZext)
22944 return X;
22945 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22946 DAG.getConstant(0, DL, VecVT), X,
22947 DAG.getVectorIdxConstant(0, DL));
22948 };
22949
22950 SDValue Cmp;
22951 if (IsOrXorXorTreeCCZero) {
22952 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22953 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22954 // Use 2 vector equality compares and 'and' the results before doing a
22955 // MOVMSK.
22956 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22957 } else {
22958 SDValue VecX = ScalarToVector(X);
22959 SDValue VecY = ScalarToVector(Y);
22960 if (VecVT != CmpVT) {
22961 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22962 } else if (HasPT) {
22963 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22964 } else {
22965 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22966 }
22967 }
22968 // AVX512 should emit a setcc that will lower to kortest.
22969 if (VecVT != CmpVT) {
22970 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22971 : CmpVT == MVT::v32i1 ? MVT::i32
22972 : MVT::i16;
22973 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22974 DAG.getConstant(0, DL, KRegVT), CC);
22975 }
22976 if (HasPT) {
22977 SDValue BCCmp =
22978 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22979 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22980      X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22981      SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22982 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22983 }
22984 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22985 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22986 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22987 assert(Cmp.getValueType() == MVT::v16i8 &&
22988 "Non 128-bit vector on pre-SSE41 target");
22989 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22990 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22991 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22992 }
22993
22994 return SDValue();
22995}
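// Illustrative sketch (editorial addition, not from this file): the pre-SSE4.1
// fallback pattern described above, written with SSE2 intrinsics. All 16 bytes
// compare equal iff PMOVMSKB of PCMPEQB returns 0xFFFF.
#include <emmintrin.h>
static bool equal16(const void *X, const void *Y) {
  __m128i A = _mm_loadu_si128(static_cast<const __m128i *>(X));
  __m128i B = _mm_loadu_si128(static_cast<const __m128i *>(Y));
  __m128i Eq = _mm_cmpeq_epi8(A, B);      // per-byte 0xFF (equal) / 0x00
  return _mm_movemask_epi8(Eq) == 0xFFFF; // all 16 byte comparisons true
}
int main() {
  char A[16] = "0123456789abcde";
  char B[16] = "0123456789abcde";
  return equal16(A, B) ? 0 : 1;
}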
22996
22997/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22998/// style scalarized (associative) reduction patterns. Partial reductions
22999/// are supported when the pointer SrcMask is non-null.
23000/// TODO - move this to SelectionDAG?
23001static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
23002                                 SmallVectorImpl<SDValue> &SrcOps,
23003                                 SmallVectorImpl<APInt> *SrcMask = nullptr) {
23004  SmallVector<SDValue, 8> Opnds;
23005 DenseMap<SDValue, APInt> SrcOpMap;
23006 EVT VT = MVT::Other;
23007
23008 // Recognize a special case where a vector is casted into wide integer to
23009 // test all 0s.
23010 assert(Op.getOpcode() == unsigned(BinOp) &&
23011 "Unexpected bit reduction opcode");
23012 Opnds.push_back(Op.getOperand(0));
23013 Opnds.push_back(Op.getOperand(1));
23014
23015  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23016    SDValue I = Opnds[Slot];
23017 // BFS traverse all BinOp operands.
23018 if (I->getOpcode() == unsigned(BinOp)) {
23019 Opnds.push_back(I->getOperand(0));
23020 Opnds.push_back(I->getOperand(1));
23021 // Re-evaluate the number of nodes to be traversed.
23022 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23023 continue;
23024 }
23025
23026 // Quit if a non-EXTRACT_VECTOR_ELT
23027 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23028 return false;
23029
23030 // Quit if without a constant index.
23031 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23032 if (!Idx)
23033 return false;
23034
23035 SDValue Src = I->getOperand(0);
23036 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23037 if (M == SrcOpMap.end()) {
23038 VT = Src.getValueType();
23039 // Quit if not the same type.
23040 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23041 return false;
23042 unsigned NumElts = VT.getVectorNumElements();
23043 APInt EltCount = APInt::getZero(NumElts);
23044 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23045 SrcOps.push_back(Src);
23046 }
23047
23048 // Quit if element already used.
23049 unsigned CIdx = Idx->getZExtValue();
23050 if (M->second[CIdx])
23051 return false;
23052 M->second.setBit(CIdx);
23053 }
23054
23055 if (SrcMask) {
23056 // Collect the source partial masks.
23057 for (SDValue &SrcOp : SrcOps)
23058 SrcMask->push_back(SrcOpMap[SrcOp]);
23059 } else {
23060 // Quit if not all elements are used.
23061 for (const auto &I : SrcOpMap)
23062 if (!I.second.isAllOnes())
23063 return false;
23064 }
23065
23066 return true;
23067}
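// Illustrative sketch (editorial addition, not from this file): the shape that
// matchScalarReduction() recognizes, written in scalar form. A reduction built
// from individual element extracts combined by one associative binop is folded
// back to its single vector source (with a full, or partial, element mask).
#include <cassert>
#include <cstdint>
int main() {
  uint32_t X[4] = {0, 0, 4, 0};
  // or(or(x[0], x[1]), or(x[2], x[3])) is the scalarized tree form.
  uint32_t Reduced = (X[0] | X[1]) | (X[2] | X[3]);
  assert((Reduced != 0) == (X[0] || X[1] || X[2] || X[3]));
}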
23068
23069// Helper function for comparing all bits of two vectors.
23070static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
23071 ISD::CondCode CC, const APInt &OriginalMask,
23072 const X86Subtarget &Subtarget,
23073 SelectionDAG &DAG, X86::CondCode &X86CC) {
23074 EVT VT = LHS.getValueType();
23075 unsigned ScalarSize = VT.getScalarSizeInBits();
23076 if (OriginalMask.getBitWidth() != ScalarSize) {
23077 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23078 return SDValue();
23079 }
23080
23081  // Quit if not convertible to legal scalar or 128/256-bit vector.
23082  if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23083 return SDValue();
23084
23085 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
23086 if (VT.isFloatingPoint())
23087 return SDValue();
23088
23089 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23090 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23091
23092 APInt Mask = OriginalMask;
23093
23094 auto MaskBits = [&](SDValue Src) {
23095 if (Mask.isAllOnes())
23096 return Src;
23097 EVT SrcVT = Src.getValueType();
23098 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23099 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23100 };
23101
23102 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23103 if (VT.getSizeInBits() < 128) {
23104 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23105 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
23106 if (IntVT != MVT::i64)
23107 return SDValue();
23108 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
23109 MVT::i32, MVT::i32);
23110 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
23111 MVT::i32, MVT::i32);
23112 SDValue Lo =
23113 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
23114 SDValue Hi =
23115 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
23116 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23117 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
23118 DAG.getConstant(0, DL, MVT::i32));
23119 }
23120 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23121 DAG.getBitcast(IntVT, MaskBits(LHS)),
23122 DAG.getBitcast(IntVT, MaskBits(RHS)));
23123 }
23124
23125 // Without PTEST, a masked v2i64 or-reduction is not faster than
23126 // scalarization.
23127 bool UseKORTEST = Subtarget.useAVX512Regs();
23128 bool UsePTEST = Subtarget.hasSSE41();
23129 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
23130 return SDValue();
23131
23132 // Split down to 128/256/512-bit vector.
23133 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
23134
23135 // If the input vector has vector elements wider than the target test size,
23136 // then cast to <X x i64> so it will safely split.
23137 if (ScalarSize > TestSize) {
23138 if (!Mask.isAllOnes())
23139 return SDValue();
23140 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
23141 LHS = DAG.getBitcast(VT, LHS);
23142 RHS = DAG.getBitcast(VT, RHS);
23143 Mask = APInt::getAllOnes(64);
23144 }
23145
23146 if (VT.getSizeInBits() > TestSize) {
23147 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
23148 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
23149 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
23150 while (VT.getSizeInBits() > TestSize) {
23151 auto Split = DAG.SplitVector(LHS, DL);
23152 VT = Split.first.getValueType();
23153 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23154 }
23155 RHS = DAG.getAllOnesConstant(DL, VT);
23156 } else if (!UsePTEST && !KnownRHS.isZero()) {
23157 // MOVMSK Special Case:
23158 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
23159 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
23160 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
23161 LHS = DAG.getBitcast(VT, MaskBits(LHS));
23162 RHS = DAG.getBitcast(VT, MaskBits(RHS));
23163 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
23164 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
23165 V = DAG.getSExtOrTrunc(V, DL, VT);
23166 while (VT.getSizeInBits() > TestSize) {
23167 auto Split = DAG.SplitVector(V, DL);
23168 VT = Split.first.getValueType();
23169 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
23170 }
23171 V = DAG.getNOT(DL, V, VT);
23172 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23173 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23174 DAG.getConstant(0, DL, MVT::i32));
23175 } else {
23176 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
23177 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
23178 while (VT.getSizeInBits() > TestSize) {
23179 auto Split = DAG.SplitVector(V, DL);
23180 VT = Split.first.getValueType();
23181 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23182 }
23183 LHS = V;
23184 RHS = DAG.getConstant(0, DL, VT);
23185 }
23186 }
23187
23188 if (UseKORTEST && VT.is512BitVector()) {
23189 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
23190 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
23191 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23192 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23193 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
23194 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
23195 }
23196
23197 if (UsePTEST) {
23198 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
23199 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
23200 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
23201 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
23202 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23203 }
23204
23205 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
23206 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
23207 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
23208 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
23209 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
23210 V = DAG.getNOT(DL, V, MaskVT);
23211 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23212 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23213 DAG.getConstant(0, DL, MVT::i32));
23214}
23215
23216// Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fallback
23217// to CMP(MOVMSK(PCMPEQB(X,Y))).
23218static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS,
23219 ISD::CondCode CC, const SDLoc &DL,
23220 const X86Subtarget &Subtarget,
23221 SelectionDAG &DAG,
23222 X86::CondCode &X86CC) {
23223 SDValue Op = OrigLHS;
23224
23225 bool CmpNull;
23226 APInt Mask;
23227 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23228 CmpNull = isNullConstant(OrigRHS);
23229 if (!CmpNull && !isAllOnesConstant(OrigRHS))
23230 return SDValue();
23231
23232 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23233 return SDValue();
23234
23235 // Check whether we're masking/truncating an OR-reduction result, in which
23236 // case track the masked bits.
23237 // TODO: Add CmpAllOnes support.
23238 Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23239 if (CmpNull) {
23240 switch (Op.getOpcode()) {
23241 case ISD::TRUNCATE: {
23242 SDValue Src = Op.getOperand(0);
23243 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23244 Op.getScalarValueSizeInBits());
23245 Op = Src;
23246 break;
23247 }
23248 case ISD::AND: {
23249 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23250 Mask = Cst->getAPIntValue();
23251 Op = Op.getOperand(0);
23252 }
23253 break;
23254 }
23255 }
23256 }
23257 } else if (CC == ISD::SETGT && isAllOnesConstant(OrigRHS)) {
23258 CC = ISD::SETEQ;
23259 CmpNull = true;
23260 Mask = APInt::getSignMask(Op.getScalarValueSizeInBits());
23261 } else {
23262 return SDValue();
23263 }
23264
23265 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
23266
23267 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
23268  // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
23269  SmallVector<SDValue, 8> VecIns;
23270 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
23271 EVT VT = VecIns[0].getValueType();
23272 assert(llvm::all_of(VecIns,
23273 [VT](SDValue V) { return VT == V.getValueType(); }) &&
23274 "Reduction source vector mismatch");
23275
23276    // Quit if not splittable to scalar/128/256/512-bit vector.
23277    if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
23278 return SDValue();
23279
23280 // If more than one full vector is evaluated, AND/OR them first before
23281 // PTEST.
23282 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
23283 Slot += 2, e += 1) {
23284 // Each iteration will AND/OR 2 nodes and append the result until there is
23285 // only 1 node left, i.e. the final value of all vectors.
23286 SDValue LHS = VecIns[Slot];
23287 SDValue RHS = VecIns[Slot + 1];
23288 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
23289 }
23290
23291 return LowerVectorAllEqual(DL, VecIns.back(),
23292 CmpNull ? DAG.getConstant(0, DL, VT)
23293 : DAG.getAllOnesConstant(DL, VT),
23294 CC, Mask, Subtarget, DAG, X86CC);
23295 }
23296
23297 // Match icmp(reduce_or(X),0) anyof reduction patterns.
23298 // Match icmp(reduce_and(X),-1) allof reduction patterns.
23299 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23300 ISD::NodeType BinOp;
23301 if (SDValue Match =
23302 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
23303 EVT MatchVT = Match.getValueType();
23304 return LowerVectorAllEqual(DL, Match,
23305 CmpNull ? DAG.getConstant(0, DL, MatchVT)
23306 : DAG.getAllOnesConstant(DL, MatchVT),
23307 CC, Mask, Subtarget, DAG, X86CC);
23308 }
23309 }
23310
23311 if (Mask.isAllOnes()) {
23312 assert(!Op.getValueType().isVector() &&
23313           "Illegal vector type for reduction pattern");
23314    SDValue Src = peekThroughBitcasts(Op);
23315 if (Src.getValueType().isFixedLengthVector() &&
23316 Src.getValueType().getScalarType() == MVT::i1) {
23317 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
23318 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
23319 if (Src.getOpcode() == ISD::SETCC) {
23320 SDValue LHS = Src.getOperand(0);
23321 SDValue RHS = Src.getOperand(1);
23322 EVT LHSVT = LHS.getValueType();
23323 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
23324        if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
23325            llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
23326 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
23327 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
23328 X86CC);
23329 }
23330 }
23331 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
23332 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
23333 // Peek through truncation, mask the LSB and compare against zero/LSB.
23334 if (Src.getOpcode() == ISD::TRUNCATE) {
23335 SDValue Inner = Src.getOperand(0);
23336        EVT InnerVT = Inner.getValueType();
23337        if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
23338 unsigned BW = InnerVT.getScalarSizeInBits();
23339 APInt SrcMask = APInt(BW, 1);
23340 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
23341 return LowerVectorAllEqual(DL, Inner,
23342 DAG.getConstant(Cmp, DL, InnerVT), CC,
23343 SrcMask, Subtarget, DAG, X86CC);
23344 }
23345 }
23346 }
23347 }
23348
23349 return SDValue();
23350}
23351
23352/// return true if \c Op has a use that doesn't just read flags.
23353static bool hasNonFlagsUse(SDValue Op) {
23354 for (SDUse &Use : Op->uses()) {
23355 SDNode *User = Use.getUser();
23356 unsigned UOpNo = Use.getOperandNo();
23357 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
23358 // Look past truncate.
23359 UOpNo = User->use_begin()->getOperandNo();
23360 User = User->use_begin()->getUser();
23361 }
23362
23363 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
23364 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
23365 return true;
23366 }
23367 return false;
23368}
23369
23370// Transform to an x86-specific ALU node with flags if there is a chance of
23371// using an RMW op or only the flags are used. Otherwise, leave
23372// the node alone and emit a 'cmp' or 'test' instruction.
23373static bool isProfitableToUseFlagOp(SDValue Op) {
23374 for (SDNode *U : Op->users())
23375 if (U->getOpcode() != ISD::CopyToReg &&
23376 U->getOpcode() != ISD::SETCC &&
23377 U->getOpcode() != ISD::STORE)
23378 return false;
23379
23380 return true;
23381}
23382
23383/// Emit nodes that will be selected as "test Op0,Op0", or something
23384/// equivalent.
23385static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
23386 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
23387 // CF and OF aren't always set the way we want. Determine which
23388 // of these we need.
23389 bool NeedCF = false;
23390 bool NeedOF = false;
23391 switch (X86CC) {
23392 default: break;
23393 case X86::COND_A: case X86::COND_AE:
23394 case X86::COND_B: case X86::COND_BE:
23395 NeedCF = true;
23396 break;
23397 case X86::COND_G: case X86::COND_GE:
23398 case X86::COND_L: case X86::COND_LE:
23399 case X86::COND_O: case X86::COND_NO: {
23400 // Check if we really need to set the
23401 // Overflow flag. If NoSignedWrap is present
23402 // that is not actually needed.
23403 switch (Op->getOpcode()) {
23404 case ISD::ADD:
23405 case ISD::SUB:
23406 case ISD::MUL:
23407 case ISD::SHL:
23408 if (Op.getNode()->getFlags().hasNoSignedWrap())
23409 break;
23410 [[fallthrough]];
23411 default:
23412 NeedOF = true;
23413 break;
23414 }
23415 break;
23416 }
23417 }
23418 // See if we can use the EFLAGS value from the operand instead of
23419 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
23420 // we prove that the arithmetic won't overflow, we can't use OF or CF.
23421 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
23422 // Emit a CMP with 0, which is the TEST pattern.
23423 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23424 DAG.getConstant(0, dl, Op.getValueType()));
23425 }
23426 unsigned Opcode = 0;
23427 unsigned NumOperands = 0;
23428
23429 SDValue ArithOp = Op;
23430
23431 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
23432 // which may be the result of a CAST. We use the variable 'Op', which is the
23433 // non-casted variable when we check for possible users.
23434 switch (ArithOp.getOpcode()) {
23435 case ISD::AND:
23436 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
23437 // because a TEST instruction will be better.
23438 if (!hasNonFlagsUse(Op))
23439 break;
23440
23441 [[fallthrough]];
23442 case ISD::ADD:
23443 case ISD::SUB:
23444 case ISD::OR:
23445  case ISD::XOR:
23446    if (!isProfitableToUseFlagOp(Op))
23447 break;
23448
23449 // Otherwise use a regular EFLAGS-setting instruction.
23450 switch (ArithOp.getOpcode()) {
23451 // clang-format off
23452 default: llvm_unreachable("unexpected operator!");
23453 case ISD::ADD: Opcode = X86ISD::ADD; break;
23454 case ISD::SUB: Opcode = X86ISD::SUB; break;
23455 case ISD::XOR: Opcode = X86ISD::XOR; break;
23456 case ISD::AND: Opcode = X86ISD::AND; break;
23457 case ISD::OR: Opcode = X86ISD::OR; break;
23458 // clang-format on
23459 }
23460
23461 NumOperands = 2;
23462 break;
23463 case X86ISD::ADD:
23464 case X86ISD::SUB:
23465 case X86ISD::OR:
23466 case X86ISD::XOR:
23467 case X86ISD::AND:
23468 return SDValue(Op.getNode(), 1);
23469 case ISD::SSUBO:
23470 case ISD::USUBO: {
23471 // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
23472 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23473 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
23474 Op->getOperand(1)).getValue(1);
23475 }
23476 default:
23477 break;
23478 }
23479
23480 if (Opcode == 0) {
23481 // Emit a CMP with 0, which is the TEST pattern.
23482 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
23483 DAG.getConstant(0, dl, Op.getValueType()));
23484 }
23485 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23486 SmallVector<SDValue, 4> Ops(Op->ops().take_front(NumOperands));
23487
23488 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
23489 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
23490 return SDValue(New.getNode(), 1);
23491}
23492
23493/// Emit nodes that will be selected as "cmp Op0,Op1", or something
23494/// equivalent.
23495static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
23496 const SDLoc &dl, SelectionDAG &DAG,
23497 const X86Subtarget &Subtarget) {
23498 if (isNullConstant(Op1))
23499 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
23500
23501 EVT CmpVT = Op0.getValueType();
23502
23503 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
23504 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
23505
23506 // Only promote the compare up to I32 if it is a 16 bit operation
23507 // with an immediate. 16 bit immediates are to be avoided unless the target
23508 // isn't slowed down by length changing prefixes, we're optimizing for
23509 // codesize or the comparison is with a folded load.
23510 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
23511      !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
23512      !DAG.getMachineFunction().getFunction().hasMinSize()) {
23513 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
23514 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
23515 // Don't do this if the immediate can fit in 8-bits.
23516 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
23517 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
23518      unsigned ExtendOp =
23519          isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23520 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
23521 // For equality comparisons try to use SIGN_EXTEND if the input was
23522 // truncate from something with enough sign bits.
23523 if (Op0.getOpcode() == ISD::TRUNCATE) {
23524 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
23525 ExtendOp = ISD::SIGN_EXTEND;
23526 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
23527 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
23528 ExtendOp = ISD::SIGN_EXTEND;
23529 }
23530 }
23531
23532 CmpVT = MVT::i32;
23533 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
23534 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
23535 }
23536 }
23537
23538 // Try to shrink i64 compares if the input has enough zero bits.
23539 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
23540 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23541 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
23542 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
23543 CmpVT = MVT::i32;
23544 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23545 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23546 }
23547
23548 // Try to shrink all i64 compares if the inputs are representable as signed
23549 // i32.
23550 if (CmpVT == MVT::i64 &&
23551 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
23552 DAG.ComputeNumSignBits(Op1) > 32 && DAG.ComputeNumSignBits(Op0) > 32) {
23553 CmpVT = MVT::i32;
23554 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
23555 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
23556 }
23557
23558 // 0-x == y --> x+y == 0
23559 // 0-x != y --> x+y != 0
23560 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
23561 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23562 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23563 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
23564 return Add.getValue(1);
23565 }
23566
23567 // x == 0-y --> x+y == 0
23568 // x != 0-y --> x+y != 0
23569 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
23570 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
23571 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23572 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
23573 return Add.getValue(1);
23574 }
23575
23576 // If we already have an XOR of the ops, use that to check for equality.
23577 // Else use SUB instead of CMP to enable CSE between SUB and CMP.
23578 unsigned X86Opc = X86ISD::SUB;
23579 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
23580 (DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op0, Op1}) ||
23581 DAG.doesNodeExist(ISD::XOR, DAG.getVTList({CmpVT}), {Op1, Op0})))
23582 X86Opc = X86ISD::XOR;
23583
23584 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
23585 SDValue CmpOp = DAG.getNode(X86Opc, dl, VTs, Op0, Op1);
23586 return CmpOp.getValue(1);
23587}
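// Illustrative sketch (editorial addition, not from this file): the equality
// rewrites used above hold in two's complement / modular arithmetic:
//   (0 - x) == y  <=>  x + y == 0
#include <cassert>
#include <cstdint>
int main() {
  for (uint32_t X : {0u, 1u, 0x80000000u, 0xFFFFFFFFu})
    for (uint32_t Y : {0u, 1u, 0xFFFFFFFFu})
      assert(((0u - X) == Y) == (X + Y == 0u)); // unsigned wraparound intended
}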
23588
23593
23594bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
23595 SDNode *N, SDValue, SDValue IntPow2) const {
23596 if (N->getOpcode() == ISD::FDIV)
23597 return true;
23598
23599 EVT FPVT = N->getValueType(0);
23600 EVT IntVT = IntPow2.getValueType();
23601
23602 // This indicates a non-free bitcast.
23603 // TODO: This is probably overly conservative as we will need to scale the
23604 // integer vector anyways for the int->fp cast.
23605 if (FPVT.isVector() &&
23606 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
23607 return false;
23608
23609 return true;
23610}
23611
23612/// Check if replacement of SQRT with RSQRT should be disabled.
23613bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
23614 EVT VT = Op.getValueType();
23615
23616 // We don't need to replace SQRT with RSQRT for half type.
23617 if (VT.getScalarType() == MVT::f16)
23618 return true;
23619
23620 // We never want to use both SQRT and RSQRT instructions for the same input.
23621 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
23622 return false;
23623
23624 if (VT.isVector())
23625 return Subtarget.hasFastVectorFSQRT();
23626 return Subtarget.hasFastScalarFSQRT();
23627}
23628
23629/// The minimum architected relative accuracy is 2^-12. We need one
23630/// Newton-Raphson step to have a good float result (24 bits of precision).
23631SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
23632 SelectionDAG &DAG, int Enabled,
23633 int &RefinementSteps,
23634 bool &UseOneConstNR,
23635 bool Reciprocal) const {
23636 SDLoc DL(Op);
23637 EVT VT = Op.getValueType();
23638
23639 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
23640 // It is likely not profitable to do this for f64 because a double-precision
23641 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
23642 // instructions: convert to single, rsqrtss, convert back to double, refine
23643 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
23644 // along with FMA, this could be a throughput win.
23645 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
23646 // after legalize types.
23647 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23648 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
23649 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
23650 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23651 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23652 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23653 RefinementSteps = 1;
23654
23655 UseOneConstNR = false;
23656 // There is no FSQRT for 512-bits, but there is RSQRT14.
23657 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
23658 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
23659 if (RefinementSteps == 0 && !Reciprocal)
23660 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
23661 return Estimate;
23662 }
23663
23664 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23665 Subtarget.hasFP16()) {
23666 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
23667 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23668 RefinementSteps = 0;
23669
23670    if (VT == MVT::f16) {
23671      SDValue Zero = DAG.getIntPtrConstant(0, DL);
23672 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23673 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23674 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
23675 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23676 }
23677
23678 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
23679 }
23680 return SDValue();
23681}
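// Illustrative sketch (editorial addition, not from this file): the single
// Newton-Raphson step that refines a ~2^-12 accurate RSQRTSS estimate to
// roughly 24 bits of precision, matching RefinementSteps = 1 above:
//   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
#include <cmath>
#include <cstdio>
static float refineRsqrt(float A, float X0) {
  return X0 * (1.5f - 0.5f * A * X0 * X0);
}
int main() {
  float A = 2.0f;
  float X0 = 0.70703125f; // stand-in for a ~12-bit rsqrt estimate
  float X1 = refineRsqrt(A, X0);
  std::printf("%g vs %g\n", X1, 1.0 / std::sqrt(2.0));
}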
23682
23683/// The minimum architected relative accuracy is 2^-12. We need one
23684/// Newton-Raphson step to have a good float result (24 bits of precision).
23685SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
23686 int Enabled,
23687 int &RefinementSteps) const {
23688 SDLoc DL(Op);
23689 EVT VT = Op.getValueType();
23690
23691 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
23692 // It is likely not profitable to do this for f64 because a double-precision
23693 // reciprocal estimate with refinement on x86 prior to FMA requires
23694 // 15 instructions: convert to single, rcpss, convert back to double, refine
23695 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
23696 // along with FMA, this could be a throughput win.
23697
23698 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
23699 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
23700 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
23701 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
23702 // Enable estimate codegen with 1 refinement step for vector division.
23703 // Scalar division estimates are disabled because they break too much
23704 // real-world code. These defaults are intended to match GCC behavior.
23705 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
23706 return SDValue();
23707
23708 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23709 RefinementSteps = 1;
23710
23711 // There is no FSQRT for 512-bits, but there is RCP14.
23712 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
23713 return DAG.getNode(Opcode, DL, VT, Op);
23714 }
23715
23716 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
23717 Subtarget.hasFP16()) {
23718 if (RefinementSteps == ReciprocalEstimate::Unspecified)
23719 RefinementSteps = 0;
23720
23721    if (VT == MVT::f16) {
23722      SDValue Zero = DAG.getIntPtrConstant(0, DL);
23723 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
23724 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
23725 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
23726 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
23727 }
23728
23729 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
23730 }
23731 return SDValue();
23732}
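// Illustrative sketch (editorial addition, not from this file): the matching
// Newton-Raphson step for a reciprocal estimate (RCPSS/RCP14), used when one
// refinement step is requested:
//   x1 = x0 * (2 - a * x0)
static float refineRecip(float A, float X0) {
  return X0 * (2.0f - A * X0);
}
int main() {
  float A = 3.0f, X0 = 0.333f;   // stand-in for a coarse estimate of 1/3
  float X1 = refineRecip(A, X0); // noticeably closer to 1/3
  return (X1 > 0.3332f && X1 < 0.3334f) ? 0 : 1;
}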
23733
23734/// If we have at least two divisions that use the same divisor, convert to
23735/// multiplication by a reciprocal. This may need to be adjusted for a given
23736/// CPU if a division's cost is not at least twice the cost of a multiplication.
23737/// This is because we still need one division to calculate the reciprocal and
23738/// then we need two multiplies by that reciprocal as replacements for the
23739/// original divisions.
23740unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
23741 return 2;
23742}
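// Illustrative sketch (editorial addition, not from this file): the rewrite
// that the threshold of 2 above enables. With two or more divisions by the
// same divisor, one real division plus two multiplies replaces two divisions
// (valid under fast-math style reassociation; it can change rounding).
static void divideBoth(float A, float B, float D, float &QA, float &QB) {
  float Recip = 1.0f / D; // single division, reused below
  QA = A * Recip;
  QB = B * Recip;
}
int main() {
  float QA, QB;
  divideBoth(1.0f, 2.0f, 4.0f, QA, QB); // exact here: QA = 0.25, QB = 0.5
  return (QA == 0.25f && QB == 0.5f) ? 0 : 1;
}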
23743
23744SDValue
23745X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
23746 SelectionDAG &DAG,
23747 SmallVectorImpl<SDNode *> &Created) const {
23748 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
23749 if (isIntDivCheap(N->getValueType(0), Attr))
23750 return SDValue(N,0); // Lower SDIV as SDIV
23751
23752 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
23753 "Unexpected divisor!");
23754
23755 // Only perform this transform if CMOV is supported otherwise the select
23756 // below will become a branch.
23757 if (!Subtarget.canUseCMOV())
23758 return SDValue();
23759
23760 // fold (sdiv X, pow2)
23761 EVT VT = N->getValueType(0);
23762 // FIXME: Support i8.
23763 if (VT != MVT::i16 && VT != MVT::i32 &&
23764 !(Subtarget.is64Bit() && VT == MVT::i64))
23765 return SDValue();
23766
23767 // If the divisor is 2 or -2, the default expansion is better.
23768 if (Divisor == 2 ||
23769 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
23770 return SDValue();
23771
23772 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
23773}
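// Illustrative sketch (editorial addition, not from this file): the standard
// rounding fixup that a CMOV-based expansion such as buildSDIVPow2WithCMov
// relies on. C/C++ signed division truncates toward zero, so a negative
// dividend needs a (2^k - 1) bias before the arithmetic shift; the select of
// the biased value is what becomes a CMOV.
#include <cassert>
static int sdivPow2(int X, unsigned K) {
  int Bias = (1 << K) - 1;
  int T = X < 0 ? X + Bias : X; // lowered as a CMOV rather than a branch
  return T >> K;                // arithmetic shift (guaranteed since C++20)
}
int main() {
  for (int X : {-9, -8, -1, 0, 1, 7, 8, 9})
    assert(sdivPow2(X, 3) == X / 8);
}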
23774
23775/// Result of 'and' is compared against zero. Change to a BT node if possible.
23776/// Returns the BT node and the condition code needed to use it.
23777static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
23778 SelectionDAG &DAG, X86::CondCode &X86CC) {
23779 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
23780 SDValue Op0 = And.getOperand(0);
23781 SDValue Op1 = And.getOperand(1);
23782 if (Op0.getOpcode() == ISD::TRUNCATE)
23783 Op0 = Op0.getOperand(0);
23784 if (Op1.getOpcode() == ISD::TRUNCATE)
23785 Op1 = Op1.getOperand(0);
23786
23787 SDValue Src, BitNo;
23788 if (Op1.getOpcode() == ISD::SHL)
23789 std::swap(Op0, Op1);
23790 if (Op0.getOpcode() == ISD::SHL) {
23791 if (isOneConstant(Op0.getOperand(0))) {
23792 // If we looked past a truncate, check that it's only truncating away
23793 // known zeros.
23794 unsigned BitWidth = Op0.getValueSizeInBits();
23795 unsigned AndBitWidth = And.getValueSizeInBits();
23796 if (BitWidth > AndBitWidth) {
23797 KnownBits Known = DAG.computeKnownBits(Op0);
23798 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
23799 return SDValue();
23800 }
23801 Src = Op1;
23802 BitNo = Op0.getOperand(1);
23803 }
23804 } else if (Op1.getOpcode() == ISD::Constant) {
23805 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
23806 uint64_t AndRHSVal = AndRHS->getZExtValue();
23807 SDValue AndLHS = Op0;
23808
23809 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
23810 Src = AndLHS.getOperand(0);
23811 BitNo = AndLHS.getOperand(1);
23812 } else {
23813 // Use BT if the immediate can't be encoded in a TEST instruction or we
23814      // are optimizing for size and the immediate won't fit in a byte.
23815 bool OptForSize = DAG.shouldOptForSize();
23816 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
23817 isPowerOf2_64(AndRHSVal)) {
23818 Src = AndLHS;
23819 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
23820 Src.getValueType());
23821 }
23822 }
23823 }
23824
23825 // No patterns found, give up.
23826 if (!Src.getNode())
23827 return SDValue();
23828
23829 // Remove any bit flip.
23830 if (isBitwiseNot(Src)) {
23831 Src = Src.getOperand(0);
23832 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23833 }
23834
23835 // Attempt to create the X86ISD::BT node.
23836 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23837 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23838 return BT;
23839 }
23840
23841 return SDValue();
23842}
23843
23844// Check if pre-AVX condcode can be performed by a single FCMP op.
23845static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23846 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23847}
23848
23849/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23850/// CMPs.
23851static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23852 SDValue &Op1, bool &IsAlwaysSignaling) {
23853 unsigned SSECC;
23854 bool Swap = false;
23855
23856 // SSE Condition code mapping:
23857 // 0 - EQ
23858 // 1 - LT
23859 // 2 - LE
23860 // 3 - UNORD
23861 // 4 - NEQ
23862 // 5 - NLT
23863 // 6 - NLE
23864 // 7 - ORD
23865 switch (SetCCOpcode) {
23866 // clang-format off
23867 default: llvm_unreachable("Unexpected SETCC condition");
23868 case ISD::SETOEQ:
23869 case ISD::SETEQ: SSECC = 0; break;
23870 case ISD::SETOGT:
23871 case ISD::SETGT: Swap = true; [[fallthrough]];
23872 case ISD::SETLT:
23873 case ISD::SETOLT: SSECC = 1; break;
23874 case ISD::SETOGE:
23875 case ISD::SETGE: Swap = true; [[fallthrough]];
23876 case ISD::SETLE:
23877 case ISD::SETOLE: SSECC = 2; break;
23878 case ISD::SETUO: SSECC = 3; break;
23879 case ISD::SETUNE:
23880 case ISD::SETNE: SSECC = 4; break;
23881 case ISD::SETULE: Swap = true; [[fallthrough]];
23882 case ISD::SETUGE: SSECC = 5; break;
23883 case ISD::SETULT: Swap = true; [[fallthrough]];
23884 case ISD::SETUGT: SSECC = 6; break;
23885 case ISD::SETO: SSECC = 7; break;
23886 case ISD::SETUEQ: SSECC = 8; break;
23887 case ISD::SETONE: SSECC = 12; break;
23888 // clang-format on
23889 }
23890 if (Swap)
23891 std::swap(Op0, Op1);
23892
23893 switch (SetCCOpcode) {
23894 default:
23895 IsAlwaysSignaling = true;
23896 break;
23897 case ISD::SETEQ:
23898 case ISD::SETOEQ:
23899 case ISD::SETUEQ:
23900 case ISD::SETNE:
23901 case ISD::SETONE:
23902 case ISD::SETUNE:
23903 case ISD::SETO:
23904 case ISD::SETUO:
23905 IsAlwaysSignaling = false;
23906 break;
23907 }
23908
23909 return SSECC;
23910}
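// Illustrative sketch (editorial addition, not from this file): why SETUEQ and
// SETONE are the expensive pre-AVX cases (SSECC 8 and 12 have no single-op
// encoding there). They decompose into two predicates joined by a logic op,
// exactly as the lowering below emits with FOR/FAND.
#include <cassert>
#include <cmath>
static bool ueq(float A, float B) { // UNORD OR EQ
  return std::isunordered(A, B) || A == B;
}
static bool one(float A, float B) { // ORD AND NEQ
  return !std::isunordered(A, B) && A != B;
}
int main() {
  float NaN = std::nanf("");
  assert(ueq(NaN, 1.0f) && ueq(2.0f, 2.0f) && !ueq(1.0f, 2.0f));
  assert(one(1.0f, 2.0f) && !one(NaN, 1.0f) && !one(2.0f, 2.0f));
}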
23911
23912/// Break a VSETCC 256/512-bit vector into two new 128/256 ones and then
23913/// concatenate the result back.
23914static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond,
23915 SelectionDAG &DAG, const SDLoc &dl) {
23916 assert(VT.isInteger() && LHS.getValueType() == RHS.getValueType() &&
23917 "Unsupported VTs!");
23918 SDValue CC = DAG.getCondCode(Cond);
23919
23920 // Extract the LHS Lo/Hi vectors
23921 SDValue LHS1, LHS2;
23922 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23923
23924 // Extract the RHS Lo/Hi vectors
23925 SDValue RHS1, RHS2;
23926 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23927
23928 // Issue the operation on the smaller types and concatenate the result back
23929 EVT LoVT, HiVT;
23930 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23931 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23932 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23933 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23934}
23935
23937 SelectionDAG &DAG) {
23938 SDValue Op0 = Op.getOperand(0);
23939 SDValue Op1 = Op.getOperand(1);
23940 SDValue CC = Op.getOperand(2);
23941 MVT VT = Op.getSimpleValueType();
23942 assert(VT.getVectorElementType() == MVT::i1 &&
23943 "Cannot set masked compare for this operation");
23944
23945 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23946
23947 // Prefer SETGT over SETLT.
23948 if (SetCCOpcode == ISD::SETLT) {
23949 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23950 std::swap(Op0, Op1);
23951 }
23952
23953 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23954}
23955
23956/// Given a buildvector constant, return a new vector constant with each element
23957/// incremented or decremented. If incrementing or decrementing would result in
23958/// unsigned overflow or underflow or this is not a simple vector constant,
23959/// return an empty value.
23960static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23961 bool NSW) {
23962 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23963 if (!BV || !V.getValueType().isSimple())
23964 return SDValue();
23965
23966 MVT VT = V.getSimpleValueType();
23967 MVT EltVT = VT.getVectorElementType();
23968  unsigned NumElts = VT.getVectorNumElements();
23969  SmallVector<SDValue, 8> NewVecC;
23970 SDLoc DL(V);
23971 for (unsigned i = 0; i < NumElts; ++i) {
23972 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23973 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23974 return SDValue();
23975
23976 // Avoid overflow/underflow.
23977 const APInt &EltC = Elt->getAPIntValue();
23978 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23979 return SDValue();
23980 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23981 (!IsInc && EltC.isMinSignedValue())))
23982 return SDValue();
23983
23984 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23985 }
23986
23987 return DAG.getBuildVector(VT, DL, NewVecC);
23988}
23989
23990/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23991/// Op0 u<= Op1:
23992/// t = psubus Op0, Op1
23993/// pcmpeq t, <0..0>
23995 ISD::CondCode Cond, const SDLoc &dl,
23996 const X86Subtarget &Subtarget,
23997 SelectionDAG &DAG) {
23998 if (!Subtarget.hasSSE2())
23999 return SDValue();
24000
24001 MVT VET = VT.getVectorElementType();
24002 if (VET != MVT::i8 && VET != MVT::i16)
24003 return SDValue();
24004
24005 switch (Cond) {
24006 default:
24007 return SDValue();
24008 case ISD::SETULT: {
24009 // If the comparison is against a constant we can turn this into a
24010 // setule. With psubus, setule does not require a swap. This is
24011 // beneficial because the constant in the register is no longer
24012 // destructed as the destination so it can be hoisted out of a loop.
24013 // Only do this pre-AVX since vpcmp* is no longer destructive.
24014 if (Subtarget.hasAVX())
24015 return SDValue();
24016 SDValue ULEOp1 =
24017 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
24018 if (!ULEOp1)
24019 return SDValue();
24020 Op1 = ULEOp1;
24021 break;
24022 }
24023 case ISD::SETUGT: {
24024 // If the comparison is against a constant, we can turn this into a setuge.
24025 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24026 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24027 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24028 SDValue UGEOp1 =
24029 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
24030 if (!UGEOp1)
24031 return SDValue();
24032 Op1 = Op0;
24033 Op0 = UGEOp1;
24034 break;
24035 }
24036 // Psubus is better than flip-sign because it requires no inversion.
24037 case ISD::SETUGE:
24038 std::swap(Op0, Op1);
24039 break;
24040 case ISD::SETULE:
24041 break;
24042 }
24043
24044 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24045 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24046 DAG.getConstant(0, dl, VT));
24047}
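// Illustrative sketch (editorial addition, not from this file): the scalar
// identity behind the PSUBUS lowering above. Unsigned saturating subtraction
// is zero exactly when a <= b, and "a < C" can be rewritten as "a <= C - 1"
// using the decremented constant produced by incDecVectorConstant.
#include <cassert>
#include <cstdint>
static uint8_t usubsat(uint8_t A, uint8_t B) { return A > B ? A - B : 0; }
int main() {
  for (int A = 0; A < 256; ++A)
    for (int B = 0; B < 256; ++B)
      assert((usubsat(A, B) == 0) == (A <= B));
}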
24048
24049static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24050 SelectionDAG &DAG) {
24051 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24052 Op.getOpcode() == ISD::STRICT_FSETCCS;
24053 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24054 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24055 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24056  MVT VT = Op->getSimpleValueType(0);
24057  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24058 MVT OpVT = Op0.getSimpleValueType();
24059 SDLoc dl(Op);
24060
24061 if (OpVT.isFloatingPoint()) {
24062 MVT EltVT = OpVT.getVectorElementType();
24063 assert(EltVT == MVT::bf16 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
24064 EltVT == MVT::f64);
24065
24066 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24067 if (isSoftF16(EltVT, Subtarget)) {
24068 if (Subtarget.hasAVX512() && !Subtarget.hasVLX())
24069 return SDValue();
24070
24071 // Break 256-bit FP vector compare into smaller ones.
24072 if (OpVT.is256BitVector() && !Subtarget.useAVX512Regs())
24073 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24074
24075 // Break 512-bit FP vector compare into smaller ones.
24076 if (OpVT.is512BitVector())
24077 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24078
24079 MVT NVT = OpVT.changeVectorElementType(MVT::f32);
24080 if (IsStrict) {
24081 Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24082 {Chain, Op0});
24083 Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other},
24084 {Chain, Op1});
24085 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
24086 {Chain, Op0, Op1, CC});
24087 }
24088 MVT DVT = VT.getVectorElementType() == MVT::i16
24089 ? VT.changeVectorElementType(MVT::i32)
24090 : VT;
24091 SDValue Cmp = DAG.getNode(Op.getOpcode(), dl, DVT,
24092 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op0),
24093 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op1), CC);
24094 return DVT == VT ? Cmp : DAG.getNode(ISD::TRUNCATE, dl, VT, Cmp);
24095 }
24096
24097 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24098
24099 // If we have a strict compare with a vXi1 result and the input is 128/256
24100 // bits we can't use a masked compare unless we have VLX. If we use a wider
24101 // compare like we do for non-strict, we might trigger spurious exceptions
24102 // from the upper elements. Instead emit a AVX compare and convert to mask.
24103 unsigned Opc;
24104 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24105      (!IsStrict || Subtarget.hasVLX() ||
24106       Op0.getSimpleValueType().is512BitVector())) {
24107#ifndef NDEBUG
24108 unsigned Num = VT.getVectorNumElements();
24109 assert(Num <= 16 ||
24110 (Num == 32 && (EltVT == MVT::f16 || EltVT == MVT::bf16)));
24111#endif
24112 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24113 } else {
24114 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24115 // The SSE/AVX packed FP comparison nodes are defined with a
24116 // floating-point vector result that matches the operand type. This allows
24117 // them to work with an SSE1 target (integer vector types are not legal).
24118 VT = Op0.getSimpleValueType();
24119 }
24120
24121 SDValue Cmp;
24122 bool IsAlwaysSignaling;
24123 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24124 if (!Subtarget.hasAVX()) {
24125 // TODO: We could use following steps to handle a quiet compare with
24126 // signaling encodings.
24127 // 1. Get ordered masks from a quiet ISD::SETO
24128 // 2. Use the masks to mask potential unordered elements in operand A, B
24129 // 3. Get the compare results of masked A, B
24130 // 4. Calculating final result using the mask and result from 3
24131 // But currently, we just fall back to scalar operations.
24132 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24133 return SDValue();
24134
24135 // Insert an extra signaling instruction to raise exception.
24136 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24137 SDValue SignalCmp = DAG.getNode(
24138 Opc, dl, {VT, MVT::Other},
24139 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24140 // FIXME: It seems we need to update the flags of all new strict nodes.
24141 // Otherwise, mayRaiseFPException in MI will return false due to
24142 // NoFPExcept = false by default. However, I didn't find it in other
24143 // patches.
24144 SignalCmp->setFlags(Op->getFlags());
24145 Chain = SignalCmp.getValue(1);
24146 }
24147
24148 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24149 // emit two comparisons and a logic op to tie them together.
24150 if (!cheapX86FSETCC_SSE(Cond)) {
24151 // LLVM predicate is SETUEQ or SETONE.
24152 unsigned CC0, CC1;
24153 unsigned CombineOpc;
24154 if (Cond == ISD::SETUEQ) {
24155 CC0 = 3; // UNORD
24156 CC1 = 0; // EQ
24157 CombineOpc = X86ISD::FOR;
24158      } else {
24159        assert(Cond == ISD::SETONE);
24160 CC0 = 7; // ORD
24161 CC1 = 4; // NEQ
24162 CombineOpc = X86ISD::FAND;
24163 }
24164
24165 SDValue Cmp0, Cmp1;
24166 if (IsStrict) {
24167 Cmp0 = DAG.getNode(
24168 Opc, dl, {VT, MVT::Other},
24169 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24170 Cmp1 = DAG.getNode(
24171 Opc, dl, {VT, MVT::Other},
24172 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24173 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24174 Cmp1.getValue(1));
24175 } else {
24176 Cmp0 = DAG.getNode(
24177 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24178 Cmp1 = DAG.getNode(
24179 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24180 }
24181 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24182 } else {
24183 if (IsStrict) {
24184 Cmp = DAG.getNode(
24185 Opc, dl, {VT, MVT::Other},
24186 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24187 Chain = Cmp.getValue(1);
24188 } else
24189 Cmp = DAG.getNode(
24190 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24191 }
24192 } else {
24193 // Handle all other FP comparisons here.
24194 if (IsStrict) {
24195 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24196 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24197 Cmp = DAG.getNode(
24198 Opc, dl, {VT, MVT::Other},
24199 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24200 Chain = Cmp.getValue(1);
24201 } else
24202 Cmp = DAG.getNode(
24203 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24204 }
24205
24206 if (VT.getFixedSizeInBits() >
24207 Op.getSimpleValueType().getFixedSizeInBits()) {
24208 // We emitted a compare with an XMM/YMM result. Finish converting to a
24209    // mask register using a vptestm.
24210    EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24211 Cmp = DAG.getBitcast(CastVT, Cmp);
24212 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24213 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24214 } else {
24215 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24216 // the result type of SETCC. The bitcast is expected to be optimized
24217 // away during combining/isel.
24218 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24219 }
24220
24221 if (IsStrict)
24222 return DAG.getMergeValues({Cmp, Chain}, dl);
24223
24224 return Cmp;
24225 }
24226
24227 assert(!IsStrict && "Strict SETCC only handles FP operands.");
24228
24229 [[maybe_unused]] MVT VTOp0 = Op0.getSimpleValueType();
24230 assert(VTOp0 == Op1.getSimpleValueType() &&
24231 "Expected operands with same type!");
24232   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24233          "Invalid number of packed elements for source and destination!");
24234
24235 // The non-AVX512 code below works under the assumption that source and
24236 // destination types are the same.
24237 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24238 "Value types for source and destination must be the same!");
24239
24240 // The result is boolean, but operands are int/float
24241 if (VT.getVectorElementType() == MVT::i1) {
24242 // In AVX-512 architecture setcc returns mask with i1 elements,
24243 // But there is no compare instruction for i8 and i16 elements in KNL.
24244 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24245 "Unexpected operand type");
24246 return LowerIntVSETCC_AVX512(Op, dl, DAG);
24247 }
24248
24249 // Lower using XOP integer comparisons.
24250 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24251 // Translate compare code to XOP PCOM compare mode.
24252 unsigned CmpMode = 0;
24253 switch (Cond) {
24254 // clang-format off
24255 default: llvm_unreachable("Unexpected SETCC condition");
24256 case ISD::SETULT:
24257 case ISD::SETLT: CmpMode = 0x00; break;
24258 case ISD::SETULE:
24259 case ISD::SETLE: CmpMode = 0x01; break;
24260 case ISD::SETUGT:
24261 case ISD::SETGT: CmpMode = 0x02; break;
24262 case ISD::SETUGE:
24263 case ISD::SETGE: CmpMode = 0x03; break;
24264 case ISD::SETEQ: CmpMode = 0x04; break;
24265 case ISD::SETNE: CmpMode = 0x05; break;
24266 // clang-format on
24267 }
24268
24269 // Are we comparing unsigned or signed integers?
24270 unsigned Opc =
24271         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24272 
24273 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24274 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24275 }
24276
24277 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24278 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24279   if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24280     SDValue BC0 = peekThroughBitcasts(Op0);
24281     if (BC0.getOpcode() == ISD::AND &&
24282         isConstantPowerOf2(BC0.getOperand(1), VT.getScalarSizeInBits(),
24283                            /*AllowUndefs=*/false)) {
24284 Cond = ISD::SETEQ;
24285 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24286 }
24287 }
24288
24289 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
24290 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24291 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24292     auto *C1 = isConstOrConstSplat(Op1);
24293     if (C1 && C1->getAPIntValue().isPowerOf2()) {
24294 unsigned BitWidth = VT.getScalarSizeInBits();
24295 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24296
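      // Shift the tested bit into the sign position, then arithmetic-shift it
      // back down so it splats across the element: all-ones if the bit was
      // set, zero otherwise.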
24297 SDValue Result = Op0.getOperand(0);
24298 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24299 DAG.getConstant(ShiftAmt, dl, VT));
24300 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24301 DAG.getConstant(BitWidth - 1, dl, VT));
24302 return Result;
24303 }
24304 }
24305
24306 // Break 256-bit integer vector compare into smaller ones.
24307 if (VT.is256BitVector() && !Subtarget.hasInt256())
24308 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24309
24310 // Break 512-bit integer vector compare into smaller ones.
24311 // TODO: Try harder to use VPCMPx + VPMOV2x?
24312 if (VT.is512BitVector())
24313 return splitVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24314
24315 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24316 // not-of-PCMPEQ:
24317 // X != INT_MIN --> X >s INT_MIN
24318 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24319 // +X != 0 --> +X >s 0
24320 APInt ConstValue;
24321 if (Cond == ISD::SETNE &&
24322 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24323 if (ConstValue.isMinSignedValue())
24324 Cond = ISD::SETGT;
24325 else if (ConstValue.isMaxSignedValue())
24326 Cond = ISD::SETLT;
24327 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24328 Cond = ISD::SETGT;
24329 }
24330
24331 // If both operands are known non-negative, then an unsigned compare is the
24332 // same as a signed compare and there's no need to flip signbits.
24333 // TODO: We could check for more general simplifications here since we're
24334 // computing known bits.
24335 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24336 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24337
24338 // Special case: Use min/max operations for unsigned compares.
24339 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24340   if (ISD::isUnsignedIntSetCC(Cond) &&
24341       (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24342 TLI.isOperationLegal(ISD::UMIN, VT)) {
24343 // If we have a constant operand, increment/decrement it and change the
24344 // condition to avoid an invert.
24345 if (Cond == ISD::SETUGT) {
24346 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24347 if (SDValue UGTOp1 =
24348 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
24349 Op1 = UGTOp1;
24350 Cond = ISD::SETUGE;
24351 }
24352 }
24353 if (Cond == ISD::SETULT) {
24354 // X < C --> X <= (C-1) --> X == umin(X, C-1)
24355 if (SDValue ULTOp1 =
24356 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
24357 Op1 = ULTOp1;
24358 Cond = ISD::SETULE;
24359 }
24360 }
24361 bool Invert = false;
24362 unsigned Opc;
24363 switch (Cond) {
24364 // clang-format off
24365 default: llvm_unreachable("Unexpected condition code");
24366 case ISD::SETUGT: Invert = true; [[fallthrough]];
24367 case ISD::SETULE: Opc = ISD::UMIN; break;
24368 case ISD::SETULT: Invert = true; [[fallthrough]];
24369 case ISD::SETUGE: Opc = ISD::UMAX; break;
24370 // clang-format on
24371 }
24372
24373 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24374 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
24375
24376 // If the logical-not of the result is required, perform that now.
24377 if (Invert)
24378 Result = DAG.getNOT(dl, Result, VT);
24379
24380 return Result;
24381 }
24382
24383 // Try to use SUBUS and PCMPEQ.
24384 if (FlipSigns)
24385 if (SDValue V =
24386 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
24387 return V;
24388
24389 // We are handling one of the integer comparisons here. Since SSE only has
24390 // GT and EQ comparisons for integer, swapping operands and multiple
24391 // operations may be required for some comparisons.
24392 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
24393                                                             : X86ISD::PCMPGT;
24394   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
24395               Cond == ISD::SETGE || Cond == ISD::SETUGE;
24396   bool Invert = Cond == ISD::SETNE ||
24397                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
24398 
24399 if (Swap)
24400 std::swap(Op0, Op1);
24401
24402 // Check that the operation in question is available (most are plain SSE2,
24403 // but PCMPGTQ and PCMPEQQ have different requirements).
24404 if (VT == MVT::v2i64) {
24405 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
24406 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
24407
24408 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
24409 // the odd elements over the even elements.
24410 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
24411 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
24412 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24413
24414 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24415 static const int MaskHi[] = { 1, 1, 3, 3 };
24416 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24417
24418 return DAG.getBitcast(VT, Result);
24419 }
24420
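        // Likewise, X > -1 just tests that each i64 sign bit is clear, so it
        // is enough to compare the high (odd) i32 halves against -1 and splat
        // that result over both halves of each element.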
24421 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
24422 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24423 Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
24424
24425 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24426 static const int MaskHi[] = { 1, 1, 3, 3 };
24427 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24428
24429 return DAG.getBitcast(VT, Result);
24430 }
24431
24432 // If the i64 elements are sign-extended enough to be representable as i32
24433 // then we can compare the lower i32 bits and splat.
24434 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
24435 DAG.ComputeNumSignBits(Op1) > 32) {
24436 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24437 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24438
24439 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24440 static const int MaskLo[] = {0, 0, 2, 2};
24441 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24442
24443 return DAG.getBitcast(VT, Result);
24444 }
24445
24446 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24447 // bits of the inputs before performing those operations. The lower
24448 // compare is always unsigned.
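      // 0x80000000 in the low i32 lane flips the sign of the low halves so the
      // signed PCMPGT below acts as an unsigned compare on them; for unsigned
      // i64 compares (FlipSigns) the high halves get the same treatment.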
24449 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
24450 : 0x0000000080000000ULL,
24451 dl, MVT::v2i64);
24452
24453 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
24454 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
24455
24456 // Cast everything to the right type.
24457 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24458 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24459
24460 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
24461 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
24462 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
24463
24464 // Create masks for only the low parts/high parts of the 64 bit integers.
24465 static const int MaskHi[] = { 1, 1, 3, 3 };
24466 static const int MaskLo[] = { 0, 0, 2, 2 };
24467 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
24468 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
24469 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
24470
24471 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
24472 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
24473
24474 if (Invert)
24475 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24476
24477 return DAG.getBitcast(VT, Result);
24478 }
24479
24480 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
24481 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
24482 // pcmpeqd + pshufd + pand.
24483 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
24484
24485 // First cast everything to the right type.
24486 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
24487 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
24488
24489 // Do the compare.
24490 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
24491
24492 // Make sure the lower and upper halves are both all-ones.
24493 static const int Mask[] = { 1, 0, 3, 2 };
24494 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
24495 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
24496
24497 if (Invert)
24498 Result = DAG.getNOT(dl, Result, MVT::v4i32);
24499
24500 return DAG.getBitcast(VT, Result);
24501 }
24502 }
24503
24504 // Since SSE has no unsigned integer comparisons, we need to flip the sign
24505 // bits of the inputs before performing those operations.
24506 if (FlipSigns) {
24507 MVT EltVT = VT.getVectorElementType();
24508     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
24509                                  VT);
24510 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
24511 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
24512 }
24513
24514 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
24515
24516 // If the logical-not of the result is required, perform that now.
24517 if (Invert)
24518 Result = DAG.getNOT(dl, Result, VT);
24519
24520 return Result;
24521}
24522
24523// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
24524 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
24525                               const SDLoc &dl, SelectionDAG &DAG,
24526 const X86Subtarget &Subtarget,
24527 SDValue &X86CC) {
24528 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24529
24530 // Must be a bitcast from vXi1.
24531 if (Op0.getOpcode() != ISD::BITCAST)
24532 return SDValue();
24533
24534 Op0 = Op0.getOperand(0);
24535 MVT VT = Op0.getSimpleValueType();
24536 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
24537 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
24538 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
24539 return SDValue();
24540
24541 X86::CondCode X86Cond;
24542 if (isNullConstant(Op1)) {
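    // KORTEST/KTEST set ZF when the tested mask is all zeros.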
24543 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24544 } else if (isAllOnesConstant(Op1)) {
24545 // C flag is set for all ones.
24546 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
24547 } else
24548 return SDValue();
24549
24550   // If the input is an AND, we can combine its operands into the KTEST.
24551 bool KTestable = false;
24552 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
24553 KTestable = true;
24554 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
24555 KTestable = true;
24556 if (!isNullConstant(Op1))
24557 KTestable = false;
24558 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
24559 SDValue LHS = Op0.getOperand(0);
24560 SDValue RHS = Op0.getOperand(1);
24561 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24562 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
24563 }
24564
24565   // If the input is an OR, we can combine its operands into the KORTEST.
24566 SDValue LHS = Op0;
24567 SDValue RHS = Op0;
24568 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
24569 LHS = Op0.getOperand(0);
24570 RHS = Op0.getOperand(1);
24571 }
24572
24573 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24574 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
24575}
24576
24577/// Emit flags for the given setcc condition and operands. Also returns the
24578/// corresponding X86 condition code constant in X86CC.
24579SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
24580 ISD::CondCode CC, const SDLoc &dl,
24581 SelectionDAG &DAG,
24582 SDValue &X86CC) const {
24583 // Equality Combines.
24584 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
24585 X86::CondCode X86CondCode;
24586
24587 // Optimize to BT if possible.
24588 // Lower (X & (1 << N)) == 0 to BT(X, N).
24589 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
24590 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
24591 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
24592 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
24593 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24594 return BT;
24595 }
24596 }
24597
24598 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
24599 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
24600 X86CondCode)) {
24601 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24602 return CmpZ;
24603 }
24604
24605 // Try to lower using KORTEST or KTEST.
24606 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
24607 return Test;
24608
24609 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
24610 // of these.
24611 if (isOneConstant(Op1) || isNullConstant(Op1)) {
24612 // If the input is a setcc, then reuse the input setcc or use a new one
24613 // with the inverted condition.
24614 if (Op0.getOpcode() == X86ISD::SETCC) {
24615 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
24616
24617 X86CC = Op0.getOperand(0);
24618 if (Invert) {
24619 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
24620 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
24621 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24622 }
24623
24624 return Op0.getOperand(1);
24625 }
24626 }
24627
24628 // Look for X == INT_MIN or X != INT_MIN. We can use NEG and test for
24629 // overflow.
24630 if (isMinSignedConstant(Op1)) {
24631 EVT VT = Op0.getValueType();
24632 if (VT == MVT::i32 || VT == MVT::i64 || Op0->hasOneUse()) {
24633 SDVTList CmpVTs = DAG.getVTList(VT, MVT::i32);
24634       X86::CondCode CondCode = CC == ISD::SETEQ ? X86::COND_O : X86::COND_NO;
24635       X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24636 SDValue Neg = DAG.getNode(X86ISD::SUB, dl, CmpVTs,
24637 DAG.getConstant(0, dl, VT), Op0);
24638 return SDValue(Neg.getNode(), 1);
24639 }
24640 }
24641
24642     // Try to use the carry flag from the add in place of a separate CMP for:
24643 // (seteq (add X, -1), -1). Similar for setne.
24644 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
24645 Op0.getOperand(1) == Op1) {
24646 if (isProfitableToUseFlagOp(Op0)) {
24647 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
24648
24649 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
24650 Op0.getOperand(1));
24651 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
24652 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24653 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
24654 return SDValue(New.getNode(), 1);
24655 }
24656 }
24657 }
24658
24659   X86::CondCode CondCode =
24660       TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
24661 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
24662
24663 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
24664 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24665 return EFLAGS;
24666}
24667
24668SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
24669
24670 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24671 Op.getOpcode() == ISD::STRICT_FSETCCS;
24672 MVT VT = Op->getSimpleValueType(0);
24673
24674 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
24675
24676 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
24677 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24678 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24679 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24680 SDLoc dl(Op);
24681 ISD::CondCode CC =
24682 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
24683
24684 if (isSoftF16(Op0.getValueType(), Subtarget))
24685 return SDValue();
24686
24687 // Handle f128 first, since one possible outcome is a normal integer
24688 // comparison which gets handled by emitFlagsForSetcc.
24689 if (Op0.getValueType() == MVT::f128) {
24690 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
24691 Op.getOpcode() == ISD::STRICT_FSETCCS);
24692
24693 // If softenSetCCOperands returned a scalar, use it.
24694 if (!Op1.getNode()) {
24695 assert(Op0.getValueType() == Op.getValueType() &&
24696 "Unexpected setcc expansion!");
24697 if (IsStrict)
24698 return DAG.getMergeValues({Op0, Chain}, dl);
24699 return Op0;
24700 }
24701 }
24702
24703 if (Op0.getSimpleValueType().isInteger()) {
24704 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
24705     // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF),
24706 // this may translate to less uops depending on uarch implementation. The
24707 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
24708 // canonicalize to that CondCode.
24709 // NOTE: Only do this if incrementing the constant doesn't increase the bit
24710 // encoding size - so it must either already be a i8 or i32 immediate, or it
24711     // encoding size - so it must either already be an i8 or i32 immediate, or it
24712 // constant materializations.
24713 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
24714 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
24715 const APInt &Op1Val = Op1C->getAPIntValue();
24716 if (!Op1Val.isZero()) {
24717 // Ensure the constant+1 doesn't overflow.
24718 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
24719 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
24720 APInt Op1ValPlusOne = Op1Val + 1;
24721 if (Op1ValPlusOne.isSignedIntN(32) &&
24722 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
24723 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
24724             CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
24725                                             : ISD::CondCode::SETUGE;
24726           }
24727 }
24728 }
24729 }
24730
24731 SDValue X86CC;
24732 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
24733 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24734 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24735 }
24736
24737 if (Subtarget.hasAVX10_2()) {
24738 if (CC == ISD::SETOEQ || CC == ISD::SETUNE) {
24739 auto NewCC = (CC == ISD::SETOEQ) ? X86::COND_E : (X86::COND_NE);
24740 assert(Op0.getSimpleValueType() != MVT::bf16 && "Unsupported Type");
24741 if (Op0.getSimpleValueType() != MVT::f80) {
24742 SDValue Res = getSETCC(
24743 NewCC, DAG.getNode(X86ISD::UCOMX, dl, MVT::i32, Op0, Op1), dl, DAG);
24744 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24745 }
24746 }
24747 }
24748 // Handle floating point.
24749 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
24750 if (CondCode == X86::COND_INVALID)
24751 return SDValue();
24752
24753 SDValue EFLAGS;
24754 if (IsStrict) {
24755 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24756 EFLAGS =
24757         DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
24758                     dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
24759 Chain = EFLAGS.getValue(1);
24760 } else {
24761 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
24762 }
24763
24764 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
24765 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
24766 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
24767}
24768
24769SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
24770 SDValue LHS = Op.getOperand(0);
24771 SDValue RHS = Op.getOperand(1);
24772 SDValue Carry = Op.getOperand(2);
24773 SDValue Cond = Op.getOperand(3);
24774 SDLoc DL(Op);
24775
24776 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
24777   X86::CondCode CC = (X86::CondCode)Cond->getAsZExtVal();
24778 
24779 // Recreate the carry if needed.
24780 EVT CarryVT = Carry.getValueType();
24781 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24782 Carry, DAG.getAllOnesConstant(DL, CarryVT));
24783
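  // Compute LHS - RHS - CF with SBB; the requested condition is then read
  // from the flags this produces.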
24784 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
24785 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
24786 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
24787}
24788
24789// This function returns three things: the arithmetic computation itself
24790// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
24791// flag and the condition code define the case in which the arithmetic
24792// computation overflows.
24793static std::pair<SDValue, SDValue>
24794 getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
24795   assert(Op.getResNo() == 0 && "Unexpected result number!");
24796 SDValue Value, Overflow;
24797 SDValue LHS = Op.getOperand(0);
24798 SDValue RHS = Op.getOperand(1);
24799 unsigned BaseOp = 0;
24800 SDLoc DL(Op);
24801 switch (Op.getOpcode()) {
24802 default: llvm_unreachable("Unknown ovf instruction!");
24803 case ISD::SADDO:
24804 BaseOp = X86ISD::ADD;
24805 Cond = X86::COND_O;
24806 break;
24807 case ISD::UADDO:
24808 BaseOp = X86ISD::ADD;
24809     Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
24810     break;
24811 case ISD::SSUBO:
24812 BaseOp = X86ISD::SUB;
24813 Cond = X86::COND_O;
24814 break;
24815 case ISD::USUBO:
24816 BaseOp = X86ISD::SUB;
24817 Cond = X86::COND_B;
24818 break;
24819 case ISD::SMULO:
24820 BaseOp = X86ISD::SMUL;
24821 Cond = X86::COND_O;
24822 break;
24823 case ISD::UMULO:
24824 BaseOp = X86ISD::UMUL;
24825 Cond = X86::COND_O;
24826 break;
24827 }
24828
24829 if (BaseOp) {
24830 // Also sets EFLAGS.
24831 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24832 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24833 Overflow = Value.getValue(1);
24834 }
24835
24836 return std::make_pair(Value, Overflow);
24837}
24838
24839 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24840   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24841 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24842 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24843 // has only one use.
24844 SDLoc DL(Op);
24845   X86::CondCode Cond;
24846   SDValue Value, Overflow;
24847 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
24848
24849 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
24850 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
24851 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
24852}
24853
24854/// Return true if opcode is a X86 logical comparison.
24855 static bool isX86LogicalCmp(SDValue Op) {
24856   unsigned Opc = Op.getOpcode();
24857 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
24858 Opc == X86ISD::FCMP)
24859 return true;
24860 if (Op.getResNo() == 1 &&
24861 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
24862        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
24863        Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
24864 return true;
24865
24866 return false;
24867}
24868
24869 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
24870   if (V.getOpcode() != ISD::TRUNCATE)
24871 return false;
24872
24873 SDValue VOp0 = V.getOperand(0);
24874 unsigned InBits = VOp0.getValueSizeInBits();
24875 unsigned Bits = V.getValueSizeInBits();
24876 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24877}
24878
24879// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
24880 static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
24881                                      unsigned X86CC, const SDLoc &DL,
24882 SelectionDAG &DAG,
24883 const X86Subtarget &Subtarget) {
24884 EVT CmpVT = CmpVal.getValueType();
24885 EVT VT = LHS.getValueType();
24886 if (!CmpVT.isScalarInteger() || !VT.isScalarInteger())
24887 return SDValue();
24888
24889 if (X86CC == X86::COND_E && CmpVal.getOpcode() == ISD::AND &&
24890 isOneConstant(CmpVal.getOperand(1))) {
24891 auto SplatLSB = [&](EVT SplatVT) {
24892       // We need a mask of all zeros or all ones with the same size as the
24893       // other operands.
24894 SDValue Neg = CmpVal;
24895 if (CmpVT.bitsGT(SplatVT))
24896 Neg = DAG.getNode(ISD::TRUNCATE, DL, SplatVT, CmpVal);
24897 else if (CmpVT.bitsLT(SplatVT))
24898 Neg = DAG.getNode(
24899 ISD::AND, DL, SplatVT,
24900 DAG.getNode(ISD::ANY_EXTEND, DL, SplatVT, CmpVal.getOperand(0)),
24901 DAG.getConstant(1, DL, SplatVT));
24902 return DAG.getNegative(Neg, DL, SplatVT); // -(and (x, 0x1))
24903 };
24904
24905 // SELECT (AND(X,1) == 0), 0, -1 -> NEG(AND(X,1))
24906     if (isNullConstant(LHS) && isAllOnesConstant(RHS))
24907       return SplatLSB(VT);
24908
24909 // SELECT (AND(X,1) == 0), C1, C2 -> XOR(C1,AND(NEG(AND(X,1)),XOR(C1,C2))
24910 if (!Subtarget.canUseCMOV() && isa<ConstantSDNode>(LHS) &&
24911         isa<ConstantSDNode>(RHS)) {
24912       SDValue Mask = SplatLSB(VT);
24913 SDValue Diff = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24914 SDValue Flip = DAG.getNode(ISD::AND, DL, VT, Mask, Diff);
24915 return DAG.getNode(ISD::XOR, DL, VT, LHS, Flip);
24916 }
24917
24918 SDValue Src1, Src2;
24919 auto isIdentityPatternZero = [&]() {
24920 switch (RHS.getOpcode()) {
24921 default:
24922 break;
24923 case ISD::OR:
24924 case ISD::XOR:
24925 case ISD::ADD:
24926 if (RHS.getOperand(0) == LHS || RHS.getOperand(1) == LHS) {
24927 Src1 = RHS.getOperand(RHS.getOperand(0) == LHS ? 1 : 0);
24928 Src2 = LHS;
24929 return true;
24930 }
24931 break;
24932 case ISD::SHL:
24933 case ISD::SRA:
24934 case ISD::SRL:
24935 case ISD::SUB:
24936 if (RHS.getOperand(0) == LHS) {
24937 Src1 = RHS.getOperand(1);
24938 Src2 = LHS;
24939 return true;
24940 }
24941 break;
24942 }
24943 return false;
24944 };
24945
24946 auto isIdentityPatternOnes = [&]() {
24947 switch (LHS.getOpcode()) {
24948 default:
24949 break;
24950 case ISD::AND:
24951 if (LHS.getOperand(0) == RHS || LHS.getOperand(1) == RHS) {
24952 Src1 = LHS.getOperand(LHS.getOperand(0) == RHS ? 1 : 0);
24953 Src2 = RHS;
24954 return true;
24955 }
24956 break;
24957 }
24958 return false;
24959 };
24960
24961 // Convert 'identity' patterns (iff X is 0 or 1):
24962 // SELECT (AND(X,1) == 0), Y, (OR Y, Z) -> (OR Y, (AND NEG(AND(X,1)), Z))
24963 // SELECT (AND(X,1) == 0), Y, (XOR Y, Z) -> (XOR Y, (AND NEG(AND(X,1)), Z))
24964 // SELECT (AND(X,1) == 0), Y, (ADD Y, Z) -> (ADD Y, (AND NEG(AND(X,1)), Z))
24965 // SELECT (AND(X,1) == 0), Y, (SUB Y, Z) -> (SUB Y, (AND NEG(AND(X,1)), Z))
24966 // SELECT (AND(X,1) == 0), Y, (SHL Y, Z) -> (SHL Y, (AND NEG(AND(X,1)), Z))
24967 // SELECT (AND(X,1) == 0), Y, (SRA Y, Z) -> (SRA Y, (AND NEG(AND(X,1)), Z))
24968 // SELECT (AND(X,1) == 0), Y, (SRL Y, Z) -> (SRL Y, (AND NEG(AND(X,1)), Z))
24969 if (!Subtarget.canUseCMOV() && isIdentityPatternZero()) {
24970 SDValue Mask = SplatLSB(Src1.getValueType());
24971 SDValue And = DAG.getNode(ISD::AND, DL, Src1.getValueType(), Mask,
24972 Src1); // Mask & z
24973 return DAG.getNode(RHS.getOpcode(), DL, VT, Src2, And); // y Op And
24974 }
24975 // SELECT (AND(X,1) == 0), (AND Y, Z), Y -> (AND Y, (OR NEG(AND(X, 1)), Z))
24976 if (!Subtarget.canUseCMOV() && isIdentityPatternOnes()) {
24977 SDValue Mask = SplatLSB(VT);
24978 SDValue Or = DAG.getNode(ISD::OR, DL, VT, Mask, Src1); // Mask | z
24979 return DAG.getNode(LHS.getOpcode(), DL, VT, Src2, Or); // y Op Or
24980 }
24981 }
24982
24983 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
24984       (isAllOnesConstant(LHS) || isAllOnesConstant(RHS))) {
24985     SDValue Y = isAllOnesConstant(RHS) ? LHS : RHS;
24986     SDVTList CmpVTs = DAG.getVTList(CmpVT, MVT::i32);
24987
24988 // 'X - 1' sets the carry flag if X == 0.
24989 // '0 - X' sets the carry flag if X != 0.
24990 // Convert the carry flag to a -1/0 mask with sbb:
24991 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24992 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24993 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24994 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24995 SDValue Sub;
24996 if (isAllOnesConstant(LHS) == (X86CC == X86::COND_NE)) {
24997 SDValue Zero = DAG.getConstant(0, DL, CmpVT);
24998 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpVal);
24999 } else {
25000 SDValue One = DAG.getConstant(1, DL, CmpVT);
25001 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpVal, One);
25002 }
25003 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25004 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
25005 Sub.getValue(1));
25006 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
25007 }
25008
25009 return SDValue();
25010}
25011
25012SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25013 bool AddTest = true;
25014 SDValue Cond = Op.getOperand(0);
25015 SDValue Op1 = Op.getOperand(1);
25016 SDValue Op2 = Op.getOperand(2);
25017 SDLoc DL(Op);
25018 MVT VT = Op1.getSimpleValueType();
25019 SDValue CC;
25020
25021 if (isSoftF16(VT, Subtarget)) {
25022 MVT NVT = VT.changeTypeToInteger();
25023 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25024 DAG.getBitcast(NVT, Op1),
25025 DAG.getBitcast(NVT, Op2)));
25026 }
25027
25028 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25029 // are available or VBLENDV if AVX is available.
25030 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25031 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25032 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25033 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25034 bool IsAlwaysSignaling;
25035 unsigned SSECC =
25036 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25037 CondOp0, CondOp1, IsAlwaysSignaling);
25038
25039 if (Subtarget.hasAVX512()) {
25040 SDValue Cmp =
25041 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25042 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25043 assert(!VT.isVector() && "Not a scalar type?");
25044 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25045 }
25046
25047 if (SSECC < 8 || Subtarget.hasAVX()) {
25048 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25049 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25050
25051 // If we have SSE41/AVX, we can use a variable vector select (VBLENDV)
25052 // instead of 3 logic instructions for size savings and potentially speed.
25053 // Unfortunately, there is no scalar form of VBLENDV.
25054 //
25055 // If either operand is a +0.0 constant, don't try this. We can expect to
25056 // optimize away at least one of the logic instructions later in that
25057 // case, so that sequence would be faster than a variable blend.
25058 if (Subtarget.hasSSE41() && !isNullFPConstant(Op1) &&
25059 !isNullFPConstant(Op2)) {
25060 // Convert to vectors, do a VSELECT, and convert back to scalar.
25061 // All of the conversions should be optimized away.
25062 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25063 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25064 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25065 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25066
25067 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25068 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25069
25070 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25071
25072 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
25073 DAG.getVectorIdxConstant(0, DL));
25074 }
25075 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25076 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25077 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25078 }
25079 }
25080
25081 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25082 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25083 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25084 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25085 }
25086
25087 if (Cond.getOpcode() == ISD::SETCC &&
25088 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
25089 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25090 Cond = NewCond;
25091 // If the condition was updated, it's possible that the operands of the
25092 // select were also updated (for example, EmitTest has a RAUW). Refresh
25093 // the local references to the select operands in case they got stale.
25094 Op1 = Op.getOperand(1);
25095 Op2 = Op.getOperand(2);
25096 }
25097 }
25098
25099 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25100 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25101 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25102 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25103 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25104 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25105 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25106 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25107 if (Cond.getOpcode() == X86ISD::SETCC &&
25108 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25109 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25110 SDValue Cmp = Cond.getOperand(1);
25111 SDValue CmpOp0 = Cmp.getOperand(0);
25112 unsigned CondCode = Cond.getConstantOperandVal(0);
25113
25114 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25115 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25116     // handling to keep the CMP with 0. This should be removed by
25117 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25118 // cttz_zero_undef.
25119 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25120 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25121 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25122 };
25123 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25124 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25125 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25126 // Keep Cmp.
25127 } else if (SDValue R = LowerSELECTWithCmpZero(CmpOp0, Op1, Op2, CondCode,
25128 DL, DAG, Subtarget)) {
25129 return R;
25130 } else if (VT.isScalarInteger() && isNullConstant(Op2) &&
25131 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25132 ((CondCode == X86::COND_S) || // smin(x, 0)
25133 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25134 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25135 //
25136 // If the comparison is testing for a positive value, we have to invert
25137 // the sign bit mask, so only do that transform if the target has a
25138 // bitwise 'and not' instruction (the invert is free).
25139 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25140 unsigned ShCt = VT.getSizeInBits() - 1;
25141 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25142 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25143 if (CondCode == X86::COND_G)
25144 Shift = DAG.getNOT(DL, Shift, VT);
25145 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25146 }
25147 }
25148
25149 // Look past (and (setcc_carry (cmp ...)), 1).
25150 if (Cond.getOpcode() == ISD::AND &&
25151 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25152 isOneConstant(Cond.getOperand(1)))
25153 Cond = Cond.getOperand(0);
25154
25155 // Attempt to fold "raw cond" cases by treating them as:
25156 // (select (and X, 1), Op1, Op2 --> (select (icmpeq (and X, 1), 0), Op2, Op1)
25157 if (Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))
25158 if (SDValue R = LowerSELECTWithCmpZero(Cond, Op2, Op1, X86::COND_E, DL, DAG,
25159 Subtarget))
25160 return R;
25161
25162 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25163 // setting operand in place of the X86ISD::SETCC.
25164 unsigned CondOpcode = Cond.getOpcode();
25165 if (CondOpcode == X86ISD::SETCC ||
25166 CondOpcode == X86ISD::SETCC_CARRY) {
25167 CC = Cond.getOperand(0);
25168
25169 SDValue Cmp = Cond.getOperand(1);
25170 bool IllegalFPCMov = false;
25171 if (VT.isFloatingPoint() && !VT.isVector() &&
25172 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25173 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25174
25175 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25176 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25177 Cond = Cmp;
25178 AddTest = false;
25179 }
25180 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25181 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25182 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25183 SDValue Value;
25184 X86::CondCode X86Cond;
25185 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25186
25187 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25188 AddTest = false;
25189 }
25190
25191 if (AddTest) {
25192 // Look past the truncate if the high bits are known zero.
25193     if (isTruncWithZeroHighBitsInput(Cond, DAG))
25194       Cond = Cond.getOperand(0);
25195
25196 // We know the result of AND is compared against zero. Try to match
25197 // it to BT.
25198 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25199 X86::CondCode X86CondCode;
25200 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25201 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25202 Cond = BT;
25203 AddTest = false;
25204 }
25205 }
25206 }
25207
25208 if (AddTest) {
25209 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25210 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25211 }
25212
25213 // a < b ? -1 : 0 -> RES = ~setcc_carry
25214 // a < b ? 0 : -1 -> RES = setcc_carry
25215 // a >= b ? -1 : 0 -> RES = setcc_carry
25216 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25217 if (Cond.getOpcode() == X86ISD::SUB) {
25218 unsigned CondCode = CC->getAsZExtVal();
25219
25220 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25221 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25222 (isNullConstant(Op1) || isNullConstant(Op2))) {
25223 SDValue Res =
25224 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25225 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25226 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25227 return DAG.getNOT(DL, Res, Res.getValueType());
25228 return Res;
25229 }
25230 }
25231
25232   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
25233 // widen the cmov and push the truncate through. This avoids introducing a new
25234 // branch during isel and doesn't add any extensions.
25235 if (Op.getValueType() == MVT::i8 &&
25236 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25237 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25238 if (T1.getValueType() == T2.getValueType() &&
25239 // Exclude CopyFromReg to avoid partial register stalls.
25240 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25241 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25242 CC, Cond);
25243 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25244 }
25245 }
25246
25247 // Or finally, promote i8 cmovs if we have CMOV,
25248 // or i16 cmovs if it won't prevent folding a load.
25249 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25250 // legal, but EmitLoweredSelect() can not deal with these extensions
25251 // being inserted between two CMOV's. (in i16 case too TBN)
25252 // https://bugs.llvm.org/show_bug.cgi?id=40974
25253 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25254 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25255 !X86::mayFoldLoad(Op2, Subtarget))) {
25256 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25257 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25258 SDValue Ops[] = { Op2, Op1, CC, Cond };
25259 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25260 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25261 }
25262
25263 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25264 // condition is true.
25265 SDValue Ops[] = { Op2, Op1, CC, Cond };
25266 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
25267}
25268
25269 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
25270                                      const X86Subtarget &Subtarget,
25271 SelectionDAG &DAG) {
25272 MVT VT = Op->getSimpleValueType(0);
25273 SDValue In = Op->getOperand(0);
25274 MVT InVT = In.getSimpleValueType();
25275 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25276 MVT VTElt = VT.getVectorElementType();
25277 unsigned NumElts = VT.getVectorNumElements();
25278
25279 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25280 MVT ExtVT = VT;
25281 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25282 // If v16i32 is to be avoided, we'll need to split and concatenate.
25283 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25284 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25285
25286 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25287 }
25288
25289 // Widen to 512-bits if VLX is not supported.
25290 MVT WideVT = ExtVT;
25291 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25292 NumElts *= 512 / ExtVT.getSizeInBits();
25293 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25294 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In,
25295 DAG.getVectorIdxConstant(0, dl));
25296 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25297 }
25298
25299 SDValue V;
25300 MVT WideEltVT = WideVT.getVectorElementType();
25301 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25302 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25303 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25304 } else {
25305 SDValue NegOne = DAG.getAllOnesConstant(dl, WideVT);
25306 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25307 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25308 }
25309
25310 // Truncate if we had to extend i16/i8 above.
25311 if (VT != ExtVT) {
25312 WideVT = MVT::getVectorVT(VTElt, NumElts);
25313 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25314 }
25315
25316 // Extract back to 128/256-bit if we widened.
25317 if (WideVT != VT)
25318 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25319 DAG.getVectorIdxConstant(0, dl));
25320
25321 return V;
25322}
25323
25324 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25325                                SelectionDAG &DAG) {
25326 SDValue In = Op->getOperand(0);
25327 MVT InVT = In.getSimpleValueType();
25328 SDLoc DL(Op);
25329
25330 if (InVT.getVectorElementType() == MVT::i1)
25331 return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
25332
25333 assert(Subtarget.hasAVX() && "Expected AVX support");
25334 return LowerAVXExtend(Op, DL, DAG, Subtarget);
25335}
25336
25337// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25338// For sign extend this needs to handle all vector sizes and SSE4.1 and
25339// non-SSE4.1 targets. For zero extend this should only handle inputs of
25340// MVT::v64i8 when BWI is not supported, but AVX512 is.
25341 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25342                                         const X86Subtarget &Subtarget,
25343 SelectionDAG &DAG) {
25344 SDValue In = Op->getOperand(0);
25345 MVT VT = Op->getSimpleValueType(0);
25346 MVT InVT = In.getSimpleValueType();
25347
25348 MVT SVT = VT.getVectorElementType();
25349 MVT InSVT = InVT.getVectorElementType();
25351
25352 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25353 return SDValue();
25354 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25355 return SDValue();
25356 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25357 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25358 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25359 return SDValue();
25360
25361 SDLoc dl(Op);
25362 unsigned Opc = Op.getOpcode();
25363 unsigned NumElts = VT.getVectorNumElements();
25364
25365 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25366 // For 512-bit vectors, we need 128-bits or 256-bits.
25367 if (InVT.getSizeInBits() > 128) {
25368 // Input needs to be at least the same number of elements as output, and
25369 // at least 128-bits.
25370 int InSize = InSVT.getSizeInBits() * NumElts;
25371 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25372 InVT = In.getSimpleValueType();
25373 }
25374
25375 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
25376 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
25377 // need to be handled here for 256/512-bit results.
25378 if (Subtarget.hasInt256()) {
25379 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25380
25381 if (InVT.getVectorNumElements() != NumElts)
25382 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25383
25384 // FIXME: Apparently we create inreg operations that could be regular
25385 // extends.
25386 unsigned ExtOpc =
25387         Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25388                                              : ISD::ZERO_EXTEND;
25389     return DAG.getNode(ExtOpc, dl, VT, In);
25390 }
25391
25392 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25393 if (Subtarget.hasAVX()) {
25394 assert(VT.is256BitVector() && "256-bit vector expected");
25395 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25396 int HalfNumElts = HalfVT.getVectorNumElements();
25397
25398 unsigned NumSrcElts = InVT.getVectorNumElements();
25399 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25400 for (int i = 0; i != HalfNumElts; ++i)
25401 HiMask[i] = HalfNumElts + i;
25402
25403 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25404 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25405 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25406 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25407 }
25408
25409 // We should only get here for sign extend.
25410 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25411 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25412 unsigned InNumElts = InVT.getVectorNumElements();
25413
25414 // If the source elements are already all-signbits, we don't need to extend,
25415 // just splat the elements.
25416 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25417 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
25418 unsigned Scale = InNumElts / NumElts;
25419 SmallVector<int, 16> ShuffleMask;
25420 for (unsigned I = 0; I != NumElts; ++I)
25421 ShuffleMask.append(Scale, I);
25422 return DAG.getBitcast(VT,
25423 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
25424 }
25425
25426 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25427 SDValue Curr = In;
25428 SDValue SignExt = Curr;
25429
25430 // As SRAI is only available on i16/i32 types, we expand only up to i32
25431 // and handle i64 separately.
25432 if (InVT != MVT::v4i32) {
25433 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25434
25435 unsigned DestWidth = DestVT.getScalarSizeInBits();
25436 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25437 unsigned DestElts = DestVT.getVectorNumElements();
25438
25439 // Build a shuffle mask that takes each input element and places it in the
25440 // MSBs of the new element size.
25441 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25442 for (unsigned i = 0; i != DestElts; ++i)
25443 Mask[i * Scale + (Scale - 1)] = i;
25444
25445 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25446 Curr = DAG.getBitcast(DestVT, Curr);
25447
25448 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25449 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25450 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25451 }
25452
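  // For v2i64, build the high half of each element from the sign mask
  // (0 > Curr) and interleave it with the sign-extended low half.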
25453 if (VT == MVT::v2i64) {
25454 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25455 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25456 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25457 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
25458 SignExt = DAG.getBitcast(VT, SignExt);
25459 }
25460
25461 return SignExt;
25462}
25463
25464 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25465                                 SelectionDAG &DAG) {
25466 MVT VT = Op->getSimpleValueType(0);
25467 SDValue In = Op->getOperand(0);
25468 MVT InVT = In.getSimpleValueType();
25469 SDLoc dl(Op);
25470
25471 if (InVT.getVectorElementType() == MVT::i1)
25472 return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
25473
25474 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25475   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25476          "Expected same number of elements");
25477 assert((VT.getVectorElementType() == MVT::i16 ||
25478 VT.getVectorElementType() == MVT::i32 ||
25479 VT.getVectorElementType() == MVT::i64) &&
25480 "Unexpected element type");
25481 assert((InVT.getVectorElementType() == MVT::i8 ||
25482 InVT.getVectorElementType() == MVT::i16 ||
25483 InVT.getVectorElementType() == MVT::i32) &&
25484 "Unexpected element type");
25485
25486 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
25487 assert(InVT == MVT::v32i8 && "Unexpected VT!");
25488 return splitVectorIntUnary(Op, DAG, dl);
25489 }
25490
25491 if (Subtarget.hasInt256())
25492 return Op;
25493
25494 // Optimize vectors in AVX mode
25495 // Sign extend v8i16 to v8i32 and
25496 // v4i32 to v4i64
25497 //
25498 // Divide input vector into two parts
25499 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
25500 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
25501 // concat the vectors to original VT
25502 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25503 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
25504
25505 unsigned NumElems = InVT.getVectorNumElements();
25506 SmallVector<int,8> ShufMask(NumElems, -1);
25507 for (unsigned i = 0; i != NumElems/2; ++i)
25508 ShufMask[i] = i + NumElems/2;
25509
25510 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
25511 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
25512
25513 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
25514}
25515
25516/// Change a vector store into a pair of half-size vector stores.
25517 static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
25518   SDValue StoredVal = Store->getValue();
25519 assert((StoredVal.getValueType().is256BitVector() ||
25520 StoredVal.getValueType().is512BitVector()) &&
25521 "Expecting 256/512-bit op");
25522
25523 // Splitting volatile memory ops is not allowed unless the operation was not
25524 // legal to begin with. Assume the input store is legal (this transform is
25525 // only used for targets with AVX). Note: It is possible that we have an
25526 // illegal type like v2i128, and so we could allow splitting a volatile store
25527 // in that case if that is important.
25528 if (!Store->isSimple())
25529 return SDValue();
25530
25531 SDLoc DL(Store);
25532 SDValue Value0, Value1;
25533 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
25534 unsigned HalfOffset = Value0.getValueType().getStoreSize();
25535 SDValue Ptr0 = Store->getBasePtr();
25536 SDValue Ptr1 =
25537 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
25538 SDValue Ch0 =
25539 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
25540 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25541 SDValue Ch1 =
25542 DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
25543 Store->getPointerInfo().getWithOffset(HalfOffset),
25544 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25545 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
25546}
25547
25548/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
25549/// type.
25550 static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
25551                                     SelectionDAG &DAG) {
25552 SDValue StoredVal = Store->getValue();
25553 assert(StoreVT.is128BitVector() &&
25554 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
25555 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
25556
25557 // Splitting volatile memory ops is not allowed unless the operation was not
25558 // legal to begin with. We are assuming the input op is legal (this transform
25559 // is only used for targets with AVX).
25560 if (!Store->isSimple())
25561 return SDValue();
25562
25563 MVT StoreSVT = StoreVT.getScalarType();
25564 unsigned NumElems = StoreVT.getVectorNumElements();
25565 unsigned ScalarSize = StoreSVT.getStoreSize();
25566
25567 SDLoc DL(Store);
25568   SmallVector<SDValue, 4> Stores;
25569   for (unsigned i = 0; i != NumElems; ++i) {
25570 unsigned Offset = i * ScalarSize;
25571 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
25572                                            TypeSize::getFixed(Offset), DL);
25573     SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
25574 DAG.getVectorIdxConstant(i, DL));
25575 SDValue Ch =
25576 DAG.getStore(Store->getChain(), DL, Scl, Ptr,
25577 Store->getPointerInfo().getWithOffset(Offset),
25578 Store->getBaseAlign(), Store->getMemOperand()->getFlags());
25579 Stores.push_back(Ch);
25580 }
25581 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
25582}
25583
25584static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
25585 SelectionDAG &DAG) {
25586 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
25587 SDLoc dl(St);
25588 SDValue StoredVal = St->getValue();
25589
25590 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
25591 if (StoredVal.getValueType().isVector() &&
25592 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
25593 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
25594 assert(NumElts <= 8 && "Unexpected VT");
25595 assert(!St->isTruncatingStore() && "Expected non-truncating store");
25596 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25597 "Expected AVX512F without AVX512DQI");
25598
25599 // We must pad with zeros to ensure we store zeroes to any unused bits.
25600 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25601 DAG.getUNDEF(MVT::v16i1), StoredVal,
25602 DAG.getVectorIdxConstant(0, dl));
25603 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
25604 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
25605 // Make sure we store zeros in the extra bits.
25606 if (NumElts < 8)
25607 StoredVal = DAG.getZeroExtendInReg(
25608 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
25609
25610 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25611 St->getPointerInfo(), St->getBaseAlign(),
25612 St->getMemOperand()->getFlags());
25613 }
25614
25615 if (St->isTruncatingStore())
25616 return SDValue();
25617
25618 // If this is a 256/512-bit store of concatenated ops, we are better off
25619 // splitting that store into two half-size stores. This avoids spurious use of
25620 // concatenated ops and each half can execute independently. Some cores would
25621 // split the op into halves anyway, so the concat is purely an extra op.
25622 MVT StoreVT = StoredVal.getSimpleValueType();
25623 if (StoreVT.is256BitVector() || StoreVT.is512BitVector()) {
25624 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal, DAG))
25625 return splitVectorStore(St, DAG);
25626 return SDValue();
25627 }
25628
25629 if (StoreVT.is32BitVector())
25630 return SDValue();
25631
25632 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25633 assert(StoreVT.is64BitVector() && "Unexpected VT");
25634 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
25635              TargetLowering::TypeWidenVector &&
25636          "Unexpected type action!");
25637
25638 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
25639 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
25640 DAG.getUNDEF(StoreVT));
25641
25642 if (Subtarget.hasSSE2()) {
25643 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
25644 // and store it.
25645 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
25646 MVT CastVT = MVT::getVectorVT(StVT, 2);
25647 StoredVal = DAG.getBitcast(CastVT, StoredVal);
25648 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
25649 DAG.getVectorIdxConstant(0, dl));
25650
25651 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
25652 St->getPointerInfo(), St->getBaseAlign(),
25653 St->getMemOperand()->getFlags());
25654 }
25655 assert(Subtarget.hasSSE1() && "Expected SSE");
25656 SDVTList Tys = DAG.getVTList(MVT::Other);
25657 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
25658 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
25659 St->getMemOperand());
25660}
25661
25662// Lower vector extended loads using a shuffle. If SSSE3 is not available we
25663// may emit an illegal shuffle but the expansion is still better than scalar
25664 // code. We generate sext/sext_invec for SEXTLOADs when the target supports it;
25665 // otherwise we'll emit a shuffle and an arithmetic shift.
25666// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
25667// TODO: It is possible to support ZExt by zeroing the undef values during
25668// the shuffle phase or after the shuffle.
25669static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
25670 SelectionDAG &DAG) {
25671 MVT RegVT = Op.getSimpleValueType();
25672 assert(RegVT.isVector() && "We only custom lower vector loads.");
25673 assert(RegVT.isInteger() &&
25674 "We only custom lower integer vector loads.");
25675
25676 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
25677 SDLoc dl(Ld);
25678
25679 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
25680 if (RegVT.getVectorElementType() == MVT::i1) {
25681 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
25682 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
25683 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
25684 "Expected AVX512F without AVX512DQI");
25685
25686 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
25687 Ld->getPointerInfo(), Ld->getBaseAlign(),
25688 Ld->getMemOperand()->getFlags());
25689
25690 // Replace chain users with the new chain.
25691 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
25692
25693 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
25694 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
25695 DAG.getBitcast(MVT::v16i1, Val),
25696 DAG.getVectorIdxConstant(0, dl));
25697 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
25698 }
25699
25700 return SDValue();
25701}
25702
25703/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
25704/// each of which has no other use apart from the AND / OR.
25705static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
25706 Opc = Op.getOpcode();
25707 if (Opc != ISD::OR && Opc != ISD::AND)
25708 return false;
25709 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
25710 Op.getOperand(0).hasOneUse() &&
25711 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
25712 Op.getOperand(1).hasOneUse());
25713}
25714
25715SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
25716 SDValue Chain = Op.getOperand(0);
25717 SDValue Cond = Op.getOperand(1);
25718 SDValue Dest = Op.getOperand(2);
25719 SDLoc dl(Op);
25720
25721 // Bail out when we don't have native compare instructions.
25722 if (Cond.getOpcode() == ISD::SETCC &&
25723 Cond.getOperand(0).getValueType() != MVT::f128 &&
25724 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
25725 SDValue LHS = Cond.getOperand(0);
25726 SDValue RHS = Cond.getOperand(1);
25727 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
25728
25729 // Special case for
25730 // setcc([su]{add,sub,mul}o == 0)
25731 // setcc([su]{add,sub,mul}o != 1)
25732 if (ISD::isOverflowIntrOpRes(LHS) &&
25733 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
25734 (isNullConstant(RHS) || isOneConstant(RHS))) {
25735 SDValue Value, Overflow;
25736 X86::CondCode X86Cond;
25737 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
25738
25739 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
25740 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
25741
25742 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25743 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25744 Overflow, Op->getFlags());
25745 }
25746
25747 if (LHS.getSimpleValueType().isInteger()) {
25748 SDValue CCVal;
25749 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
25750 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25751 EFLAGS, Op->getFlags());
25752 }
25753
25754 if (CC == ISD::SETOEQ) {
25755 // For FCMP_OEQ, we can emit
25756 // two branches instead of an explicit AND instruction with a
25757 // separate test. However, we only do this if this block doesn't
25758 // have a fall-through edge, because this requires an explicit
25759 // jmp when the condition is false.
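// For example, for (a == b) on floats the lowered sequence is effectively:
//   ucomiss %xmm1, %xmm0
//   jne  false_bb
//   jp   false_bb        ; unordered (NaN) also goes to the false block
//   jmp  true_bb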
25760 if (Op.getNode()->hasOneUse()) {
25761 SDNode *User = *Op.getNode()->user_begin();
25762 // Look for an unconditional branch following this conditional branch.
25763 // We need this because we need to reverse the successors in order
25764 // to implement FCMP_OEQ.
25765 if (User->getOpcode() == ISD::BR) {
25766 SDValue FalseBB = User->getOperand(1);
25767 SDNode *NewBR =
25768 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
25769 assert(NewBR == User);
25770 (void)NewBR;
25771 Dest = FalseBB;
25772
25773 SDValue Cmp =
25774 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25775 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25776 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
25777 CCVal, Cmp, Op->getFlags());
25778 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25779 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25780 Cmp, Op->getFlags());
25781 }
25782 }
25783 } else if (CC == ISD::SETUNE) {
25784 // For FCMP_UNE, we can emit
25785 // two branches instead of an explicit OR instruction with a
25786 // separate test.
25787 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25788 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
25789 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25790 Cmp, Op->getFlags());
25791 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
25792 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25793 Cmp, Op->getFlags());
25794 } else {
25795 X86::CondCode X86Cond =
25796 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
25797 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
25798 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25799 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25800 Cmp, Op->getFlags());
25801 }
25802 }
25803
25804 if (ISD::isOverflowIntrOpRes(Cond)) {
25805 SDValue Value, Overflow;
25806 X86::CondCode X86Cond;
25807 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25808
25809 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25810 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
25811 Overflow, Op->getFlags());
25812 }
25813
25814 // Look past the truncate if the high bits are known zero.
25815 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25816 Cond = Cond.getOperand(0);
25817
25818 EVT CondVT = Cond.getValueType();
25819
25820 // Add an AND with 1 if we don't already have one.
25821 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
25822 Cond =
25823 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
25824
25825 SDValue LHS = Cond;
25826 SDValue RHS = DAG.getConstant(0, dl, CondVT);
25827
25828 SDValue CCVal;
25829 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
25830 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, EFLAGS,
25831 Op->getFlags());
25832}
25833
25834// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
25835// Calls to _alloca are needed to probe the stack when allocating more than 4k
25836// bytes in one go. Touching the stack at 4K increments is necessary to ensure
25837// that the guard pages used by the OS virtual memory manager are allocated in
25838 // the correct sequence.
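// For example, a dynamic allocation of 12KB cannot simply subtract 12KB from
// the stack pointer: each intervening 4KB page must be touched in order so
// the OS commits its guard page before the next page is accessed.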
25839SDValue
25840X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
25841 SelectionDAG &DAG) const {
25842 MachineFunction &MF = DAG.getMachineFunction();
25843 bool SplitStack = MF.shouldSplitStack();
25844 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
25845 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
25846 SplitStack || EmitStackProbeCall;
25847 SDLoc dl(Op);
25848
25849 // Get the inputs.
25850 SDNode *Node = Op.getNode();
25851 SDValue Chain = Op.getOperand(0);
25852 SDValue Size = Op.getOperand(1);
25853 MaybeAlign Alignment(Op.getConstantOperandVal(2));
25854 EVT VT = Node->getValueType(0);
25855
25856 // Chain the dynamic stack allocation so that it doesn't modify the stack
25857 // pointer when other instructions are using the stack.
25858 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
25859
25860 bool Is64Bit = Subtarget.is64Bit();
25861 MVT SPTy = Op.getValueType().getSimpleVT();
25862
25863 SDValue Result;
25864 if (!Lower) {
25865 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25866 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
25867 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
25868 " not tell us which reg is the stack pointer!");
25869
25870 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
25871 const Align StackAlign = TFI.getStackAlign();
25872 if (hasInlineStackProbe(MF)) {
25873 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, {SPTy, MVT::Other},
25874 {Chain, Size});
25875 Chain = Result.getValue(1);
25876 } else {
25877 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
25878 Chain = SP.getValue(1);
25879 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
25880 }
25881 if (Alignment && *Alignment > StackAlign)
25882 Result = DAG.getNode(
25883 ISD::AND, dl, VT, Result,
25884 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25885 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
25886 } else if (SplitStack) {
25887 if (Is64Bit) {
25888 // The 64 bit implementation of segmented stacks needs to clobber both r10
25889 // and r11. This makes it impossible to use it along with nested parameters.
25890 const Function &F = MF.getFunction();
25891 for (const auto &A : F.args()) {
25892 if (A.hasNestAttr())
25893 report_fatal_error("Cannot use segmented stacks with functions that "
25894 "have nested arguments.");
25895 }
25896 }
25897
25898 Result =
25899 DAG.getNode(X86ISD::SEG_ALLOCA, dl, {SPTy, MVT::Other}, {Chain, Size});
25900 Chain = Result.getValue(1);
25901 } else {
25902 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
25903 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
25904 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
25905
25906 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25907 Register SPReg = RegInfo->getStackRegister();
25908 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
25909 Chain = SP.getValue(1);
25910
25911 if (Alignment) {
25912 SP = DAG.getNode(
25913 ISD::AND, dl, VT, SP.getValue(0),
25914 DAG.getSignedConstant(~(Alignment->value() - 1ULL), dl, VT));
25915 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
25916 }
25917
25918 Result = SP;
25919 }
25920
25921 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
25922
25923 SDValue Ops[2] = {Result, Chain};
25924 return DAG.getMergeValues(Ops, dl);
25925}
25926
25927SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
25928 MachineFunction &MF = DAG.getMachineFunction();
25929 SDValue Ptr = Op.getOperand(1);
25930 EVT PtrVT = Ptr.getValueType();
25931
25932 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
25933
25934 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25935 SDLoc DL(Op);
25936
25937 if (!Subtarget.is64Bit() ||
25938 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
25939 // vastart just stores the address of the VarArgsFrameIndex slot into the
25940 // memory location argument.
25941 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25942 return DAG.getStore(Op.getOperand(0), DL, FR, Ptr, MachinePointerInfo(SV));
25943 }
25944
25945 // __va_list_tag:
25946 // gp_offset (0 - 6 * 8)
25947 // fp_offset (48 - 48 + 8 * 16)
25948 // overflow_arg_area (point to parameters coming in memory).
25949 // reg_save_area
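// On LP64 the fields live at byte offsets 0 (gp_offset), 4 (fp_offset),
// 8 (overflow_arg_area) and 16 (reg_save_area); the stores below use the
// matching MachinePointerInfo offsets (4/8/16, or 12 for the X32 ABI).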
25950 SmallVector<SDValue, 4> MemOps;
25951 SDValue FIN = Op.getOperand(1);
25952 // Store gp_offset
25953 SDValue Store = DAG.getStore(
25954 Op.getOperand(0), DL,
25955 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25956 MachinePointerInfo(SV));
25957 MemOps.push_back(Store);
25958
25959 // Store fp_offset
25960 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25961 Store = DAG.getStore(
25962 Op.getOperand(0), DL,
25963 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25964 MachinePointerInfo(SV, 4));
25965 MemOps.push_back(Store);
25966
25967 // Store ptr to overflow_arg_area
25968 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25969 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25970 Store =
25971 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25972 MemOps.push_back(Store);
25973
25974 // Store ptr to reg_save_area.
25975 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25976 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25977 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25978 Store = DAG.getStore(
25979 Op.getOperand(0), DL, RSFIN, FIN,
25980 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25981 MemOps.push_back(Store);
25982 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25983}
25984
25985SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25986 assert(Subtarget.is64Bit() &&
25987 "LowerVAARG only handles 64-bit va_arg!");
25988 assert(Op.getNumOperands() == 4);
25989
25990 MachineFunction &MF = DAG.getMachineFunction();
25991 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25992 // The Win64 ABI uses char* instead of a structure.
25993 return DAG.expandVAArg(Op.getNode());
25994
25995 SDValue Chain = Op.getOperand(0);
25996 SDValue SrcPtr = Op.getOperand(1);
25997 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25998 unsigned Align = Op.getConstantOperandVal(3);
25999 SDLoc dl(Op);
26000
26001 EVT ArgVT = Op.getNode()->getValueType(0);
26002 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
26003 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
26004 uint8_t ArgMode;
26005
26006 // Decide which area this value should be read from.
26007 // TODO: Implement the AMD64 ABI in its entirety. This simple
26008 // selection mechanism works only for the basic types.
26009 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26010 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26011 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26012 } else {
26013 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26014 "Unhandled argument type in LowerVAARG");
26015 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26016 }
26017
26018 if (ArgMode == 2) {
26019 // Make sure using fp_offset makes sense.
26020 assert(!Subtarget.useSoftFloat() &&
26021 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26022 Subtarget.hasSSE1());
26023 }
26024
26025 // Insert VAARG node into the DAG
26026 // VAARG returns two values: Variable Argument Address, Chain
26027 SDValue InstOps[] = {Chain, SrcPtr,
26028 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26029 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26030 DAG.getTargetConstant(Align, dl, MVT::i32)};
26031 SDVTList VTs = DAG.getVTList(SrcPtr.getValueType(), MVT::Other);
26032 SDValue VAARG = DAG.getMemIntrinsicNode(
26033 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26034 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26035 /*Alignment=*/std::nullopt,
26036 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26037 Chain = VAARG.getValue(1);
26038
26039 // Load the next argument and return it
26040 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26041}
26042
26043static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26044 SelectionDAG &DAG) {
26045 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26046 // where a va_list is still an i8*.
26047 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
26048 if (Subtarget.isCallingConvWin64(
26049 DAG.getMachineFunction().getFunction().getCallingConv()))
26050 // Probably a Win64 va_copy.
26051 return DAG.expandVACopy(Op.getNode());
26052
26053 SDValue Chain = Op.getOperand(0);
26054 SDValue DstPtr = Op.getOperand(1);
26055 SDValue SrcPtr = Op.getOperand(2);
26056 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26057 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26058 SDLoc DL(Op);
26059
26060 return DAG.getMemcpy(
26061 Chain, DL, DstPtr, SrcPtr,
26062 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26063 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26064 /*CI=*/nullptr, std::nullopt, MachinePointerInfo(DstSV),
26065 MachinePointerInfo(SrcSV));
26066}
26067
26068// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26069static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26070 switch (Opc) {
26071 case ISD::SHL:
26072 case X86ISD::VSHL:
26073 case X86ISD::VSHLI:
26074 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26075 case ISD::SRL:
26076 case X86ISD::VSRL:
26077 case X86ISD::VSRLI:
26078 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26079 case ISD::SRA:
26080 case X86ISD::VSRA:
26081 case X86ISD::VSRAI:
26082 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26083 }
26084 llvm_unreachable("Unknown target vector shift node");
26085}
26086
26087/// Handle vector element shifts where the shift amount is a constant.
26088/// Takes immediate version of shift as input.
26089static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26090 SDValue SrcOp, uint64_t ShiftAmt,
26091 SelectionDAG &DAG) {
26092 MVT ElementType = VT.getVectorElementType();
26093
26094 // Bitcast the source vector to the output type, this is mainly necessary for
26095 // vXi8/vXi64 shifts.
26096 if (VT != SrcOp.getSimpleValueType())
26097 SrcOp = DAG.getBitcast(VT, SrcOp);
26098
26099 // Fold this packed shift into its first operand if ShiftAmt is 0.
26100 if (ShiftAmt == 0)
26101 return SrcOp;
26102
26103 // Check for ShiftAmt >= element width
26104 if (ShiftAmt >= ElementType.getSizeInBits()) {
26105 if (Opc == X86ISD::VSRAI)
26106 ShiftAmt = ElementType.getSizeInBits() - 1;
26107 else
26108 return DAG.getConstant(0, dl, VT);
26109 }
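// For example, a logical shift of v8i16 elements by 16 or more is simply the
// zero vector, while an arithmetic shift is clamped to 15 so every element
// becomes a copy of its sign bit.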
26110
26111 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26112 && "Unknown target vector shift-by-constant node");
26113
26114 // Fold this packed vector shift into a build vector if SrcOp is a
26115 // vector of Constants or UNDEFs.
26116 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26117 unsigned ShiftOpc;
26118 switch (Opc) {
26119 default: llvm_unreachable("Unknown opcode!");
26120 case X86ISD::VSHLI:
26121 ShiftOpc = ISD::SHL;
26122 break;
26123 case X86ISD::VSRLI:
26124 ShiftOpc = ISD::SRL;
26125 break;
26126 case X86ISD::VSRAI:
26127 ShiftOpc = ISD::SRA;
26128 break;
26129 }
26130
26131 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26132 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26133 return C;
26134 }
26135
26136 return DAG.getNode(Opc, dl, VT, SrcOp,
26137 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26138}
26139
26140/// Handle vector element shifts by a splat shift amount
26141static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26142 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26143 const X86Subtarget &Subtarget,
26144 SelectionDAG &DAG) {
26145 MVT AmtVT = ShAmt.getSimpleValueType();
26146 assert(AmtVT.isVector() && "Vector shift type mismatch");
26147 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26148 "Illegal vector splat index");
26149
26150 // Move the splat element to the bottom element.
26151 if (ShAmtIdx != 0) {
26152 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26153 Mask[0] = ShAmtIdx;
26154 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26155 }
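// For example, with ShAmtIdx == 2 on a v4i32 amount the shuffle mask is
// <2, -1, -1, -1>, leaving the splatted shift amount in element 0 and the
// remaining lanes undefined.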
26156
26157 // Peek through any zext node if we can get back to a 128-bit source.
26158 if (AmtVT.getScalarSizeInBits() == 64 &&
26159 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26160 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26161 ShAmt.getOperand(0).getValueType().isSimple() &&
26162 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26163 ShAmt = ShAmt.getOperand(0);
26164 AmtVT = ShAmt.getSimpleValueType();
26165 }
26166
26167 // See if we can mask off the upper elements using the existing source node.
26168 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26169 // do this for vXi64 types.
26170 bool IsMasked = false;
26171 if (AmtVT.getScalarSizeInBits() < 64) {
26172 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26173 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26174 // If the shift amount has come from a scalar, then zero-extend the scalar
26175 // before moving to the vector.
26176 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26177 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26178 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26179 AmtVT = MVT::v4i32;
26180 IsMasked = true;
26181 } else if (ShAmt.getOpcode() == ISD::AND) {
26182 // See if the shift amount is already masked (e.g. for rotation modulo),
26183 // then we can zero-extend it by setting all the other mask elements to
26184 // zero.
26185 SmallVector<SDValue> MaskElts(
26186 AmtVT.getVectorNumElements(),
26187 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26188 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26189 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26190 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26191 {ShAmt.getOperand(1), Mask}))) {
26192 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26193 IsMasked = true;
26194 }
26195 }
26196 }
26197
26198 // Extract if the shift amount vector is larger than 128-bits.
26199 if (AmtVT.getSizeInBits() > 128) {
26200 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26201 AmtVT = ShAmt.getSimpleValueType();
26202 }
26203
26204 // Zero-extend bottom element to v2i64 vector type, either by extension or
26205 // shuffle masking.
26206 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26207 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26208 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26209 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26210 } else if (Subtarget.hasSSE41()) {
26211 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26212 MVT::v2i64, ShAmt);
26213 } else {
26214 SDValue ByteShift = DAG.getTargetConstant(
26215 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26216 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26217 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26218 ByteShift);
26219 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26220 ByteShift);
26221 }
26222 }
26223
26224 // Change opcode to non-immediate version.
26225 Opc = getTargetVShiftUniformOpcode(Opc, true);
26226
26227 // The return type has to be a 128-bit type with the same element
26228 // type as the input type.
26229 MVT EltVT = VT.getVectorElementType();
26230 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26231
26232 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26233 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26234}
26235
26236/// Return Mask with the necessary casting or extending
26237/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26238static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26239 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26240 const SDLoc &dl) {
26241
26242 if (isAllOnesConstant(Mask))
26243 return DAG.getConstant(1, dl, MaskVT);
26244 if (X86::isZeroNode(Mask))
26245 return DAG.getConstant(0, dl, MaskVT);
26246
26247 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26248
26249 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26250 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26251 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26252 // In 32-bit mode, bitcasting an i64 is illegal, so split it into two i32 halves.
26253 SDValue Lo, Hi;
26254 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
26255 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26256 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26257 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26258 } else {
26259 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26260 Mask.getSimpleValueType().getSizeInBits());
26261 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
26262 // are extracted by EXTRACT_SUBVECTOR.
26263 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26264 DAG.getBitcast(BitcastVT, Mask),
26265 DAG.getVectorIdxConstant(0, dl));
26266 }
26267}
26268
26269/// Return (and \p Op, \p Mask) for compare instructions or
26270/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26271/// necessary casting or extending for \p Mask when lowering masking intrinsics
26272 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26273 SDValue PreservedSrc,
26274 const X86Subtarget &Subtarget,
26275 SelectionDAG &DAG) {
26276 MVT VT = Op.getSimpleValueType();
26277 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26278 unsigned OpcodeSelect = ISD::VSELECT;
26279 SDLoc dl(Op);
26280
26281 if (isAllOnesConstant(Mask))
26282 return Op;
26283
26284 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26285
26286 if (PreservedSrc.isUndef())
26287 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26288 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26289}
26290
26291/// Creates an SDNode for a predicated scalar operation.
26292/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26293/// The mask is coming as MVT::i8 and it should be transformed
26294/// to MVT::v1i1 while lowering masking intrinsics.
26295/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26296/// "X86select" instead of "vselect". We just can't create the "vselect" node
26297/// for a scalar instruction.
26298 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26299 SDValue PreservedSrc,
26300 const X86Subtarget &Subtarget,
26301 SelectionDAG &DAG) {
26302 auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
26303 if (MaskConst && (MaskConst->getZExtValue() & 0x1))
26304 return Op;
26305
26306 MVT VT = Op.getSimpleValueType();
26307 SDLoc dl(Op);
26308
26309 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
26310 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26311 DAG.getBitcast(MVT::v8i1, Mask),
26312 DAG.getVectorIdxConstant(0, dl));
26313 if (Op.getOpcode() == X86ISD::FSETCCM ||
26314 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26315 Op.getOpcode() == X86ISD::VFPCLASSS)
26316 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26317
26318 if (PreservedSrc.isUndef())
26319 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26320
26321 if (MaskConst) {
26322 assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
26323 // Discard op and blend passthrough with scalar op src/dst.
26324 SmallVector<int> ShuffleMask(VT.getVectorNumElements());
26325 std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
26326 ShuffleMask[0] = VT.getVectorNumElements();
26327 return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
26328 ShuffleMask);
26329 }
26330
26331 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26332}
26333
26334 static int getSEHRegistrationNodeSize(const Function *Fn) {
26335 if (!Fn->hasPersonalityFn())
26336 report_fatal_error(
26337 "querying registration node size for function without personality");
26338 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26339 // WinEHStatePass for the full struct definition.
26340 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26341 case EHPersonality::MSVC_X86SEH: return 24;
26342 case EHPersonality::MSVC_CXX: return 16;
26343 default: break;
26344 }
26346 "can only recover FP for 32-bit MSVC EH personality functions");
26347}
26348
26349/// When the MSVC runtime transfers control to us, either to an outlined
26350/// function or when returning to a parent frame after catching an exception, we
26351/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26352/// Here's the math:
26353/// RegNodeBase = EntryEBP - RegNodeSize
26354/// ParentFP = RegNodeBase - ParentFrameOffset
26355/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26356/// subtracting the offset (negative on x86) takes us back to the parent FP.
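/// Purely illustrative numbers: with the 32-bit C++ EH personality
/// (RegNodeSize = 16) and a ParentFrameOffset of -40, an EntryEBP of 0x1000
/// gives RegNodeBase = 0xFF0 and ParentFP = 0xFF0 - (-40) = 0x1018.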
26357 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26358 SDValue EntryEBP) {
26359 MachineFunction &MF = DAG.getMachineFunction();
26360 SDLoc dl;
26361
26362 // It's possible that the parent function no longer has a personality function
26363 // if the exceptional code was optimized away, in which case we just return
26364 // the incoming EBP.
26365 if (!Fn->hasPersonalityFn())
26366 return EntryEBP;
26367
26368 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26369 // registration, or the .set_setframe offset.
26372 MVT PtrVT = EntryEBP.getValueType().getSimpleVT();
26373 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26374 SDValue ParentFrameOffset =
26375 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26376
26377 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26378 // prologue to RBP in the parent function.
26379 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26380 if (Subtarget.is64Bit())
26381 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26382
26383 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26384 // RegNodeBase = EntryEBP - RegNodeSize
26385 // ParentFP = RegNodeBase - ParentFrameOffset
26386 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26387 DAG.getConstant(RegNodeSize, dl, PtrVT));
26388 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26389}
26390
26391SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26392 SelectionDAG &DAG) const {
26393 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26394 auto isRoundModeCurDirection = [](SDValue Rnd) {
26395 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26396 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26397
26398 return false;
26399 };
26400 auto isRoundModeSAE = [](SDValue Rnd) {
26401 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26402 unsigned RC = C->getZExtValue();
26403 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26404 // Clear the NO_EXC bit and check remaining bits.
26405 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26406 // As a convenience we allow no other bits or explicitly
26407 // current direction.
26408 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26409 }
26410 }
26411
26412 return false;
26413 };
26414 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26415 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26416 RC = C->getZExtValue();
26417 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26418 // Clear the NO_EXC bit and check remaining bits.
26419 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26420 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26421 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26422 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26423 RC == X86::STATIC_ROUNDING::TO_ZERO;
26424 }
26425 }
26426
26427 return false;
26428 };
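// These rounding-control values follow the _MM_FROUND encoding:
// TO_NEAREST_INT=0, TO_NEG_INF=1, TO_POS_INF=2, TO_ZERO=3, CUR_DIRECTION=4
// and NO_EXC=8. For example, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC (11)
// satisfies isRoundModeSAEToX with RC = TO_ZERO.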
26429
26430 SDLoc dl(Op);
26431 unsigned IntNo = Op.getConstantOperandVal(0);
26432 MVT VT = Op.getSimpleValueType();
26433 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26434
26435 // Propagate flags from original node to transformed node(s).
26436 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26437
26438 if (IntrData) {
26439 switch(IntrData->Type) {
26440 case INTR_TYPE_1OP: {
26441 // We specify 2 possible opcodes for intrinsics with rounding modes.
26442 // First, we check if the intrinsic may have non-default rounding mode,
26443 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26444 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26445 if (IntrWithRoundingModeOpcode != 0) {
26446 SDValue Rnd = Op.getOperand(2);
26447 unsigned RC = 0;
26448 if (isRoundModeSAEToX(Rnd, RC))
26449 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26450 Op.getOperand(1),
26451 DAG.getTargetConstant(RC, dl, MVT::i32));
26452 if (!isRoundModeCurDirection(Rnd))
26453 return SDValue();
26454 }
26455 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26456 Op.getOperand(1));
26457 }
26458 case INTR_TYPE_1OP_SAE: {
26459 SDValue Sae = Op.getOperand(2);
26460
26461 unsigned Opc;
26462 if (isRoundModeCurDirection(Sae))
26463 Opc = IntrData->Opc0;
26464 else if (isRoundModeSAE(Sae))
26465 Opc = IntrData->Opc1;
26466 else
26467 return SDValue();
26468
26469 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26470 }
26471 case INTR_TYPE_2OP: {
26472 SDValue Src2 = Op.getOperand(2);
26473
26474 // We specify 2 possible opcodes for intrinsics with rounding modes.
26475 // First, we check if the intrinsic may have non-default rounding mode,
26476 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26477 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26478 if (IntrWithRoundingModeOpcode != 0) {
26479 SDValue Rnd = Op.getOperand(3);
26480 unsigned RC = 0;
26481 if (isRoundModeSAEToX(Rnd, RC))
26482 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26483 Op.getOperand(1), Src2,
26484 DAG.getTargetConstant(RC, dl, MVT::i32));
26485 if (!isRoundModeCurDirection(Rnd))
26486 return SDValue();
26487 }
26488
26489 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26490 Op.getOperand(1), Src2);
26491 }
26492 case INTR_TYPE_2OP_SAE: {
26493 SDValue Sae = Op.getOperand(3);
26494
26495 unsigned Opc;
26496 if (isRoundModeCurDirection(Sae))
26497 Opc = IntrData->Opc0;
26498 else if (isRoundModeSAE(Sae))
26499 Opc = IntrData->Opc1;
26500 else
26501 return SDValue();
26502
26503 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
26504 Op.getOperand(2));
26505 }
26506 case INTR_TYPE_3OP:
26507 case INTR_TYPE_3OP_IMM8: {
26508 SDValue Src1 = Op.getOperand(1);
26509 SDValue Src2 = Op.getOperand(2);
26510 SDValue Src3 = Op.getOperand(3);
26511
26512 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
26513 Src3.getValueType() != MVT::i8) {
26514 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
26515 }
26516
26517 // We specify 2 possible opcodes for intrinsics with rounding modes.
26518 // First, we check if the intrinsic may have non-default rounding mode,
26519 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26520 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26521 if (IntrWithRoundingModeOpcode != 0) {
26522 SDValue Rnd = Op.getOperand(4);
26523 unsigned RC = 0;
26524 if (isRoundModeSAEToX(Rnd, RC))
26525 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26526 Src1, Src2, Src3,
26527 DAG.getTargetConstant(RC, dl, MVT::i32));
26528 if (!isRoundModeCurDirection(Rnd))
26529 return SDValue();
26530 }
26531
26532 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26533 {Src1, Src2, Src3});
26534 }
26535 case INTR_TYPE_4OP_IMM8: {
26536 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
26537 SDValue Src4 = Op.getOperand(4);
26538 if (Src4.getValueType() != MVT::i8) {
26539 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
26540 }
26541
26542 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26543 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
26544 Src4);
26545 }
26546 case INTR_TYPE_1OP_MASK: {
26547 SDValue Src = Op.getOperand(1);
26548 SDValue PassThru = Op.getOperand(2);
26549 SDValue Mask = Op.getOperand(3);
26550 // We add rounding mode to the Node when
26551 // - RC Opcode is specified and
26552 // - RC is not "current direction".
26553 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26554 if (IntrWithRoundingModeOpcode != 0) {
26555 SDValue Rnd = Op.getOperand(4);
26556 unsigned RC = 0;
26557 if (isRoundModeSAEToX(Rnd, RC))
26558 return getVectorMaskingNode(
26559 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26560 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
26561 Mask, PassThru, Subtarget, DAG);
26562 if (!isRoundModeCurDirection(Rnd))
26563 return SDValue();
26564 }
26565 return getVectorMaskingNode(
26566 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
26567 Subtarget, DAG);
26568 }
26569 case INTR_TYPE_1OP_MASK_SAE: {
26570 SDValue Src = Op.getOperand(1);
26571 SDValue PassThru = Op.getOperand(2);
26572 SDValue Mask = Op.getOperand(3);
26573 SDValue Rnd = Op.getOperand(4);
26574
26575 unsigned Opc;
26576 if (isRoundModeCurDirection(Rnd))
26577 Opc = IntrData->Opc0;
26578 else if (isRoundModeSAE(Rnd))
26579 Opc = IntrData->Opc1;
26580 else
26581 return SDValue();
26582
26583 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
26584 Subtarget, DAG);
26585 }
26586 case INTR_TYPE_SCALAR_MASK: {
26587 SDValue Src1 = Op.getOperand(1);
26588 SDValue Src2 = Op.getOperand(2);
26589 SDValue passThru = Op.getOperand(3);
26590 SDValue Mask = Op.getOperand(4);
26591 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26592 // There are 2 kinds of intrinsics in this group:
26593 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
26594 // (2) With rounding mode and sae - 7 operands.
26595 bool HasRounding = IntrWithRoundingModeOpcode != 0;
26596 if (Op.getNumOperands() == (5U + HasRounding)) {
26597 if (HasRounding) {
26598 SDValue Rnd = Op.getOperand(5);
26599 unsigned RC = 0;
26600 if (isRoundModeSAEToX(Rnd, RC))
26601 return getScalarMaskingNode(
26602 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
26603 DAG.getTargetConstant(RC, dl, MVT::i32)),
26604 Mask, passThru, Subtarget, DAG);
26605 if (!isRoundModeCurDirection(Rnd))
26606 return SDValue();
26607 }
26608 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
26609 Src2),
26610 Mask, passThru, Subtarget, DAG);
26611 }
26612
26613 assert(Op.getNumOperands() == (6U + HasRounding) &&
26614 "Unexpected intrinsic form");
26615 SDValue RoundingMode = Op.getOperand(5);
26616 unsigned Opc = IntrData->Opc0;
26617 if (HasRounding) {
26618 SDValue Sae = Op.getOperand(6);
26619 if (isRoundModeSAE(Sae))
26620 Opc = IntrWithRoundingModeOpcode;
26621 else if (!isRoundModeCurDirection(Sae))
26622 return SDValue();
26623 }
26624 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
26625 Src2, RoundingMode),
26626 Mask, passThru, Subtarget, DAG);
26627 }
26628 case INTR_TYPE_SCALAR_MASK_RND: {
26629 SDValue Src1 = Op.getOperand(1);
26630 SDValue Src2 = Op.getOperand(2);
26631 SDValue passThru = Op.getOperand(3);
26632 SDValue Mask = Op.getOperand(4);
26633 SDValue Rnd = Op.getOperand(5);
26634
26635 SDValue NewOp;
26636 unsigned RC = 0;
26637 if (isRoundModeCurDirection(Rnd))
26638 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26639 else if (isRoundModeSAEToX(Rnd, RC))
26640 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26641 DAG.getTargetConstant(RC, dl, MVT::i32));
26642 else
26643 return SDValue();
26644
26645 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
26646 }
26647 case INTR_TYPE_SCALAR_MASK_SAE: {
26648 SDValue Src1 = Op.getOperand(1);
26649 SDValue Src2 = Op.getOperand(2);
26650 SDValue passThru = Op.getOperand(3);
26651 SDValue Mask = Op.getOperand(4);
26652 SDValue Sae = Op.getOperand(5);
26653 unsigned Opc;
26654 if (isRoundModeCurDirection(Sae))
26655 Opc = IntrData->Opc0;
26656 else if (isRoundModeSAE(Sae))
26657 Opc = IntrData->Opc1;
26658 else
26659 return SDValue();
26660
26661 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26662 Mask, passThru, Subtarget, DAG);
26663 }
26664 case INTR_TYPE_2OP_MASK: {
26665 SDValue Src1 = Op.getOperand(1);
26666 SDValue Src2 = Op.getOperand(2);
26667 SDValue PassThru = Op.getOperand(3);
26668 SDValue Mask = Op.getOperand(4);
26669 SDValue NewOp;
26670 if (IntrData->Opc1 != 0) {
26671 SDValue Rnd = Op.getOperand(5);
26672 unsigned RC = 0;
26673 if (isRoundModeSAEToX(Rnd, RC))
26674 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
26675 DAG.getTargetConstant(RC, dl, MVT::i32));
26676 else if (!isRoundModeCurDirection(Rnd))
26677 return SDValue();
26678 }
26679 if (!NewOp)
26680 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
26681 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26682 }
26683 case INTR_TYPE_2OP_MASK_SAE: {
26684 SDValue Src1 = Op.getOperand(1);
26685 SDValue Src2 = Op.getOperand(2);
26686 SDValue PassThru = Op.getOperand(3);
26687 SDValue Mask = Op.getOperand(4);
26688
26689 unsigned Opc = IntrData->Opc0;
26690 if (IntrData->Opc1 != 0) {
26691 SDValue Sae = Op.getOperand(5);
26692 if (isRoundModeSAE(Sae))
26693 Opc = IntrData->Opc1;
26694 else if (!isRoundModeCurDirection(Sae))
26695 return SDValue();
26696 }
26697
26698 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
26699 Mask, PassThru, Subtarget, DAG);
26700 }
26701 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
26702 SDValue Src1 = Op.getOperand(1);
26703 SDValue Src2 = Op.getOperand(2);
26704 SDValue Src3 = Op.getOperand(3);
26705 SDValue PassThru = Op.getOperand(4);
26706 SDValue Mask = Op.getOperand(5);
26707 SDValue Sae = Op.getOperand(6);
26708 unsigned Opc;
26709 if (isRoundModeCurDirection(Sae))
26710 Opc = IntrData->Opc0;
26711 else if (isRoundModeSAE(Sae))
26712 Opc = IntrData->Opc1;
26713 else
26714 return SDValue();
26715
26716 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26717 Mask, PassThru, Subtarget, DAG);
26718 }
26719 case INTR_TYPE_3OP_MASK_SAE: {
26720 SDValue Src1 = Op.getOperand(1);
26721 SDValue Src2 = Op.getOperand(2);
26722 SDValue Src3 = Op.getOperand(3);
26723 SDValue PassThru = Op.getOperand(4);
26724 SDValue Mask = Op.getOperand(5);
26725
26726 unsigned Opc = IntrData->Opc0;
26727 if (IntrData->Opc1 != 0) {
26728 SDValue Sae = Op.getOperand(6);
26729 if (isRoundModeSAE(Sae))
26730 Opc = IntrData->Opc1;
26731 else if (!isRoundModeCurDirection(Sae))
26732 return SDValue();
26733 }
26734 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
26735 Mask, PassThru, Subtarget, DAG);
26736 }
26737 case BLENDV: {
26738 SDValue Src1 = Op.getOperand(1);
26739 SDValue Src2 = Op.getOperand(2);
26740 SDValue Src3 = Op.getOperand(3);
26741
26742 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
26743 Src3 = DAG.getBitcast(MaskVT, Src3);
26744
26745 // Reverse the operands to match VSELECT order.
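// The blendv intrinsics take (a, b, mask) and select b where the mask sign
// bit is set, so the node is built as (mask, b, a) = (Src3, Src2, Src1).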
26746 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
26747 }
26748 case VPERM_2OP : {
26749 SDValue Src1 = Op.getOperand(1);
26750 SDValue Src2 = Op.getOperand(2);
26751
26752 // Swap Src1 and Src2 in the node creation
26753 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
26754 }
26755 case CFMA_OP_MASKZ:
26756 case CFMA_OP_MASK: {
26757 SDValue Src1 = Op.getOperand(1);
26758 SDValue Src2 = Op.getOperand(2);
26759 SDValue Src3 = Op.getOperand(3);
26760 SDValue Mask = Op.getOperand(4);
26761 MVT VT = Op.getSimpleValueType();
26762
26763 SDValue PassThru = Src3;
26764 if (IntrData->Type == CFMA_OP_MASKZ)
26765 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26766
26767 // We add rounding mode to the Node when
26768 // - RC Opcode is specified and
26769 // - RC is not "current direction".
26770 SDValue NewOp;
26771 if (IntrData->Opc1 != 0) {
26772 SDValue Rnd = Op.getOperand(5);
26773 unsigned RC = 0;
26774 if (isRoundModeSAEToX(Rnd, RC))
26775 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
26776 DAG.getTargetConstant(RC, dl, MVT::i32));
26777 else if (!isRoundModeCurDirection(Rnd))
26778 return SDValue();
26779 }
26780 if (!NewOp)
26781 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
26782 if (IntrData->Opc0 == X86ISD::VFMADDCSH ||
26783 IntrData->Opc0 == X86ISD::VFCMADDCSH)
26784 return getScalarMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26785 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
26786 }
26787 case IFMA_OP:
26788 // NOTE: We need to swizzle the operands to pass the multiply operands
26789 // first.
26790 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26791 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
26792 case FPCLASSS: {
26793 SDValue Src1 = Op.getOperand(1);
26794 SDValue Imm = Op.getOperand(2);
26795 SDValue Mask = Op.getOperand(3);
26796 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
26797 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
26798 Subtarget, DAG);
26799 // Need to fill with zeros to ensure the bitcast will produce zeroes
26800 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26801 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26802 DAG.getConstant(0, dl, MVT::v8i1), FPclassMask,
26803 DAG.getVectorIdxConstant(0, dl));
26804 return DAG.getBitcast(MVT::i8, Ins);
26805 }
26806
26807 case CMP_MASK_CC: {
26808 MVT MaskVT = Op.getSimpleValueType();
26809 SDValue CC = Op.getOperand(3);
26810 SDValue Mask = Op.getOperand(4);
26811 // We specify 2 possible opcodes for intrinsics with rounding modes.
26812 // First, we check if the intrinsic may have non-default rounding mode,
26813 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26814 if (IntrData->Opc1 != 0) {
26815 SDValue Sae = Op.getOperand(5);
26816 if (isRoundModeSAE(Sae))
26817 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
26818 Op.getOperand(2), CC, Mask, Sae);
26819 if (!isRoundModeCurDirection(Sae))
26820 return SDValue();
26821 }
26822 //default rounding mode
26823 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
26824 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
26825 }
26826 case CMP_MASK_SCALAR_CC: {
26827 SDValue Src1 = Op.getOperand(1);
26828 SDValue Src2 = Op.getOperand(2);
26829 SDValue CC = Op.getOperand(3);
26830 SDValue Mask = Op.getOperand(4);
26831
26832 SDValue Cmp;
26833 if (IntrData->Opc1 != 0) {
26834 SDValue Sae = Op.getOperand(5);
26835 if (isRoundModeSAE(Sae))
26836 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
26837 else if (!isRoundModeCurDirection(Sae))
26838 return SDValue();
26839 }
26840 //default rounding mode
26841 if (!Cmp.getNode())
26842 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
26843
26844 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
26845 Subtarget, DAG);
26846 // Need to fill with zeros to ensure the bitcast will produce zeroes
26847 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26848 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
26849 DAG.getConstant(0, dl, MVT::v8i1), CmpMask,
26850 DAG.getVectorIdxConstant(0, dl));
26851 return DAG.getBitcast(MVT::i8, Ins);
26852 }
26853 case COMI: { // Comparison intrinsics
26854 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
26855 SDValue LHS = Op.getOperand(1);
26856 SDValue RHS = Op.getOperand(2);
26857 // Some conditions require the operands to be swapped.
26858 if (CC == ISD::SETLT || CC == ISD::SETLE)
26859 std::swap(LHS, RHS);
26860
26861 // For AVX10.2, support EQ and NE.
26862 bool HasAVX10_2_COMX =
26863 Subtarget.hasAVX10_2() && (CC == ISD::SETEQ || CC == ISD::SETNE);
26864
26865 // AVX10.2 COMPARE supports only v2f64, v4f32 or v8f16.
26866 // For BF type we need to fall back.
26867 bool HasAVX10_2_COMX_Ty = (LHS.getSimpleValueType() != MVT::v8bf16);
26868
26869 auto ComiOpCode = IntrData->Opc0;
26870 auto isUnordered = (ComiOpCode == X86ISD::UCOMI);
26871
26872 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty)
26873 ComiOpCode = isUnordered ? X86ISD::UCOMX : X86ISD::COMX;
26874
26875 SDValue Comi = DAG.getNode(ComiOpCode, dl, MVT::i32, LHS, RHS);
26876
26877 SDValue SetCC;
26878 switch (CC) {
26879 case ISD::SETEQ: {
26880 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
26881 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 1
26882 break;
26883 // (ZF = 1 and PF = 0)
26884 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
26885 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
26886 break;
26887 }
26888 case ISD::SETNE: {
26889 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
26890 if (HasAVX10_2_COMX && HasAVX10_2_COMX_Ty) // ZF == 0
26891 break;
26892 // (ZF = 0 or PF = 1)
26893 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
26894 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
26895 break;
26896 }
26897 case ISD::SETGT: // (CF = 0 and ZF = 0)
26898 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
26899 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
26900 break;
26901 }
26902 case ISD::SETGE: // CF = 0
26903 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
26904 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
26905 break;
26906 default:
26907 llvm_unreachable("Unexpected illegal condition!");
26908 }
26909 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26910 }
26911 case COMI_RM: { // Comparison intrinsics with Sae
26912 SDValue LHS = Op.getOperand(1);
26913 SDValue RHS = Op.getOperand(2);
26914 unsigned CondVal = Op.getConstantOperandVal(3);
26915 SDValue Sae = Op.getOperand(4);
26916
26917 SDValue FCmp;
26918 if (isRoundModeCurDirection(Sae))
26919 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
26920 DAG.getTargetConstant(CondVal, dl, MVT::i8));
26921 else if (isRoundModeSAE(Sae))
26922 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
26923 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
26924 else
26925 return SDValue();
26926 // Need to fill with zeros to ensure the bitcast will produce zeroes
26927 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
26928 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26929 DAG.getConstant(0, dl, MVT::v16i1), FCmp,
26930 DAG.getVectorIdxConstant(0, dl));
26931 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
26932 DAG.getBitcast(MVT::i16, Ins));
26933 }
26934 case VSHIFT: {
26935 SDValue SrcOp = Op.getOperand(1);
26936 SDValue ShAmt = Op.getOperand(2);
26937 assert(ShAmt.getValueType() == MVT::i32 &&
26938 "Unexpected VSHIFT amount type");
26939
26940 // Catch shift-by-constant.
26941 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
26942 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
26943 Op.getSimpleValueType(), SrcOp,
26944 CShAmt->getZExtValue(), DAG);
26945
26946 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26947 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
26948 SrcOp, ShAmt, 0, Subtarget, DAG);
26949 }
26950 case COMPRESS_EXPAND_IN_REG: {
26951 SDValue Mask = Op.getOperand(3);
26952 SDValue DataToCompress = Op.getOperand(1);
26953 SDValue PassThru = Op.getOperand(2);
26954 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
26955 return Op.getOperand(1);
26956
26957 // Avoid false dependency.
26958 if (PassThru.isUndef())
26959 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
26960
26961 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
26962 Mask);
26963 }
26964 case FIXUPIMM:
26965 case FIXUPIMM_MASKZ: {
26966 SDValue Src1 = Op.getOperand(1);
26967 SDValue Src2 = Op.getOperand(2);
26968 SDValue Src3 = Op.getOperand(3);
26969 SDValue Imm = Op.getOperand(4);
26970 SDValue Mask = Op.getOperand(5);
26971 SDValue Passthru = (IntrData->Type == FIXUPIMM)
26972 ? Src1
26973 : getZeroVector(VT, Subtarget, DAG, dl);
26974
26975 unsigned Opc = IntrData->Opc0;
26976 if (IntrData->Opc1 != 0) {
26977 SDValue Sae = Op.getOperand(6);
26978 if (isRoundModeSAE(Sae))
26979 Opc = IntrData->Opc1;
26980 else if (!isRoundModeCurDirection(Sae))
26981 return SDValue();
26982 }
26983
26984 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26985
26986 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26987 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26988
26989 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26990 }
26991 case ROUNDP: {
26992 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26993 // Clear the upper bits of the rounding immediate so that the legacy
26994 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26995 uint64_t Round = Op.getConstantOperandVal(2);
26996 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26997 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26998 Op.getOperand(1), RoundingMode);
26999 }
27000 case ROUNDS: {
27001 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
27002 // Clear the upper bits of the rounding immediate so that the legacy
27003 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
27004 uint64_t Round = Op.getConstantOperandVal(3);
27005 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
27006 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27007 Op.getOperand(1), Op.getOperand(2), RoundingMode);
27008 }
27009 case BEXTRI: {
27010 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27011
27012 uint64_t Imm = Op.getConstantOperandVal(2);
27013 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27014 Op.getValueType());
27015 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27016 Op.getOperand(1), Control);
27017 }
27018 // ADC/SBB
27019 case ADX: {
27020 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27021 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27022
27023 SDValue Res;
27024 // If the carry in is zero, then we should just use ADD/SUB instead of
27025 // ADC/SBB.
27026 if (isNullConstant(Op.getOperand(1))) {
27027 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27028 Op.getOperand(3));
27029 } else {
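// Recreate CF from the non-zero carry-in byte: adding 0xFF to any value
// in [1, 255] overflows an i8 and sets the carry flag, which then feeds
// the ADC/SBB below.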
27030 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27031 DAG.getAllOnesConstant(dl, MVT::i8));
27032 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27033 Op.getOperand(3), GenCF.getValue(1));
27034 }
27035 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27036 SDValue Results[] = { SetCC, Res };
27037 return DAG.getMergeValues(Results, dl);
27038 }
27039 case CVTPD2PS_MASK:
27040 case CVTPD2DQ_MASK:
27041 case CVTQQ2PS_MASK:
27042 case TRUNCATE_TO_REG: {
27043 SDValue Src = Op.getOperand(1);
27044 SDValue PassThru = Op.getOperand(2);
27045 SDValue Mask = Op.getOperand(3);
27046
27047 if (isAllOnesConstant(Mask))
27048 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27049
27050 MVT SrcVT = Src.getSimpleValueType();
27051 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27052 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27053 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27054 {Src, PassThru, Mask});
27055 }
27056 case TRUNCATE2_TO_REG: {
27057 SDValue Src = Op.getOperand(1);
27058 SDValue Src2 = Op.getOperand(2);
27059 SDValue PassThru = Op.getOperand(3);
27060 SDValue Mask = Op.getOperand(4);
27061
27062 if (isAllOnesConstant(Mask))
27063 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), {Src, Src2});
27064
27065 MVT Src2VT = Src2.getSimpleValueType();
27066 MVT MaskVT = MVT::getVectorVT(MVT::i1, Src2VT.getVectorNumElements());
27067 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27068 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27069 {Src, Src2, PassThru, Mask});
27070 }
27071 case CVTPS2PH_MASK: {
27072 SDValue Src = Op.getOperand(1);
27073 SDValue Rnd = Op.getOperand(2);
27074 SDValue PassThru = Op.getOperand(3);
27075 SDValue Mask = Op.getOperand(4);
27076
27077 unsigned RC = 0;
27078 unsigned Opc = IntrData->Opc0;
27079 bool SAE = Src.getValueType().is512BitVector() &&
27080 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27081 if (SAE) {
27082 Opc = X86ISD::CVTPS2PH_SAE;
27083 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27084 }
27085
27086 if (isAllOnesConstant(Mask))
27087 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27088
27089 if (SAE)
27090 Opc = X86ISD::MCVTPS2PH_SAE;
27091 else
27092 Opc = IntrData->Opc1;
27093 MVT SrcVT = Src.getSimpleValueType();
27094 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27095 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27096 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27097 }
27098 case CVTNEPS2BF16_MASK: {
27099 SDValue Src = Op.getOperand(1);
27100 SDValue PassThru = Op.getOperand(2);
27101 SDValue Mask = Op.getOperand(3);
27102
27103 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27104 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27105
27106 // Break false dependency.
27107 if (PassThru.isUndef())
27108 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27109
27110 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27111 Mask);
27112 }
27113 default:
27114 break;
27115 }
27116 }
27117
27118 switch (IntNo) {
27119 default: return SDValue(); // Don't custom lower most intrinsics.
27120
27121 // ptest and testp intrinsics. The intrinsic these come from are designed to
27122 // return an integer value, not just an instruction so lower it to the ptest
27123 // or testp pattern and a setcc for the result.
27124 case Intrinsic::x86_avx512_ktestc_b:
27125 case Intrinsic::x86_avx512_ktestc_w:
27126 case Intrinsic::x86_avx512_ktestc_d:
27127 case Intrinsic::x86_avx512_ktestc_q:
27128 case Intrinsic::x86_avx512_ktestz_b:
27129 case Intrinsic::x86_avx512_ktestz_w:
27130 case Intrinsic::x86_avx512_ktestz_d:
27131 case Intrinsic::x86_avx512_ktestz_q:
27132 case Intrinsic::x86_sse41_ptestz:
27133 case Intrinsic::x86_sse41_ptestc:
27134 case Intrinsic::x86_sse41_ptestnzc:
27135 case Intrinsic::x86_avx_ptestz_256:
27136 case Intrinsic::x86_avx_ptestc_256:
27137 case Intrinsic::x86_avx_ptestnzc_256:
27138 case Intrinsic::x86_avx_vtestz_ps:
27139 case Intrinsic::x86_avx_vtestc_ps:
27140 case Intrinsic::x86_avx_vtestnzc_ps:
27141 case Intrinsic::x86_avx_vtestz_pd:
27142 case Intrinsic::x86_avx_vtestc_pd:
27143 case Intrinsic::x86_avx_vtestnzc_pd:
27144 case Intrinsic::x86_avx_vtestz_ps_256:
27145 case Intrinsic::x86_avx_vtestc_ps_256:
27146 case Intrinsic::x86_avx_vtestnzc_ps_256:
27147 case Intrinsic::x86_avx_vtestz_pd_256:
27148 case Intrinsic::x86_avx_vtestc_pd_256:
27149 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27150 unsigned TestOpc = X86ISD::PTEST;
27151 X86::CondCode X86CC;
27152 switch (IntNo) {
27153 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27154 case Intrinsic::x86_avx512_ktestc_b:
27155 case Intrinsic::x86_avx512_ktestc_w:
27156 case Intrinsic::x86_avx512_ktestc_d:
27157 case Intrinsic::x86_avx512_ktestc_q:
27158 // CF = 1
27159 TestOpc = X86ISD::KTEST;
27160 X86CC = X86::COND_B;
27161 break;
27162 case Intrinsic::x86_avx512_ktestz_b:
27163 case Intrinsic::x86_avx512_ktestz_w:
27164 case Intrinsic::x86_avx512_ktestz_d:
27165 case Intrinsic::x86_avx512_ktestz_q:
27166 TestOpc = X86ISD::KTEST;
27167 X86CC = X86::COND_E;
27168 break;
27169 case Intrinsic::x86_avx_vtestz_ps:
27170 case Intrinsic::x86_avx_vtestz_pd:
27171 case Intrinsic::x86_avx_vtestz_ps_256:
27172 case Intrinsic::x86_avx_vtestz_pd_256:
27173 TestOpc = X86ISD::TESTP;
27174 [[fallthrough]];
27175 case Intrinsic::x86_sse41_ptestz:
27176 case Intrinsic::x86_avx_ptestz_256:
27177 // ZF = 1
27178 X86CC = X86::COND_E;
27179 break;
27180 case Intrinsic::x86_avx_vtestc_ps:
27181 case Intrinsic::x86_avx_vtestc_pd:
27182 case Intrinsic::x86_avx_vtestc_ps_256:
27183 case Intrinsic::x86_avx_vtestc_pd_256:
27184 TestOpc = X86ISD::TESTP;
27185 [[fallthrough]];
27186 case Intrinsic::x86_sse41_ptestc:
27187 case Intrinsic::x86_avx_ptestc_256:
27188 // CF = 1
27189 X86CC = X86::COND_B;
27190 break;
27191 case Intrinsic::x86_avx_vtestnzc_ps:
27192 case Intrinsic::x86_avx_vtestnzc_pd:
27193 case Intrinsic::x86_avx_vtestnzc_ps_256:
27194 case Intrinsic::x86_avx_vtestnzc_pd_256:
27195 TestOpc = X86ISD::TESTP;
27196 [[fallthrough]];
27197 case Intrinsic::x86_sse41_ptestnzc:
27198 case Intrinsic::x86_avx_ptestnzc_256:
27199 // ZF and CF = 0
27200 X86CC = X86::COND_A;
27201 break;
27202 }
27203
27204 SDValue LHS = Op.getOperand(1);
27205 SDValue RHS = Op.getOperand(2);
27206 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27207 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27208 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27209 }
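// Illustrative sketch (not part of the lowering itself) of the DAG built by the
// case above for a zero-flag test such as _mm_testz_si128(a, b), i.e.
// Intrinsic::x86_sse41_ptestz:
//
//   t0: i32 = X86ISD::PTEST a, b        // only the EFLAGS result matters
//   t1: i8  = X86ISD::SETCC COND_E, t0  // 1 iff ZF is set
//   result  = zero_extend t1 to i32
//
// The ktest*/vtest* intrinsics produce the same shape with X86ISD::KTEST or
// X86ISD::TESTP and the condition code picked in the inner switch.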
27210
27211 case Intrinsic::x86_sse42_pcmpistria128:
27212 case Intrinsic::x86_sse42_pcmpestria128:
27213 case Intrinsic::x86_sse42_pcmpistric128:
27214 case Intrinsic::x86_sse42_pcmpestric128:
27215 case Intrinsic::x86_sse42_pcmpistrio128:
27216 case Intrinsic::x86_sse42_pcmpestrio128:
27217 case Intrinsic::x86_sse42_pcmpistris128:
27218 case Intrinsic::x86_sse42_pcmpestris128:
27219 case Intrinsic::x86_sse42_pcmpistriz128:
27220 case Intrinsic::x86_sse42_pcmpestriz128: {
27221 unsigned Opcode;
27222 X86::CondCode X86CC;
27223 switch (IntNo) {
27224 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27225 case Intrinsic::x86_sse42_pcmpistria128:
27226 Opcode = X86ISD::PCMPISTR;
27227 X86CC = X86::COND_A;
27228 break;
27229 case Intrinsic::x86_sse42_pcmpestria128:
27230 Opcode = X86ISD::PCMPESTR;
27231 X86CC = X86::COND_A;
27232 break;
27233 case Intrinsic::x86_sse42_pcmpistric128:
27234 Opcode = X86ISD::PCMPISTR;
27235 X86CC = X86::COND_B;
27236 break;
27237 case Intrinsic::x86_sse42_pcmpestric128:
27238 Opcode = X86ISD::PCMPESTR;
27239 X86CC = X86::COND_B;
27240 break;
27241 case Intrinsic::x86_sse42_pcmpistrio128:
27242 Opcode = X86ISD::PCMPISTR;
27243 X86CC = X86::COND_O;
27244 break;
27245 case Intrinsic::x86_sse42_pcmpestrio128:
27246 Opcode = X86ISD::PCMPESTR;
27247 X86CC = X86::COND_O;
27248 break;
27249 case Intrinsic::x86_sse42_pcmpistris128:
27250 Opcode = X86ISD::PCMPISTR;
27251 X86CC = X86::COND_S;
27252 break;
27253 case Intrinsic::x86_sse42_pcmpestris128:
27254 Opcode = X86ISD::PCMPESTR;
27255 X86CC = X86::COND_S;
27256 break;
27257 case Intrinsic::x86_sse42_pcmpistriz128:
27258 Opcode = X86ISD::PCMPISTR;
27259 X86CC = X86::COND_E;
27260 break;
27261 case Intrinsic::x86_sse42_pcmpestriz128:
27262 Opcode = X86ISD::PCMPESTR;
27263 X86CC = X86::COND_E;
27264 break;
27265 }
27266 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27267 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27268 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27269 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27270 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27271 }
27272
27273 case Intrinsic::x86_sse42_pcmpistri128:
27274 case Intrinsic::x86_sse42_pcmpestri128: {
27275 unsigned Opcode;
27276 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27277 Opcode = X86ISD::PCMPISTR;
27278 else
27279 Opcode = X86ISD::PCMPESTR;
27280
27281 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27282 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27283 return DAG.getNode(Opcode, dl, VTs, NewOps);
27284 }
27285
27286 case Intrinsic::x86_sse42_pcmpistrm128:
27287 case Intrinsic::x86_sse42_pcmpestrm128: {
27288 unsigned Opcode;
27289 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27290 Opcode = X86ISD::PCMPISTR;
27291 else
27292 Opcode = X86ISD::PCMPESTR;
27293
27294 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27295 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27296 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27297 }
27298
27299 case Intrinsic::eh_sjlj_lsda: {
27300 MachineFunction &MF = DAG.getMachineFunction();
27301 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27302 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27303 auto &Context = MF.getContext();
27304 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27305 Twine(MF.getFunctionNumber()));
27306 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
27307 DAG.getMCSymbol(S, PtrVT));
27308 }
27309
27310 case Intrinsic::x86_seh_lsda: {
27311 // Compute the symbol for the LSDA. We know it'll get emitted later.
27312 MachineFunction &MF = DAG.getMachineFunction();
27313 SDValue Op1 = Op.getOperand(1);
27314 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27315 MCSymbol *LSDASym = MF.getContext().getOrCreateLSDASymbol(
27316 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27317
27318 // Generate a simple absolute symbol reference. This intrinsic is only
27319 // supported on 32-bit Windows, which isn't PIC.
27320 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27321 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27322 }
27323
27324 case Intrinsic::eh_recoverfp: {
27325 SDValue FnOp = Op.getOperand(1);
27326 SDValue IncomingFPOp = Op.getOperand(2);
27327 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27328 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27329 if (!Fn)
27330 report_fatal_error(
27331 "llvm.eh.recoverfp must take a function as the first argument");
27332 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27333 }
27334
27335 case Intrinsic::localaddress: {
27336 // Returns one of the stack, base, or frame pointer registers, depending on
27337 // which is used to reference local variables.
27338 MachineFunction &MF = DAG.getMachineFunction();
27339 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27340 Register Reg;
27341 if (RegInfo->hasBasePointer(MF))
27342 Reg = RegInfo->getBaseRegister();
27343 else { // Handles the SP or FP case.
27344 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27345 if (CantUseFP)
27346 Reg = RegInfo->getPtrSizedStackRegister(MF);
27347 else
27348 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27349 }
27350 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27351 }
27352 case Intrinsic::x86_avx512_vp2intersect_q_512:
27353 case Intrinsic::x86_avx512_vp2intersect_q_256:
27354 case Intrinsic::x86_avx512_vp2intersect_q_128:
27355 case Intrinsic::x86_avx512_vp2intersect_d_512:
27356 case Intrinsic::x86_avx512_vp2intersect_d_256:
27357 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27358 SDLoc DL(Op);
27359 MVT MaskVT = Op.getSimpleValueType();
27360 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27361 SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27362 Op.getOperand(1), Op.getOperand(2));
27363 SDValue Result0 =
27364 DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation);
27365 SDValue Result1 =
27366 DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation);
27367 return DAG.getMergeValues({Result0, Result1}, DL);
27368 }
27369 case Intrinsic::x86_mmx_pslli_w:
27370 case Intrinsic::x86_mmx_pslli_d:
27371 case Intrinsic::x86_mmx_pslli_q:
27372 case Intrinsic::x86_mmx_psrli_w:
27373 case Intrinsic::x86_mmx_psrli_d:
27374 case Intrinsic::x86_mmx_psrli_q:
27375 case Intrinsic::x86_mmx_psrai_w:
27376 case Intrinsic::x86_mmx_psrai_d: {
27377 SDLoc DL(Op);
27378 SDValue ShAmt = Op.getOperand(2);
27379 // If the argument is a constant, convert it to a target constant.
27380 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27381 // Clamp out-of-bounds shift amounts, since they would otherwise be masked
27382 // to 8 bits, which may make them no longer out of bounds.
27383 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27384 if (ShiftAmount == 0)
27385 return Op.getOperand(1);
27386
27387 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27388 Op.getOperand(0), Op.getOperand(1),
27389 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27390 }
27391
27392 unsigned NewIntrinsic;
27393 switch (IntNo) {
27394 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27395 case Intrinsic::x86_mmx_pslli_w:
27396 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27397 break;
27398 case Intrinsic::x86_mmx_pslli_d:
27399 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27400 break;
27401 case Intrinsic::x86_mmx_pslli_q:
27402 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27403 break;
27404 case Intrinsic::x86_mmx_psrli_w:
27405 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27406 break;
27407 case Intrinsic::x86_mmx_psrli_d:
27408 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27409 break;
27410 case Intrinsic::x86_mmx_psrli_q:
27411 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27412 break;
27413 case Intrinsic::x86_mmx_psrai_w:
27414 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27415 break;
27416 case Intrinsic::x86_mmx_psrai_d:
27417 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27418 break;
27419 }
27420
27421 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
27422 // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27423 // MMX register.
27424 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27425 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27426 DAG.getTargetConstant(NewIntrinsic, DL,
27428 Op.getOperand(1), ShAmt);
27429 }
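// Illustrative example (assumed IR, not taken from a test) of the rewrite done
// by the case above when the shift amount is not a constant:
//
//   %r = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %v, i32 %amt)
//
// becomes, in DAG form,
//
//   %amt.mmx = X86ISD::MMX_MOVW2D %amt            // 32-bit amount -> MMX reg
//   %r       = INTRINSIC_WO_CHAIN llvm.x86.mmx.psll.w, %v, %amt.mmx
//
// while a constant amount of 0 simply returns the first operand unchanged and
// any other constant is re-emitted on the same intrinsic as a target constant.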
27430 case Intrinsic::thread_pointer: {
27431 if (Subtarget.isTargetELF()) {
27432 SDLoc dl(Op);
27433 EVT PtrVT = Op.getValueType();
27434 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27435 Value *Ptr = Constant::getNullValue(PointerType::get(
27436 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27437 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27438 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27439 }
27440 llvm_unreachable(
27441 "Target OS doesn't support __builtin_thread_pointer() yet.");
27442 }
27443 }
27444}
27445
27446 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27447 SDValue Src, SDValue Mask, SDValue Base,
27448 SDValue Index, SDValue ScaleOp, SDValue Chain,
27449 const X86Subtarget &Subtarget) {
27450 SDLoc dl(Op);
27451 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27452 // Scale must be constant.
27453 if (!C)
27454 return SDValue();
27455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27456 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27457 TLI.getPointerTy(DAG.getDataLayout()));
27458 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27459 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27460 // If source is undef or we know it won't be used, use a zero vector
27461 // to break register dependency.
27462 // TODO: use undef instead and let BreakFalseDeps deal with it?
27463 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27464 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27465
27466 // Cast mask to an integer type.
27467 Mask = DAG.getBitcast(MaskVT, Mask);
27468
27469 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27470
27471 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27472 SDValue Res =
27473 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27474 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27475 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27476}
27477
27478 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27479 SDValue Src, SDValue Mask, SDValue Base,
27480 SDValue Index, SDValue ScaleOp, SDValue Chain,
27481 const X86Subtarget &Subtarget) {
27482 MVT VT = Op.getSimpleValueType();
27483 SDLoc dl(Op);
27484 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27485 // Scale must be constant.
27486 if (!C)
27487 return SDValue();
27488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27489 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27490 TLI.getPointerTy(DAG.getDataLayout()));
27491 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27492 Src.getSimpleValueType().getVectorNumElements());
27493 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27494
27495 // We support two versions of the gather intrinsics. One with scalar mask and
27496 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27497 if (Mask.getValueType() != MaskVT)
27498 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27499
27500 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27501 // If source is undef or we know it won't be used, use a zero vector
27502 // to break register dependency.
27503 // TODO: use undef instead and let BreakFalseDeps deal with it?
27504 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27505 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27506
27507 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27508
27509 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27510 SDValue Res =
27511 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27512 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27513 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27514}
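// Note (illustrative): the scalar-mask flavour of these gather intrinsics
// carries its mask as an integer bitmask, e.g. an i8 for an 8-element gather;
// getMaskNode reinterprets those bits as a v8i1 vector so that both flavours
// reach the memory-intrinsic gather node above in the same vXi1 form.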
27515
27516 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27517 SDValue Src, SDValue Mask, SDValue Base,
27518 SDValue Index, SDValue ScaleOp, SDValue Chain,
27519 const X86Subtarget &Subtarget) {
27520 SDLoc dl(Op);
27521 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27522 // Scale must be constant.
27523 if (!C)
27524 return SDValue();
27525 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27526 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27527 TLI.getPointerTy(DAG.getDataLayout()));
27528 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27529 Src.getSimpleValueType().getVectorNumElements());
27530 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27531
27532 // We support two versions of the scatter intrinsics. One with scalar mask and
27533 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27534 if (Mask.getValueType() != MaskVT)
27535 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27536
27537 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27538
27539 SDVTList VTs = DAG.getVTList(MVT::Other);
27540 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
27541 SDValue Res =
27542 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
27543 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27544 return Res;
27545}
27546
27547 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27548 SDValue Mask, SDValue Base, SDValue Index,
27549 SDValue ScaleOp, SDValue Chain,
27550 const X86Subtarget &Subtarget) {
27551 SDLoc dl(Op);
27552 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27553 // Scale must be constant.
27554 if (!C)
27555 return SDValue();
27556 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27557 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27558 TLI.getPointerTy(DAG.getDataLayout()));
27559 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
27560 SDValue Segment = DAG.getRegister(0, MVT::i32);
27561 MVT MaskVT =
27562 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
27563 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27564 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
27565 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
27566 return SDValue(Res, 0);
27567}
27568
27569/// Handles the lowering of builtin intrinsics with chain that return their
27570/// value into registers EDX:EAX.
27571 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
27572 /// copied to SrcReg. The assumption is that SrcReg is an implicit input to
27573 /// TargetOpcode.
27574 /// Returns a Glue value which can be used to add an extra copy-from-reg if the
27575 /// expanded intrinsic implicitly defines extra registers (i.e. not just
27576/// EDX:EAX).
27577 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
27578 SelectionDAG &DAG,
27579 unsigned TargetOpcode,
27580 unsigned SrcReg,
27581 const X86Subtarget &Subtarget,
27582 SmallVectorImpl<SDValue> &Results) {
27583 SDValue Chain = N->getOperand(0);
27584 SDValue Glue;
27585
27586 if (SrcReg) {
27587 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
27588 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
27589 Glue = Chain.getValue(1);
27590 }
27591
27592 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
27593 SDValue N1Ops[] = {Chain, Glue};
27594 SDNode *N1 = DAG.getMachineNode(
27595 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
27596 Chain = SDValue(N1, 0);
27597
27598 // Read the result of the expanded instruction (e.g. an XCR for XGETBV) out of EDX:EAX.
27599 SDValue LO, HI;
27600 if (Subtarget.is64Bit()) {
27601 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
27602 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
27603 LO.getValue(2));
27604 } else {
27605 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
27606 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
27607 LO.getValue(2));
27608 }
27609 Chain = HI.getValue(1);
27610 Glue = HI.getValue(2);
27611
27612 if (Subtarget.is64Bit()) {
27613 // Merge the two 32-bit values into a 64-bit one.
27614 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
27615 DAG.getConstant(32, DL, MVT::i8));
27616 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
27617 Results.push_back(Chain);
27618 return Glue;
27619 }
27620
27621 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
27622 SDValue Ops[] = { LO, HI };
27623 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
27624 Results.push_back(Pair);
27625 Results.push_back(Chain);
27626 return Glue;
27627}
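// On 64-bit targets the helper above reassembles the value returned in EDX:EAX
// with a shift and an or; a self-contained sketch of that arithmetic in plain
// C++ (names are illustrative, not from this file):
//
//   uint64_t mergeEdxEax(uint32_t LoEAX, uint32_t HiEDX) {
//     return uint64_t(LoEAX) | (uint64_t(HiEDX) << 32);
//   }
//
// On 32-bit targets the same pairing is expressed with an ISD::BUILD_PAIR node
// instead, since a single 64-bit GPR is not available there.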
27628
27629/// Handles the lowering of builtin intrinsics that read the time stamp counter
27630/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
27631/// READCYCLECOUNTER nodes.
27632static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
27633 SelectionDAG &DAG,
27634 const X86Subtarget &Subtarget,
27635 SmallVectorImpl<SDValue> &Results) {
27636 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
27637 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
27638 // and the EAX register is loaded with the low-order 32 bits.
27639 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
27640 /* NoRegister */0, Subtarget,
27641 Results);
27642 if (Opcode != X86::RDTSCP)
27643 return;
27644
27645 SDValue Chain = Results[1];
27646 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
27647 // the ECX register. Add 'ecx' explicitly to the chain.
27648 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
27649 Results[1] = ecx;
27650 Results.push_back(ecx.getValue(1));
27651}
27652
27653 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
27654 SelectionDAG &DAG) {
27655 SmallVector<SDValue, 3> Results;
27656 SDLoc DL(Op);
27657 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
27658 Results);
27659 return DAG.getMergeValues(Results, DL);
27660}
27661
27662 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
27663 MachineFunction &MF = DAG.getMachineFunction();
27664 SDValue Chain = Op.getOperand(0);
27665 SDValue RegNode = Op.getOperand(2);
27666 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27667 if (!EHInfo)
27668 report_fatal_error("EH registrations only live in functions using WinEH");
27669
27670 // Cast the operand to an alloca, and remember the frame index.
27671 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
27672 if (!FINode)
27673 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
27674 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
27675
27676 // Return the chain operand without making any DAG nodes.
27677 return Chain;
27678}
27679
27680 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
27681 MachineFunction &MF = DAG.getMachineFunction();
27682 SDValue Chain = Op.getOperand(0);
27683 SDValue EHGuard = Op.getOperand(2);
27684 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
27685 if (!EHInfo)
27686 report_fatal_error("EHGuard only live in functions using WinEH");
27687
27688 // Cast the operand to an alloca, and remember the frame index.
27689 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
27690 if (!FINode)
27691 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
27692 EHInfo->EHGuardFrameIndex = FINode->getIndex();
27693
27694 // Return the chain operand without making any DAG nodes.
27695 return Chain;
27696}
27697
27698/// Emit Truncating Store with signed or unsigned saturation.
27699static SDValue
27700EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
27701 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
27702 SelectionDAG &DAG) {
27703 SDVTList VTs = DAG.getVTList(MVT::Other);
27704 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
27705 SDValue Ops[] = { Chain, Val, Ptr, Undef };
27706 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
27707 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27708}
27709
27710/// Emit Masked Truncating Store with signed or unsigned saturation.
27711static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
27712 const SDLoc &DL,
27713 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
27714 MachineMemOperand *MMO, SelectionDAG &DAG) {
27715 SDVTList VTs = DAG.getVTList(MVT::Other);
27716 SDValue Ops[] = { Chain, Val, Ptr, Mask };
27717 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
27718 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
27719}
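// Illustrative note on how the two emitters above are used by the
// TRUNCATE_TO_MEM_* case in LowerINTRINSIC_W_CHAIN below: for the saturating
// truncations (X86ISD::VTRUNCS/VTRUNCUS) an all-ones mask selects
// EmitTruncSStore, i.e. a plain VTRUNCSTORES/VTRUNCSTOREUS node, while a
// partial mask is first widened to vXi1 by getMaskNode and then dispatched
// through EmitMaskedTruncSStore.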
27720
27721 bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
27722 const MachineFunction &MF) {
27723 if (!Subtarget.is64Bit())
27724 return false;
27725 // 64-bit targets support extended Swift async frame setup,
27726 // except for targets that use the Windows 64 prologue.
27727 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
27728}
27729
27730 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
27731 SelectionDAG &DAG) {
27732 unsigned IntNo = Op.getConstantOperandVal(1);
27733 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
27734 if (!IntrData) {
27735 switch (IntNo) {
27736
27737 case Intrinsic::swift_async_context_addr: {
27738 SDLoc dl(Op);
27739 auto &MF = DAG.getMachineFunction();
27740 auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
27741 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
27742 MF.getFrameInfo().setFrameAddressIsTaken(true);
27743 X86FI->setHasSwiftAsyncContext(true);
27744 SDValue Chain = Op->getOperand(0);
27745 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
27746 SDValue Result =
27747 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
27748 DAG.getTargetConstant(8, dl, MVT::i32)),
27749 0);
27750 // Return { result, chain }.
27751 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27752 CopyRBP.getValue(1));
27753 } else {
27754 // No special extended frame, create or reuse an existing stack slot.
27755 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
27756 if (!X86FI->getSwiftAsyncContextFrameIdx())
27757 X86FI->setSwiftAsyncContextFrameIdx(
27758 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
27759 false));
27760 SDValue Result =
27761 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
27762 PtrSize == 8 ? MVT::i64 : MVT::i32);
27763 // Return { result, chain }.
27764 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
27765 Op->getOperand(0));
27766 }
27767 }
27768
27769 case llvm::Intrinsic::x86_seh_ehregnode:
27770 return MarkEHRegistrationNode(Op, DAG);
27771 case llvm::Intrinsic::x86_seh_ehguard:
27772 return MarkEHGuard(Op, DAG);
27773 case llvm::Intrinsic::x86_rdpkru: {
27774 SDLoc dl(Op);
27775 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27776 // Create a RDPKRU node and pass 0 to the ECX parameter.
27777 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
27778 DAG.getConstant(0, dl, MVT::i32));
27779 }
27780 case llvm::Intrinsic::x86_wrpkru: {
27781 SDLoc dl(Op);
27782 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
27783 // to the EDX and ECX parameters.
27784 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
27785 Op.getOperand(0), Op.getOperand(2),
27786 DAG.getConstant(0, dl, MVT::i32),
27787 DAG.getConstant(0, dl, MVT::i32));
27788 }
27789 case llvm::Intrinsic::asan_check_memaccess: {
27790 // Mark this as adjustsStack because it will be lowered to a call.
27791 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
27792 // Don't do anything here, we will expand these intrinsics out later.
27793 return Op;
27794 }
27795 case llvm::Intrinsic::x86_flags_read_u32:
27796 case llvm::Intrinsic::x86_flags_read_u64:
27797 case llvm::Intrinsic::x86_flags_write_u32:
27798 case llvm::Intrinsic::x86_flags_write_u64: {
27799 // We need a frame pointer because this will get lowered to a PUSH/POP
27800 // sequence.
27801 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27802 MFI.setHasCopyImplyingStackAdjustment(true);
27803 // Don't do anything here, we will expand these intrinsics out later
27804 // during FinalizeISel in EmitInstrWithCustomInserter.
27805 return Op;
27806 }
27807 case Intrinsic::x86_lwpins32:
27808 case Intrinsic::x86_lwpins64:
27809 case Intrinsic::x86_umwait:
27810 case Intrinsic::x86_tpause: {
27811 SDLoc dl(Op);
27812 SDValue Chain = Op->getOperand(0);
27813 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27814 unsigned Opcode;
27815
27816 switch (IntNo) {
27817 default: llvm_unreachable("Impossible intrinsic");
27818 case Intrinsic::x86_umwait:
27819 Opcode = X86ISD::UMWAIT;
27820 break;
27821 case Intrinsic::x86_tpause:
27822 Opcode = X86ISD::TPAUSE;
27823 break;
27824 case Intrinsic::x86_lwpins32:
27825 case Intrinsic::x86_lwpins64:
27826 Opcode = X86ISD::LWPINS;
27827 break;
27828 }
27829
27830 SDValue Operation =
27831 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
27832 Op->getOperand(3), Op->getOperand(4));
27833 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27834 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27835 Operation.getValue(1));
27836 }
27837 case Intrinsic::x86_enqcmd:
27838 case Intrinsic::x86_enqcmds: {
27839 SDLoc dl(Op);
27840 SDValue Chain = Op.getOperand(0);
27841 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27842 unsigned Opcode;
27843 switch (IntNo) {
27844 default: llvm_unreachable("Impossible intrinsic!");
27845 case Intrinsic::x86_enqcmd:
27846 Opcode = X86ISD::ENQCMD;
27847 break;
27848 case Intrinsic::x86_enqcmds:
27849 Opcode = X86ISD::ENQCMDS;
27850 break;
27851 }
27852 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
27853 Op.getOperand(3));
27854 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
27855 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27856 Operation.getValue(1));
27857 }
27858 case Intrinsic::x86_aesenc128kl:
27859 case Intrinsic::x86_aesdec128kl:
27860 case Intrinsic::x86_aesenc256kl:
27861 case Intrinsic::x86_aesdec256kl: {
27862 SDLoc DL(Op);
27863 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
27864 SDValue Chain = Op.getOperand(0);
27865 unsigned Opcode;
27866
27867 switch (IntNo) {
27868 default: llvm_unreachable("Impossible intrinsic");
27869 case Intrinsic::x86_aesenc128kl:
27870 Opcode = X86ISD::AESENC128KL;
27871 break;
27872 case Intrinsic::x86_aesdec128kl:
27873 Opcode = X86ISD::AESDEC128KL;
27874 break;
27875 case Intrinsic::x86_aesenc256kl:
27876 Opcode = X86ISD::AESENC256KL;
27877 break;
27878 case Intrinsic::x86_aesdec256kl:
27879 Opcode = X86ISD::AESDEC256KL;
27880 break;
27881 }
27882
27883 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27884 MachineMemOperand *MMO = MemIntr->getMemOperand();
27885 EVT MemVT = MemIntr->getMemoryVT();
27886 SDValue Operation = DAG.getMemIntrinsicNode(
27887 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
27888 MMO);
27889 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
27890
27891 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27892 {ZF, Operation.getValue(0), Operation.getValue(2)});
27893 }
27894 case Intrinsic::x86_aesencwide128kl:
27895 case Intrinsic::x86_aesdecwide128kl:
27896 case Intrinsic::x86_aesencwide256kl:
27897 case Intrinsic::x86_aesdecwide256kl: {
27898 SDLoc DL(Op);
27899 SDVTList VTs = DAG.getVTList(
27900 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
27901 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
27902 SDValue Chain = Op.getOperand(0);
27903 unsigned Opcode;
27904
27905 switch (IntNo) {
27906 default: llvm_unreachable("Impossible intrinsic");
27907 case Intrinsic::x86_aesencwide128kl:
27908 Opcode = X86ISD::AESENCWIDE128KL;
27909 break;
27910 case Intrinsic::x86_aesdecwide128kl:
27911 Opcode = X86ISD::AESDECWIDE128KL;
27912 break;
27913 case Intrinsic::x86_aesencwide256kl:
27914 Opcode = X86ISD::AESENCWIDE256KL;
27915 break;
27916 case Intrinsic::x86_aesdecwide256kl:
27917 Opcode = X86ISD::AESDECWIDE256KL;
27918 break;
27919 }
27920
27921 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27922 MachineMemOperand *MMO = MemIntr->getMemOperand();
27923 EVT MemVT = MemIntr->getMemoryVT();
27924 SDValue Operation = DAG.getMemIntrinsicNode(
27925 Opcode, DL, VTs,
27926 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
27927 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
27928 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
27929 MemVT, MMO);
27930 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
27931
27932 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
27933 {ZF, Operation.getValue(1), Operation.getValue(2),
27934 Operation.getValue(3), Operation.getValue(4),
27935 Operation.getValue(5), Operation.getValue(6),
27936 Operation.getValue(7), Operation.getValue(8),
27937 Operation.getValue(9)});
27938 }
27939 case Intrinsic::x86_testui: {
27940 SDLoc dl(Op);
27941 SDValue Chain = Op.getOperand(0);
27942 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
27943 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
27944 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
27945 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
27946 Operation.getValue(1));
27947 }
27948 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27949 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27950 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27951 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27952 case Intrinsic::x86_t2rpntlvwz0_internal:
27953 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27954 case Intrinsic::x86_t2rpntlvwz1_internal:
27955 case Intrinsic::x86_t2rpntlvwz1t1_internal: {
27956 auto *X86MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
27957 X86MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
27958 unsigned IntNo = Op.getConstantOperandVal(1);
27959 unsigned Opc = 0;
27960 switch (IntNo) {
27961 default:
27962 llvm_unreachable("Unexpected intrinsic!");
27963 case Intrinsic::x86_t2rpntlvwz0_internal:
27964 Opc = X86::PT2RPNTLVWZ0V;
27965 break;
27966 case Intrinsic::x86_t2rpntlvwz0t1_internal:
27967 Opc = X86::PT2RPNTLVWZ0T1V;
27968 break;
27969 case Intrinsic::x86_t2rpntlvwz1_internal:
27970 Opc = X86::PT2RPNTLVWZ1V;
27971 break;
27972 case Intrinsic::x86_t2rpntlvwz1t1_internal:
27973 Opc = X86::PT2RPNTLVWZ1T1V;
27974 break;
27975 case Intrinsic::x86_t2rpntlvwz0rs_internal:
27976 Opc = X86::PT2RPNTLVWZ0RSV;
27977 break;
27978 case Intrinsic::x86_t2rpntlvwz0rst1_internal:
27979 Opc = X86::PT2RPNTLVWZ0RST1V;
27980 break;
27981 case Intrinsic::x86_t2rpntlvwz1rs_internal:
27982 Opc = X86::PT2RPNTLVWZ1RSV;
27983 break;
27984 case Intrinsic::x86_t2rpntlvwz1rst1_internal:
27985 Opc = X86::PT2RPNTLVWZ1RST1V;
27986 break;
27987 }
27988
27989 SDLoc DL(Op);
27990 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27991
27992 SDValue Ops[] = {Op.getOperand(2), // Row
27993 Op.getOperand(3), // Col0
27994 Op.getOperand(4), // Col1
27995 Op.getOperand(5), // Base
27996 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
27997 Op.getOperand(6), // Index
27998 DAG.getTargetConstant(0, DL, MVT::i32), // Disp
27999 DAG.getRegister(0, MVT::i16), // Segment
28000 Op.getOperand(0)}; // Chain
28001
28002 MachineSDNode *Res = DAG.getMachineNode(Opc, DL, VTs, Ops);
28003 SDValue Res0 = DAG.getTargetExtractSubreg(X86::sub_t0, DL, MVT::x86amx,
28004 SDValue(Res, 0));
28005 SDValue Res1 = DAG.getTargetExtractSubreg(X86::sub_t1, DL, MVT::x86amx,
28006 SDValue(Res, 0));
28007 return DAG.getMergeValues({Res0, Res1, SDValue(Res, 1)}, DL);
28008 }
28009 case Intrinsic::x86_atomic_bts_rm:
28010 case Intrinsic::x86_atomic_btc_rm:
28011 case Intrinsic::x86_atomic_btr_rm: {
28012 SDLoc DL(Op);
28013 MVT VT = Op.getSimpleValueType();
28014 SDValue Chain = Op.getOperand(0);
28015 SDValue Op1 = Op.getOperand(2);
28016 SDValue Op2 = Op.getOperand(3);
28017 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28018 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28019 : X86ISD::LBTR_RM;
28020 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28021 SDValue Res =
28022 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28023 {Chain, Op1, Op2}, VT, MMO);
28024 Chain = Res.getValue(1);
28025 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28026 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28027 }
28028 case Intrinsic::x86_atomic_bts:
28029 case Intrinsic::x86_atomic_btc:
28030 case Intrinsic::x86_atomic_btr: {
28031 SDLoc DL(Op);
28032 MVT VT = Op.getSimpleValueType();
28033 SDValue Chain = Op.getOperand(0);
28034 SDValue Op1 = Op.getOperand(2);
28035 SDValue Op2 = Op.getOperand(3);
28036 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28037 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28038 : X86ISD::LBTR;
28039 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28040 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28041 SDValue Res =
28042 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28043 {Chain, Op1, Op2, Size}, VT, MMO);
28044 Chain = Res.getValue(1);
28045 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28046 unsigned Imm = Op2->getAsZExtVal();
28047 if (Imm)
28048 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28049 DAG.getShiftAmountConstant(Imm, VT, DL));
28050 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28051 }
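// Worked example for the final shift above (illustrative): for
// Intrinsic::x86_atomic_bts with a constant bit index of 3, the LBTS node
// leaves the old bit in CF, so
//   zext(setcc COND_B)  is 0 or 1, and
//   << 3                moves it back to bit position 3,
// i.e. the intrinsic yields old & (1 << 3) rather than a plain 0/1 flag.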
28052 case Intrinsic::x86_cmpccxadd32:
28053 case Intrinsic::x86_cmpccxadd64: {
28054 SDLoc DL(Op);
28055 SDValue Chain = Op.getOperand(0);
28056 SDValue Addr = Op.getOperand(2);
28057 SDValue Src1 = Op.getOperand(3);
28058 SDValue Src2 = Op.getOperand(4);
28059 SDValue CC = Op.getOperand(5);
28060 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28061 SDValue Operation = DAG.getMemIntrinsicNode(
28062 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28063 MVT::i32, MMO);
28064 return Operation;
28065 }
28066 case Intrinsic::x86_aadd32:
28067 case Intrinsic::x86_aadd64:
28068 case Intrinsic::x86_aand32:
28069 case Intrinsic::x86_aand64:
28070 case Intrinsic::x86_aor32:
28071 case Intrinsic::x86_aor64:
28072 case Intrinsic::x86_axor32:
28073 case Intrinsic::x86_axor64: {
28074 SDLoc DL(Op);
28075 SDValue Chain = Op.getOperand(0);
28076 SDValue Op1 = Op.getOperand(2);
28077 SDValue Op2 = Op.getOperand(3);
28078 MVT VT = Op2.getSimpleValueType();
28079 unsigned Opc = 0;
28080 switch (IntNo) {
28081 default:
28082 llvm_unreachable("Unknown Intrinsic");
28083 case Intrinsic::x86_aadd32:
28084 case Intrinsic::x86_aadd64:
28085 Opc = X86ISD::AADD;
28086 break;
28087 case Intrinsic::x86_aand32:
28088 case Intrinsic::x86_aand64:
28089 Opc = X86ISD::AAND;
28090 break;
28091 case Intrinsic::x86_aor32:
28092 case Intrinsic::x86_aor64:
28093 Opc = X86ISD::AOR;
28094 break;
28095 case Intrinsic::x86_axor32:
28096 case Intrinsic::x86_axor64:
28097 Opc = X86ISD::AXOR;
28098 break;
28099 }
28100 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28101 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28102 {Chain, Op1, Op2}, VT, MMO);
28103 }
28104 case Intrinsic::x86_atomic_add_cc:
28105 case Intrinsic::x86_atomic_sub_cc:
28106 case Intrinsic::x86_atomic_or_cc:
28107 case Intrinsic::x86_atomic_and_cc:
28108 case Intrinsic::x86_atomic_xor_cc: {
28109 SDLoc DL(Op);
28110 SDValue Chain = Op.getOperand(0);
28111 SDValue Op1 = Op.getOperand(2);
28112 SDValue Op2 = Op.getOperand(3);
28113 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28114 MVT VT = Op2.getSimpleValueType();
28115 unsigned Opc = 0;
28116 switch (IntNo) {
28117 default:
28118 llvm_unreachable("Unknown Intrinsic");
28119 case Intrinsic::x86_atomic_add_cc:
28120 Opc = X86ISD::LADD;
28121 break;
28122 case Intrinsic::x86_atomic_sub_cc:
28123 Opc = X86ISD::LSUB;
28124 break;
28125 case Intrinsic::x86_atomic_or_cc:
28126 Opc = X86ISD::LOR;
28127 break;
28128 case Intrinsic::x86_atomic_and_cc:
28129 Opc = X86ISD::LAND;
28130 break;
28131 case Intrinsic::x86_atomic_xor_cc:
28132 Opc = X86ISD::LXOR;
28133 break;
28134 }
28135 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28136 SDValue LockArith =
28137 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28138 {Chain, Op1, Op2}, VT, MMO);
28139 Chain = LockArith.getValue(1);
28140 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28141 }
28142 }
28143 return SDValue();
28144 }
28145
28146 SDLoc dl(Op);
28147 switch(IntrData->Type) {
28148 default: llvm_unreachable("Unknown Intrinsic Type");
28149 case RDSEED:
28150 case RDRAND: {
28151 // Emit the node with the right value type.
28152 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28153 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28154
28155 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28156 // Otherwise return the value from Rand, which is always 0, cast to i32.
28157 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28158 DAG.getConstant(1, dl, Op->getValueType(1)),
28159 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28160 SDValue(Result.getNode(), 1)};
28161 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28162
28163 // Return { result, isValid, chain }.
28164 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28165 SDValue(Result.getNode(), 2));
28166 }
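// Sketch of the merge above for llvm.x86.rdrand.32 (illustrative): the RDRAND
// node produces { random value, EFLAGS, chain }; the X86ISD::CMOV selects the
// constant 1 when CF = 1 (success) and otherwise the random value, which is 0
// when the instruction fails, so the intrinsic returns { value, 1, chain } on
// success and { 0, 0, chain } on failure.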
28167 case GATHER_AVX2: {
28168 SDValue Chain = Op.getOperand(0);
28169 SDValue Src = Op.getOperand(2);
28170 SDValue Base = Op.getOperand(3);
28171 SDValue Index = Op.getOperand(4);
28172 SDValue Mask = Op.getOperand(5);
28173 SDValue Scale = Op.getOperand(6);
28174 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28175 Scale, Chain, Subtarget);
28176 }
28177 case GATHER: {
28178 //gather(v1, mask, index, base, scale);
28179 SDValue Chain = Op.getOperand(0);
28180 SDValue Src = Op.getOperand(2);
28181 SDValue Base = Op.getOperand(3);
28182 SDValue Index = Op.getOperand(4);
28183 SDValue Mask = Op.getOperand(5);
28184 SDValue Scale = Op.getOperand(6);
28185 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28186 Chain, Subtarget);
28187 }
28188 case SCATTER: {
28189 //scatter(base, mask, index, v1, scale);
28190 SDValue Chain = Op.getOperand(0);
28191 SDValue Base = Op.getOperand(2);
28192 SDValue Mask = Op.getOperand(3);
28193 SDValue Index = Op.getOperand(4);
28194 SDValue Src = Op.getOperand(5);
28195 SDValue Scale = Op.getOperand(6);
28196 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28197 Scale, Chain, Subtarget);
28198 }
28199 case PREFETCH: {
28200 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28201 assert((HintVal == 2 || HintVal == 3) &&
28202 "Wrong prefetch hint in intrinsic: should be 2 or 3");
28203 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
28204 SDValue Chain = Op.getOperand(0);
28205 SDValue Mask = Op.getOperand(2);
28206 SDValue Index = Op.getOperand(3);
28207 SDValue Base = Op.getOperand(4);
28208 SDValue Scale = Op.getOperand(5);
28209 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28210 Subtarget);
28211 }
28212 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28213 case RDTSC: {
28214 SmallVector<SDValue, 2> Results;
28215 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28216 Results);
28217 return DAG.getMergeValues(Results, dl);
28218 }
28219 // Read Performance Monitoring Counters.
28220 case RDPMC:
28221 // Read Processor Register.
28222 case RDPRU:
28223 // GetExtended Control Register.
28224 case XGETBV: {
28225 SmallVector<SDValue, 2> Results;
28226
28227 // RDPMC uses ECX to select the index of the performance counter to read.
28228 // RDPRU uses ECX to select the processor register to read.
28229 // XGETBV uses ECX to select the index of the XCR register to return.
28230 // The result is stored into registers EDX:EAX.
28231 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28232 Subtarget, Results);
28233 return DAG.getMergeValues(Results, dl);
28234 }
28235 // XTEST intrinsics.
28236 case XTEST: {
28237 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28238 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28239
28240 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28241 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28242 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28243 Ret, SDValue(InTrans.getNode(), 1));
28244 }
28245 case TRUNCATE_TO_MEM_VI8:
28246 case TRUNCATE_TO_MEM_VI16:
28247 case TRUNCATE_TO_MEM_VI32: {
28248 SDValue Mask = Op.getOperand(4);
28249 SDValue DataToTruncate = Op.getOperand(3);
28250 SDValue Addr = Op.getOperand(2);
28251 SDValue Chain = Op.getOperand(0);
28252
28253 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28254 assert(MemIntr && "Expected MemIntrinsicSDNode!");
28255
28256 EVT MemVT = MemIntr->getMemoryVT();
28257
28258 uint16_t TruncationOp = IntrData->Opc0;
28259 switch (TruncationOp) {
28260 case X86ISD::VTRUNC: {
28261 if (isAllOnesConstant(Mask)) // return just a truncate store
28262 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28263 MemIntr->getMemOperand());
28264
28265 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28266 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28267 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28268
28269 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28270 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28271 true /* truncating */);
28272 }
28273 case X86ISD::VTRUNCUS:
28274 case X86ISD::VTRUNCS: {
28275 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28276 if (isAllOnesConstant(Mask))
28277 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28278 MemIntr->getMemOperand(), DAG);
28279
28280 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28281 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28282
28283 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28284 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28285 }
28286 default:
28287 llvm_unreachable("Unsupported truncstore intrinsic");
28288 }
28289 }
28290 case INTR_TYPE_CAST_MMX:
28291 return SDValue(); // handled in combineINTRINSIC_*
28292 }
28293}
28294
28295SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28296 SelectionDAG &DAG) const {
28297 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28298 MFI.setReturnAddressIsTaken(true);
28299
28300 unsigned Depth = Op.getConstantOperandVal(0);
28301 SDLoc dl(Op);
28302 EVT PtrVT = Op.getValueType();
28303
28304 if (Depth > 0) {
28305 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28306 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28307 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
28308 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28309 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28310 MachinePointerInfo());
28311 }
28312
28313 // Just load the return address.
28314 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28315 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28316 MachinePointerInfo());
28317}
28318
28319SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28320 SelectionDAG &DAG) const {
28321 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28322 return getReturnAddressFrameIndex(DAG);
28323}
28324
28325SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28326 MachineFunction &MF = DAG.getMachineFunction();
28327 MachineFrameInfo &MFI = MF.getFrameInfo();
28328 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28329 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28330 EVT VT = Op.getValueType();
28331
28332 MFI.setFrameAddressIsTaken(true);
28333
28334 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28335 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28336 // is not possible to crawl up the stack without looking at the unwind codes
28337 // simultaneously.
28338 int FrameAddrIndex = FuncInfo->getFAIndex();
28339 if (!FrameAddrIndex) {
28340 // Set up a frame object for the return address.
28341 unsigned SlotSize = RegInfo->getSlotSize();
28342 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28343 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28344 FuncInfo->setFAIndex(FrameAddrIndex);
28345 }
28346 return DAG.getFrameIndex(FrameAddrIndex, VT);
28347 }
28348
28349 Register FrameReg =
28350 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28351 SDLoc dl(Op); // FIXME probably not meaningful
28352 unsigned Depth = Op.getConstantOperandVal(0);
28353 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28354 (FrameReg == X86::EBP && VT == MVT::i32)) &&
28355 "Invalid Frame Register!");
28356 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28357 while (Depth--)
28358 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28359 MachinePointerInfo());
28360 return FrameAddr;
28361}
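// Illustrative effect of the Depth loop above on a non-Windows x86-64 frame
// that uses RBP chaining (layout assumed, not taken from a test):
//   __builtin_frame_address(0)  ->  current RBP
//   __builtin_frame_address(1)  ->  load (RBP)          // caller's saved RBP
//   __builtin_frame_address(2)  ->  load (load (RBP))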
28362
28363// FIXME? Maybe this could be a TableGen attribute on some registers and
28364// this table could be generated automatically from RegInfo.
28365 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28366 const MachineFunction &MF) const {
28367 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28368
28369 Register Reg = StringSwitch<unsigned>(RegName)
28370 .Case("esp", X86::ESP)
28371 .Case("rsp", X86::RSP)
28372 .Case("ebp", X86::EBP)
28373 .Case("rbp", X86::RBP)
28374 .Case("r14", X86::R14)
28375 .Case("r15", X86::R15)
28376 .Default(0);
28377
28378 if (Reg == X86::EBP || Reg == X86::RBP) {
28379 if (!TFI.hasFP(MF))
28380 report_fatal_error("register " + StringRef(RegName) +
28381 " is allocatable: function has no frame pointer");
28382#ifndef NDEBUG
28383 else {
28384 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28385 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28386 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28387 "Invalid Frame Register!");
28388 }
28389#endif
28390 }
28391
28392 return Reg;
28393}
28394
28395SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28396 SelectionDAG &DAG) const {
28397 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28398 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28399}
28400
28401 Register X86TargetLowering::getExceptionPointerRegister(
28402 const Constant *PersonalityFn) const {
28403 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28404 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28405
28406 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28407}
28408
28409 Register X86TargetLowering::getExceptionSelectorRegister(
28410 const Constant *PersonalityFn) const {
28411 // Funclet personalities don't use selectors (the runtime does the selection).
28412 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28413 return X86::NoRegister;
28414 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28415}
28416
28417 bool X86TargetLowering::needsFixedCatchObjects() const {
28418 return Subtarget.isTargetWin64();
28419}
28420
28421SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28422 SDValue Chain = Op.getOperand(0);
28423 SDValue Offset = Op.getOperand(1);
28424 SDValue Handler = Op.getOperand(2);
28425 SDLoc dl (Op);
28426
28427 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28428 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28429 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28430 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28431 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28432 "Invalid Frame Register!");
28433 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28434 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28435
28436 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28437 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28438 dl));
28439 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28440 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28441 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28442
28443 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28444 DAG.getRegister(StoreAddrReg, PtrVT));
28445}
28446
28447SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28448 SelectionDAG &DAG) const {
28449 SDLoc DL(Op);
28450 // If the subtarget is not 64-bit, we may need the global base reg
28451 // after the isel pseudo expansion, i.e., after the CGBR pass has run.
28452 // Therefore, ask for the GlobalBaseReg now, so that the pass
28453 // inserts the code for us in case we need it.
28454 // Otherwise, we will end up in a situation where we will
28455 // reference a virtual register that is not defined!
28456 if (!Subtarget.is64Bit()) {
28457 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28458 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28459 }
28460 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28461 DAG.getVTList(MVT::i32, MVT::Other),
28462 Op.getOperand(0), Op.getOperand(1));
28463}
28464
28465SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28466 SelectionDAG &DAG) const {
28467 SDLoc DL(Op);
28468 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28469 Op.getOperand(0), Op.getOperand(1));
28470}
28471
28472SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28473 SelectionDAG &DAG) const {
28474 SDLoc DL(Op);
28475 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28476 Op.getOperand(0));
28477}
28478
28479 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28480 return Op.getOperand(0);
28481}
28482
28483SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28484 SelectionDAG &DAG) const {
28485 SDValue Root = Op.getOperand(0);
28486 SDValue Trmp = Op.getOperand(1); // trampoline
28487 SDValue FPtr = Op.getOperand(2); // nested function
28488 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28489 SDLoc dl (Op);
28490
28491 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28492 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28493
28494 if (Subtarget.is64Bit()) {
28495 SDValue OutChains[6];
28496
28497 // Large code-model.
28498 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28499 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28500
28501 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28502 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28503
28504 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
28505
28506 // Load the pointer to the nested function into R11.
28507 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28508 SDValue Addr = Trmp;
28509 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28510 Addr, MachinePointerInfo(TrmpAddr));
28511
28512 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28513 DAG.getConstant(2, dl, MVT::i64));
28514 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28515 MachinePointerInfo(TrmpAddr, 2), Align(2));
28516
28517 // Load the 'nest' parameter value into R10.
28518 // R10 is specified in X86CallingConv.td
28519 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28520 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28521 DAG.getConstant(10, dl, MVT::i64));
28522 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28523 Addr, MachinePointerInfo(TrmpAddr, 10));
28524
28525 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28526 DAG.getConstant(12, dl, MVT::i64));
28527 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28528 MachinePointerInfo(TrmpAddr, 12), Align(2));
28529
28530 // Jump to the nested function.
28531 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28532 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28533 DAG.getConstant(20, dl, MVT::i64));
28534 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28535 Addr, MachinePointerInfo(TrmpAddr, 20));
28536
28537 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28538 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28539 DAG.getConstant(22, dl, MVT::i64));
28540 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28541 Addr, MachinePointerInfo(TrmpAddr, 22));
28542
28543 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28544 } else {
28545 const Function *Func =
28546 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28547 CallingConv::ID CC = Func->getCallingConv();
28548 unsigned NestReg;
28549
28550 switch (CC) {
28551 default:
28552 llvm_unreachable("Unsupported calling convention");
28553 case CallingConv::C:
28554 case CallingConv::X86_StdCall: {
28555 // Pass 'nest' parameter in ECX.
28556 // Must be kept in sync with X86CallingConv.td
28557 NestReg = X86::ECX;
28558
28559 // Check that ECX wasn't needed by an 'inreg' parameter.
28560 FunctionType *FTy = Func->getFunctionType();
28561 const AttributeList &Attrs = Func->getAttributes();
28562
28563 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28564 unsigned InRegCount = 0;
28565 unsigned Idx = 0;
28566
28567 for (FunctionType::param_iterator I = FTy->param_begin(),
28568 E = FTy->param_end(); I != E; ++I, ++Idx)
28569 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28570 const DataLayout &DL = DAG.getDataLayout();
28571 // FIXME: should only count parameters that are lowered to integers.
28572 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
28573 }
28574
28575 if (InRegCount > 2) {
28576 report_fatal_error("Nest register in use - reduce number of inreg"
28577 " parameters!");
28578 }
28579 }
28580 break;
28581 }
28582 case CallingConv::X86_FastCall:
28583 case CallingConv::X86_ThisCall:
28584 case CallingConv::Fast:
28585 case CallingConv::Tail:
28586 case CallingConv::SwiftTail:
28587 // Pass 'nest' parameter in EAX.
28588 // Must be kept in sync with X86CallingConv.td
28589 NestReg = X86::EAX;
28590 break;
28591 }
28592
28593 SDValue OutChains[4];
28594 SDValue Addr, Disp;
28595
28596 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28597 DAG.getConstant(10, dl, MVT::i32));
28598 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
28599
28600 // This is storing the opcode for MOV32ri.
28601 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
28602 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
28603 OutChains[0] =
28604 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
28605 Trmp, MachinePointerInfo(TrmpAddr));
28606
28607 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28608 DAG.getConstant(1, dl, MVT::i32));
28609 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
28610 MachinePointerInfo(TrmpAddr, 1), Align(1));
28611
28612 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
28613 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28614 DAG.getConstant(5, dl, MVT::i32));
28615 OutChains[2] =
28616 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
28617 MachinePointerInfo(TrmpAddr, 5), Align(1));
28618
28619 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
28620 DAG.getConstant(6, dl, MVT::i32));
28621 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
28622 MachinePointerInfo(TrmpAddr, 6), Align(1));
28623
28624 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28625 }
28626}
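// Byte layout written by the 64-bit branch above, offsets relative to Trmp
// (shown only as an illustration of the stores; not emitted verbatim anywhere):
//   +0  : 49 BB            movabsq $<nested function>, %r11
//   +2  : <8-byte FPtr>
//   +10 : 49 BA            movabsq $<nest value>, %r10
//   +12 : <8-byte Nest>
//   +20 : 49 FF E3         jmpq *%r11
// The 32-bit branch instead stores "movl $Nest, %ecx|%eax" followed by a jmp
// rel32 whose displacement is computed as FPtr - (Trmp + 10).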
28627
28628SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
28629 SelectionDAG &DAG) const {
28630 /*
28631 The rounding mode is in bits 11:10 of the FPCW (x87 control word), and has the following
28632 settings:
28633 00 Round to nearest
28634 01 Round to -inf
28635 10 Round to +inf
28636 11 Round to 0
28637
28638 GET_ROUNDING, on the other hand, expects the following:
28639 -1 Undefined
28640 0 Round to 0
28641 1 Round to nearest
28642 2 Round to +inf
28643 3 Round to -inf
28644
28645 To perform the conversion, we use a packed lookup table of the four 2-bit
28646 values that we can index by FPSR[11:10]
28647 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
28648
28649 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
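As an illustrative check: FPSR[11:10] = 01 (round to -inf) gives
(FPSR & 0xc00) >> 9 = 2 and (0x2d >> 2) & 3 = 3, GET_ROUNDING's value for
"Round to -inf"; likewise 00 -> 1, 10 -> 2 and 11 -> 0.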
28650 */
28651
28652 MachineFunction &MF = DAG.getMachineFunction();
28653 MVT VT = Op.getSimpleValueType();
28654 SDLoc DL(Op);
28655
28656 // Save FP Control Word to stack slot
28657 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
28658 SDValue StackSlot =
28659 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
28660
28661 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
28662
28663 SDValue Chain = Op.getOperand(0);
28664 SDValue Ops[] = {Chain, StackSlot};
28665 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
28666 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
28667 Align(2), MachineMemOperand::MOStore);
28668
28669 // Load FP Control Word from stack slot
28670 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
28671 Chain = CWD.getValue(1);
28672
28673 // Mask and turn the control bits into a shift for the lookup table.
28674 SDValue Shift =
28675 DAG.getNode(ISD::SRL, DL, MVT::i16,
28676 DAG.getNode(ISD::AND, DL, MVT::i16,
28677 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
28678 DAG.getConstant(9, DL, MVT::i8));
28679 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
28680
28681 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
28682 SDValue RetVal =
28683 DAG.getNode(ISD::AND, DL, MVT::i32,
28684 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
28685 DAG.getConstant(3, DL, MVT::i32));
28686
28687 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
28688
28689 return DAG.getMergeValues({RetVal, Chain}, DL);
28690}
28691
28692SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
28693 SelectionDAG &DAG) const {
28694 MachineFunction &MF = DAG.getMachineFunction();
28695 SDLoc DL(Op);
28696 SDValue Chain = Op.getNode()->getOperand(0);
28697
28698 // FP control word may be set only from data in memory. So we need to allocate
28699 // stack space to save/load FP control word.
28700 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
28701 SDValue StackSlot =
28702 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
28703 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
28704 MachineMemOperand *MMO =
28705 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
28706
28707 // Store FP control word into memory.
28708 SDValue Ops[] = {Chain, StackSlot};
28709 Chain = DAG.getMemIntrinsicNode(
28710 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
28711
28712 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
28713 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
28714 Chain = CWD.getValue(1);
28715 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
28716 DAG.getConstant(0xf3ff, DL, MVT::i16));
28717
28718 // Calculate new rounding mode.
28719 SDValue NewRM = Op.getNode()->getOperand(1);
28720 SDValue RMBits;
28721 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
28722 uint64_t RM = CVal->getZExtValue();
28723 int FieldVal = X86::getRoundingModeX86(RM);
28724
28725 if (FieldVal == X86::rmInvalid) {
28726 FieldVal = X86::rmToNearest;
28727 LLVMContext &C = MF.getFunction().getContext();
28728 C.diagnose(DiagnosticInfoUnsupported(
28729 MF.getFunction(), "rounding mode is not supported by X86 hardware",
28730 DiagnosticLocation(DL.getDebugLoc()), DS_Error));
28731 }
28732 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
28733 } else {
28734 // Need to convert argument into bits of control word:
28735 // 0 Round to 0 -> 11
28736 // 1 Round to nearest -> 00
28737 // 2 Round to +inf -> 10
28738 // 3 Round to -inf -> 01
28739 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
28740 // To make the conversion, put all these values into a value 0xc9 and shift
28741 // it left depending on the rounding mode:
28742 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
28743 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
28744 // ...
28745 // (0xc9 << (2 * NewRM + 4)) & 0xc00
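// As an illustrative check: NewRM = 3 (round to -inf) shifts by 10, and
// (0xc9 << 10) & 0xc00 = 0x400, i.e. RM field 01; NewRM = 0 shifts by 4, and
// (0xc9 << 4) & 0xc00 = 0xc00, i.e. RM field 11 (round toward zero).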
28746 SDValue ShiftValue =
28747 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
28748 DAG.getNode(ISD::ADD, DL, MVT::i32,
28749 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
28750 DAG.getConstant(1, DL, MVT::i8)),
28751 DAG.getConstant(4, DL, MVT::i32)));
28752 SDValue Shifted =
28753 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
28754 ShiftValue);
28755 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
28756 DAG.getConstant(0xc00, DL, MVT::i16));
28757 }
28758
28759 // Update rounding mode bits and store the new FP Control Word into stack.
28760 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
28761 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
28762
28763 // Load FP control word from the slot.
28764 SDValue OpsLD[] = {Chain, StackSlot};
28765 MachineMemOperand *MMOL =
28766 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
28767 Chain = DAG.getMemIntrinsicNode(
28768 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
28769
28770 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
28771 // same way but in bits 14:13.
28772 if (Subtarget.hasSSE1()) {
28773 // Store MXCSR into memory.
28774 Chain = DAG.getNode(
28775 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28776 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28777 StackSlot);
28778
28779 // Load MXCSR from stack slot and clear RM field (bits 14:13).
28780 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
28781 Chain = CWD.getValue(1);
28782 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
28783 DAG.getConstant(0xffff9fff, DL, MVT::i32));
28784
28785 // Shift X87 RM bits from 11:10 to 14:13.
28786 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
28787 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
28788 DAG.getConstant(3, DL, MVT::i8));
28789
28790 // Update rounding mode bits and store the new FP Control Word into stack.
28791 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
28792 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
28793
28794 // Load MXCSR from the slot.
28795 Chain = DAG.getNode(
28796 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28797 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28798 StackSlot);
28799 }
28800
28801 return Chain;
28802}
28803
28804const unsigned X87StateSize = 28;
28805const unsigned FPStateSize = 32;
28806[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
28807
28808SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
28809 SelectionDAG &DAG) const {
28810 MachineFunction &MF = DAG.getMachineFunction();
28811 SDLoc DL(Op);
28812 SDValue Chain = Op->getOperand(0);
28813 SDValue Ptr = Op->getOperand(1);
28814 auto *Node = cast<FPStateAccessSDNode>(Op);
28815 EVT MemVT = Node->getMemoryVT();
28816 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
28817 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28818
28819 // Get x87 state, if it is present.
28820 if (Subtarget.hasX87()) {
28821 Chain =
28822 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
28823 {Chain, Ptr}, MemVT, MMO);
28824
28825 // FNSTENV changes the exception mask, so load back the stored environment.
28826 MachineMemOperand::Flags NewFlags =
28827 MachineMemOperand::MOLoad |
28828 (MMO->getFlags() & ~MachineMemOperand::MOStore);
28829 MMO = MF.getMachineMemOperand(MMO, NewFlags);
28830 Chain =
28831 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28832 {Chain, Ptr}, MemVT, MMO);
28833 }
28834
28835 // If target supports SSE, get MXCSR as well.
28836 if (Subtarget.hasSSE1()) {
28837 // Get pointer to the MXCSR location in memory.
28838 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28839 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28840 DAG.getConstant(X87StateSize, DL, PtrVT));
28841 // Store MXCSR into memory.
28842 Chain = DAG.getNode(
28843 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28844 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
28845 MXCSRAddr);
28846 }
28847
28848 return Chain;
28849}
28850
28851 static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
28852 EVT MemVT, MachineMemOperand *MMO,
28853 SelectionDAG &DAG,
28854 const X86Subtarget &Subtarget) {
28855 // Set x87 state, if it is present.
28856 if (Subtarget.hasX87())
28857 Chain =
28858 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
28859 {Chain, Ptr}, MemVT, MMO);
28860 // If target supports SSE, set MXCSR as well.
28861 if (Subtarget.hasSSE1()) {
28862 // Get pointer to the MXCSR location in memory.
28863 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28864 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
28865 DAG.getConstant(X87StateSize, DL, PtrVT));
28866 // Load MXCSR from memory.
28867 Chain = DAG.getNode(
28868 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
28869 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
28870 MXCSRAddr);
28871 }
28872 return Chain;
28873}
28874
28875SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
28876 SelectionDAG &DAG) const {
28877 SDLoc DL(Op);
28878 SDValue Chain = Op->getOperand(0);
28879 SDValue Ptr = Op->getOperand(1);
28880 auto *Node = cast<FPStateAccessSDNode>(Op);
28881 EVT MemVT = Node->getMemoryVT();
28882 assert(MemVT.getSizeInBits() == FPStateSizeInBits);
28883 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
28884 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
28885}
28886
28887SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
28888 SelectionDAG &DAG) const {
28889 MachineFunction &MF = DAG.getMachineFunction();
28890 SDLoc DL(Op);
28891 SDValue Chain = Op.getNode()->getOperand(0);
28892
28893 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
28894 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
28895 SmallVector<Constant *, 8> FPEnvVals;
28896
28897 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
28898 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
28899 // for compatibility with glibc.
28900 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
28901 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
28902 Constant *Zero = ConstantInt::get(ItemTy, 0);
28903 for (unsigned I = 0; I < 6; ++I)
28904 FPEnvVals.push_back(Zero);
28905
28906 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
28907 // all exception flags, and sets DAZ and FTZ to 0.
28908 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
28909 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
28910 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
28911 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
28912 MachinePointerInfo MPI =
28913 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
28914 MachineMemOperand *MMO = MF.getMachineMemOperand(
28915 MPI, MachineMemOperand::MOLoad, X87StateSize, Align(4));
28916
28917 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
28918}
28919
28920// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
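// As an illustrative example of the formulas below, ISD::SHL with Amt = 1
// evaluates to (0x0102040810204080 >> 1) & (0x0101010101010101 * 0x7F),
// i.e. 0x0001020408102040.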
28921uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
28922 assert((Amt < 8) && "Shift/Rotation amount out of range");
28923 switch (Opcode) {
28924 case ISD::BITREVERSE:
28925 return 0x8040201008040201ULL;
28926 case ISD::SHL:
28927 return ((0x0102040810204080ULL >> (Amt)) &
28928 (0x0101010101010101ULL * (0xFF >> (Amt))));
28929 case ISD::SRL:
28930 return ((0x0102040810204080ULL << (Amt)) &
28931 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
28932 case ISD::SRA:
28933 return (getGFNICtrlImm(ISD::SRL, Amt) |
28934 (0x8080808080808080ULL >> (64 - (8 * Amt))));
28935 case ISD::ROTL:
28936 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
28937 case ISD::ROTR:
28938 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
28939 }
28940 llvm_unreachable("Unsupported GFNI opcode");
28941}
28942
28943// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
28944SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL,
28945 MVT VT, unsigned Amt = 0) {
28946 assert(VT.getVectorElementType() == MVT::i8 &&
28947 (VT.getSizeInBits() % 64) == 0 && "Illegal GFNI control type");
28948 uint64_t Imm = getGFNICtrlImm(Opcode, Amt);
28949 SmallVector<SDValue> MaskBits;
28950 for (unsigned I = 0, E = VT.getSizeInBits(); I != E; I += 8) {
28951 uint64_t Bits = (Imm >> (I % 64)) & 255;
28952 MaskBits.push_back(DAG.getConstant(Bits, DL, MVT::i8));
28953 }
28954 return DAG.getBuildVector(VT, DL, MaskBits);
28955}
28956
28957/// Lower a vector CTLZ using native supported vector CTLZ instruction.
28958//
28959 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
28960// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
28961 // split the vector, perform the operation on its Lo and Hi parts and
28962// concatenate the results.
28963 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
28964 const X86Subtarget &Subtarget) {
28965 assert(Op.getOpcode() == ISD::CTLZ);
28966 SDLoc dl(Op);
28967 MVT VT = Op.getSimpleValueType();
28968 MVT EltVT = VT.getVectorElementType();
28969 unsigned NumElems = VT.getVectorNumElements();
28970
28971 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
28972 "Unsupported element type");
28973
28974 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
28975 if (NumElems > 16 ||
28976 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
28977 return splitVectorIntUnary(Op, DAG, dl);
28978
28979 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
28980 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
28981 "Unsupported value type for operation");
28982
28983 // Use native supported vector instruction vplzcntd.
28984 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
28985 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
28986 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
28987 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
28988
28989 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
28990}
28991
28992// Lower CTLZ using a PSHUFB lookup table implementation.
28993 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
28994 const X86Subtarget &Subtarget,
28995 SelectionDAG &DAG) {
28996 MVT VT = Op.getSimpleValueType();
28997 int NumElts = VT.getVectorNumElements();
28998 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
28999 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29000
29001 // Per-nibble leading zero PSHUFB lookup table.
29002 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29003 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29004 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29005 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
29006
29007 SmallVector<SDValue, 64> LUTVec;
29008 for (int i = 0; i < NumBytes; ++i)
29009 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29010 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29011
29012 // Begin by bitcasting the input to byte vector, then split those bytes
29013 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29014 // If the hi input nibble is zero then we add both results together, otherwise
29015 // we just take the hi result (by masking the lo result to zero before the
29016 // add).
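// For example, a byte of 0x1A has hi nibble 0x1 (LUT value 3) and lo nibble
// 0xA (LUT value 0); the hi nibble is non-zero, so the lo result is masked
// away and the final count is 3, matching ctlz(0x1A) for an i8.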
29017 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29018 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29019
29020 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29021 SDValue Lo = Op0;
29022 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29023 SDValue HiZ;
29024 if (CurrVT.is512BitVector()) {
29025 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29026 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29027 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29028 } else {
29029 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29030 }
29031
29032 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29033 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29034 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29035 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29036
29037 // Merge the result back from vXi8 to VT, working on the lo/hi halves
29038 // of the current vector width in the same way we did for the nibbles.
29039 // If the upper half of the input element is zero then add the halves'
29040 // leading zero counts together, otherwise just use the upper half's.
29041 // Double the width of the result until we are at target width.
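// For example, when merging byte counts into an i16 element 0x00F0: the upper
// byte is zero (count 8) and the lower byte 0xF0 has count 0, so the two
// counts are added to give ctlz = 8; if the upper byte were non-zero, only its
// count would be kept.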
29042 while (CurrVT != VT) {
29043 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29044 int CurrNumElts = CurrVT.getVectorNumElements();
29045 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29046 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29047 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29048
29049 // Check if the upper half of the input element is zero.
29050 if (CurrVT.is512BitVector()) {
29051 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29052 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29053 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29054 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29055 } else {
29056 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29057 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29058 }
29059 HiZ = DAG.getBitcast(NextVT, HiZ);
29060
29061 // Move the upper/lower halves to the lower bits as we'll be extending to
29062 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29063 // together.
29064 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29065 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29066 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29067 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29068 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29069 CurrVT = NextVT;
29070 }
29071
29072 return Res;
29073}
29074
29075 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29076 const X86Subtarget &Subtarget,
29077 SelectionDAG &DAG) {
29078 MVT VT = Op.getSimpleValueType();
29079
29080 if (Subtarget.hasCDI() &&
29081 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29082 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29083 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29084
29085 // Decompose 256-bit ops into smaller 128-bit ops.
29086 if (VT.is256BitVector() && !Subtarget.hasInt256())
29087 return splitVectorIntUnary(Op, DAG, DL);
29088
29089 // Decompose 512-bit ops into smaller 256-bit ops.
29090 if (VT.is512BitVector() && !Subtarget.hasBWI())
29091 return splitVectorIntUnary(Op, DAG, DL);
29092
29093 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29094 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29095}
29096
29097 static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL,
29098 SelectionDAG &DAG,
29099 const X86Subtarget &Subtarget) {
29100 MVT VT = Op.getSimpleValueType();
29101 SDValue Input = Op.getOperand(0);
29102
29103 assert(VT.isVector() && VT.getVectorElementType() == MVT::i8 &&
29104 "Expected vXi8 input for GFNI-based CTLZ lowering");
29105
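// The approach below relies on ctlz(x) == cttz(bitreverse(x)): reverse the
// bits, isolate the lowest set bit with x & -x, and use a GF2P8AFFINEQB
// transform (matrix plus immediate 8) that maps each isolated bit (or zero)
// to its trailing-zero count.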
29106 SDValue Reversed = DAG.getNode(ISD::BITREVERSE, DL, VT, Input);
29107
29108 SDValue Neg = DAG.getNegative(Reversed, DL, VT);
29109 SDValue Filtered = DAG.getNode(ISD::AND, DL, VT, Reversed, Neg);
29110
29111 MVT VT64 = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
29112 SDValue CTTZConst = DAG.getConstant(0xAACCF0FF00000000ULL, DL, VT64);
29113 SDValue CTTZMatrix = DAG.getBitcast(VT, CTTZConst);
29114
29115 SDValue LZCNT =
29116 DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, Filtered, CTTZMatrix,
29117 DAG.getTargetConstant(8, DL, MVT::i8));
29118 return LZCNT;
29119}
29120
29121static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29122 SelectionDAG &DAG) {
29123 MVT VT = Op.getSimpleValueType();
29124 MVT OpVT = VT;
29125 unsigned NumBits = VT.getSizeInBits();
29126 SDLoc dl(Op);
29127 unsigned Opc = Op.getOpcode();
29128
29129 if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
29130 return LowerVectorCTLZ_GFNI(Op, dl, DAG, Subtarget);
29131
29132 if (VT.isVector())
29133 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29134
29135 Op = Op.getOperand(0);
29136 if (VT == MVT::i8) {
29137 // Zero extend to i32 since there is not an i8 bsr.
29138 OpVT = MVT::i32;
29139 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29140 }
29141
29142 // Check if we can safely pass a result through BSR for zero sources.
29143 SDValue PassThru = DAG.getUNDEF(OpVT);
29144 if (Opc == ISD::CTLZ && Subtarget.hasBitScanPassThrough() &&
29145 !DAG.isKnownNeverZero(Op))
29146 PassThru = DAG.getConstant(NumBits + NumBits - 1, dl, OpVT);
29147
29148 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29149 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29150 Op = DAG.getNode(X86ISD::BSR, dl, VTs, PassThru, Op);
29151
29152 // Skip CMOV if we're using a pass through value.
29153 if (Opc == ISD::CTLZ && PassThru.isUndef()) {
29154 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29155 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29156 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29157 Op.getValue(1)};
29158 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29159 }
29160
29161 // Finally xor with NumBits-1.
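// (BSR returns the index i of the highest set bit; since NumBits is a power
// of two, i ^ (NumBits - 1) == NumBits - 1 - i, which is the leading-zero
// count without needing a subtract.)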
29162 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29163 DAG.getConstant(NumBits - 1, dl, OpVT));
29164
29165 if (VT == MVT::i8)
29166 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29167 return Op;
29168}
29169
29170static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29171 SelectionDAG &DAG) {
29172 MVT VT = Op.getSimpleValueType();
29173 unsigned NumBits = VT.getScalarSizeInBits();
29174 SDValue N0 = Op.getOperand(0);
29175 SDLoc dl(Op);
29176 bool NonZeroSrc = DAG.isKnownNeverZero(N0);
29177
29178 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29179 "Only scalar CTTZ requires custom lowering");
29180
29181 // Check if we can safely pass a result through BSF for zero sources.
29182 SDValue PassThru = DAG.getUNDEF(VT);
29183 if (!NonZeroSrc && Subtarget.hasBitScanPassThrough())
29184 PassThru = DAG.getConstant(NumBits, dl, VT);
29185
29186 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29187 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29188 Op = DAG.getNode(X86ISD::BSF, dl, VTs, PassThru, N0);
29189
29190 // Skip CMOV if src is never zero or we're using a pass through value.
29191 if (NonZeroSrc || !PassThru.isUndef())
29192 return Op;
29193
29194 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29195 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29196 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29197 Op.getValue(1)};
29198 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29199}
29200
29201 static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29202 const X86Subtarget &Subtarget) {
29203 MVT VT = Op.getSimpleValueType();
29204 SDLoc DL(Op);
29205
29206 if (VT == MVT::i16 || VT == MVT::i32)
29207 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
29208
29209 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29210 return splitVectorIntBinary(Op, DAG, DL);
29211
29212 assert(Op.getSimpleValueType().is256BitVector() &&
29213 Op.getSimpleValueType().isInteger() &&
29214 "Only handle AVX 256-bit vector integer operation");
29215 return splitVectorIntBinary(Op, DAG, DL);
29216}
29217
29218 static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29219 const X86Subtarget &Subtarget) {
29220 MVT VT = Op.getSimpleValueType();
29221 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29222 unsigned Opcode = Op.getOpcode();
29223 SDLoc DL(Op);
29224
29225 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29226 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29227 assert(Op.getSimpleValueType().isInteger() &&
29228 "Only handle AVX vector integer operation");
29229 return splitVectorIntBinary(Op, DAG, DL);
29230 }
29231
29232 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29233 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29234 EVT SetCCResultType =
29235 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29236
29237 unsigned BitWidth = VT.getScalarSizeInBits();
29238 if (Opcode == ISD::USUBSAT) {
29239 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29240 // Handle a special-case with a bit-hack instead of cmp+select:
29241 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
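// For example with i8 elements: if X >= 0x80 then X - 0x80 == X ^ 0x80 and
// X s>> 7 is all-ones, so the AND keeps the difference; if X < 0x80 the
// arithmetic shift yields zero and the result saturates to 0.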
29242 // If the target can use VPTERNLOG, DAGToDAG will match this as
29243 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29244 // "broadcast" constant load.
29245 ConstantSDNode *C = isConstOrConstSplat(Y, true);
29246 if (C && C->getAPIntValue().isSignMask()) {
29247 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29248 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29249 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29250 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29251 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29252 }
29253 }
29254 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29255 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29256 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29257 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29258 // TODO: Move this to DAGCombiner?
29259 if (SetCCResultType == VT &&
29260 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29261 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29262 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29263 }
29264 }
29265
29266 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29267 (!VT.isVector() || VT == MVT::v2i64)) {
29268 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29269 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29270 SDValue Zero = DAG.getConstant(0, DL, VT);
29271 SDValue Result =
29272 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29273 DAG.getVTList(VT, SetCCResultType), X, Y);
29274 SDValue SumDiff = Result.getValue(0);
29275 SDValue Overflow = Result.getValue(1);
29276 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29277 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29278 SDValue SumNeg =
29279 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29280 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29281 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29282 }
29283
29284 // Use default expansion.
29285 return SDValue();
29286}
29287
29288static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29289 SelectionDAG &DAG) {
29290 MVT VT = Op.getSimpleValueType();
29291 SDLoc DL(Op);
29292
29293 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29294 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29295 // 8-bit integer abs to NEG and CMOV.
29296 SDValue N0 = Op.getOperand(0);
29297 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29298 DAG.getConstant(0, DL, VT), N0);
29299 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29300 SDValue(Neg.getNode(), 1)};
29301 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29302 }
29303
29304 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29305 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29306 SDValue Src = Op.getOperand(0);
29307 SDValue Neg = DAG.getNegative(Src, DL, VT);
29308 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
29309 }
29310
29311 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29312 assert(VT.isInteger() &&
29313 "Only handle AVX 256-bit vector integer operation");
29314 return splitVectorIntUnary(Op, DAG, DL);
29315 }
29316
29317 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29318 return splitVectorIntUnary(Op, DAG, DL);
29319
29320 // Default to expand.
29321 return SDValue();
29322}
29323
29324static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29325 SelectionDAG &DAG) {
29326 MVT VT = Op.getSimpleValueType();
29327 SDLoc DL(Op);
29328
29329 // For AVX1 cases, split to use legal ops.
29330 if (VT.is256BitVector() && !Subtarget.hasInt256())
29331 return splitVectorIntBinary(Op, DAG, DL);
29332
29333 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29334 return splitVectorIntBinary(Op, DAG, DL);
29335
29336 // Default to expand.
29337 return SDValue();
29338}
29339
29340static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29341 SelectionDAG &DAG) {
29342 MVT VT = Op.getSimpleValueType();
29343 SDLoc DL(Op);
29344
29345 // For AVX1 cases, split to use legal ops.
29346 if (VT.is256BitVector() && !Subtarget.hasInt256())
29347 return splitVectorIntBinary(Op, DAG, DL);
29348
29349 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29350 return splitVectorIntBinary(Op, DAG, DL);
29351
29352 // Default to expand.
29353 return SDValue();
29354}
29355
29356 static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
29357 SelectionDAG &DAG) {
29358 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29359 EVT VT = Op.getValueType();
29360 SDValue X = Op.getOperand(0);
29361 SDValue Y = Op.getOperand(1);
29362 SDLoc DL(Op);
29363 bool IsMaxOp =
29364 Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29365 bool IsNum =
29366 Op.getOpcode() == ISD::FMINIMUMNUM || Op.getOpcode() == ISD::FMAXIMUMNUM;
29367 if (Subtarget.hasAVX10_2() && TLI.isTypeLegal(VT)) {
29368 unsigned Opc = 0;
29369 if (VT.isVector())
29371 else if (VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64)
29373
29374 if (Opc) {
29375 SDValue Imm =
29376 DAG.getTargetConstant(IsMaxOp + (IsNum ? 16 : 0), DL, MVT::i32);
29377 return DAG.getNode(Opc, DL, VT, X, Y, Imm, Op->getFlags());
29378 }
29379 }
29380
29381 uint64_t SizeInBits = VT.getScalarSizeInBits();
29382 APInt PreferredZero = APInt::getZero(SizeInBits);
29383 APInt OppositeZero = PreferredZero;
29384 EVT IVT = VT.changeTypeToInteger();
29385 X86ISD::NodeType MinMaxOp;
29386 if (IsMaxOp) {
29387 MinMaxOp = X86ISD::FMAX;
29388 OppositeZero.setSignBit();
29389 } else {
29390 PreferredZero.setSignBit();
29391 MinMaxOp = X86ISD::FMIN;
29392 }
29393 EVT SetCCType =
29394 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29395
29396 // The tables below show the expected result of Max in cases of NaN and
29397 // signed zeros.
29398 //
29399 // Y Y
29400 // Num xNaN +0 -0
29401 // --------------- ---------------
29402 // Num | Max | Y | +0 | +0 | +0 |
29403 // X --------------- X ---------------
29404 // xNaN | X | X/Y | -0 | +0 | -0 |
29405 // --------------- ---------------
29406 //
29407 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
29408 // reordering.
29409 //
29410 // We check if any of operands is NaN and return NaN. Then we check if any of
29411 // operands is zero or negative zero (for fmaximum and fminimum respectively)
29412 // to ensure the correct zero is returned.
29413 auto MatchesZero = [](SDValue Op, APInt Zero) {
29414 Op = peekThroughBitcasts(Op);
29415 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
29416 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
29417 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
29418 return CstOp->getAPIntValue() == Zero;
29419 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
29420 Op->getOpcode() == ISD::SPLAT_VECTOR) {
29421 for (const SDValue &OpVal : Op->op_values()) {
29422 if (OpVal.isUndef())
29423 continue;
29424 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
29425 if (!CstOp)
29426 return false;
29427 if (!CstOp->getValueAPF().isZero())
29428 continue;
29429 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
29430 return false;
29431 }
29432 return true;
29433 }
29434 return false;
29435 };
29436
29437 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
29438 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
29439 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
29440 Op->getFlags().hasNoSignedZeros() ||
29441 DAG.isKnownNeverZeroFloat(X) ||
29442 DAG.isKnownNeverZeroFloat(Y);
29443 SDValue NewX, NewY;
29444 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
29445 MatchesZero(X, OppositeZero)) {
29446 // Operands are already in right order or order does not matter.
29447 NewX = X;
29448 NewY = Y;
29449 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
29450 NewX = Y;
29451 NewY = X;
29452 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
29453 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
29454 if (IsXNeverNaN)
29455 std::swap(X, Y);
29456 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding to
29457 // an xmm register.
29458 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
29459 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
29460 // Bits of classes:
29461 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
29462 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
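// i.e. 0b11 tests for QNaN or +0.0 (used for FMAX) and 0b101 tests for QNaN
// or -0.0 (used for FMIN).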
29463 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
29464 DL, MVT::i32);
29465 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
29466 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
29467 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
29468 DAG.getVectorIdxConstant(0, DL));
29469 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
29470 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
29471 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
29472 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29473 } else {
29474 SDValue IsXSigned;
29475 if (Subtarget.is64Bit() || VT != MVT::f64) {
29476 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29477 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29478 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29479 } else {
29480 assert(VT == MVT::f64);
29481 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29482 DAG.getConstantFP(0, DL, MVT::v2f64), X,
29483 DAG.getVectorIdxConstant(0, DL));
29484 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29485 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29486 DAG.getVectorIdxConstant(1, DL));
29487 Hi = DAG.getBitcast(MVT::i32, Hi);
29488 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29489 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29490 *DAG.getContext(), MVT::i32);
29491 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29492 }
29493 if (MinMaxOp == X86ISD::FMAX) {
29494 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29495 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29496 } else {
29497 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29498 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29499 }
29500 }
29501
29502 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
29503 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
29504
29505 // If we did not need to reorder the operands for signed zero handling, but we
29506 // do need to handle NaN and one of the operands is known not to be NaN, then:
29507 // - For minimum/maximum, put it in the first operand,
29508 // - For minimumnum/maximumnum, put it in the second operand,
29509 // and we will not need to post handle NaN after max/min.
29510 if (IgnoreSignedZero && !IgnoreNaN &&
29511 DAG.isKnownNeverNaN(IsNum ? NewX : NewY))
29512 std::swap(NewX, NewY);
29513
29514 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29515
29516 if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
29517 return MinMax;
29518
29519 if (DAG.isKnownNeverNaN(NewX))
29520 NewX = NewY;
29521
29522 SDValue IsNaN =
29523 DAG.getSetCC(DL, SetCCType, NewX, NewX, IsNum ? ISD::SETO : ISD::SETUO);
29524
29525 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
29526}
29527
29528static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
29529 SelectionDAG &DAG) {
29530 MVT VT = Op.getSimpleValueType();
29531 SDLoc dl(Op);
29532
29533 // For AVX1 cases, split to use legal ops.
29534 if (VT.is256BitVector() && !Subtarget.hasInt256())
29535 return splitVectorIntBinary(Op, DAG, dl);
29536
29537 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
29538 return splitVectorIntBinary(Op, DAG, dl);
29539
29540 bool IsSigned = Op.getOpcode() == ISD::ABDS;
29541 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29542
29543 if (Subtarget.canUseCMOV() && VT.isScalarInteger()) {
29544 X86::CondCode CC = IsSigned ? X86::COND_L : X86::COND_B;
29545 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29546
29547 // abds(lhs, rhs) -> select(slt(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29548 // abdu(lhs, rhs) -> select(ult(lhs,rhs),sub(rhs,lhs),sub(lhs,rhs))
29549 if (VT.bitsGE(MVT::i32)) {
29550 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29551 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
29552 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
29553 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, VTs, LHS, RHS);
29554 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, VTs, RHS, LHS);
29555 return DAG.getNode(X86ISD::CMOV, dl, VT, Diff1, Diff0,
29556 DAG.getTargetConstant(CC, dl, MVT::i8),
29557 Diff1.getValue(1));
29558 }
29559
29560 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
29561 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
29562 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
29563 MVT WideVT = MVT::getIntegerVT(WideBits);
29564 if (TLI.isTypeLegal(WideVT)) {
29565 SDVTList WideVTs = DAG.getVTList(WideVT, MVT::i32);
29566 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
29567 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
29568 SDValue Diff0 = DAG.getNode(X86ISD::SUB, dl, WideVTs, LHS, RHS);
29569 SDValue Diff1 = DAG.getNode(X86ISD::SUB, dl, WideVTs, RHS, LHS);
29570 SDValue AbsDiff = DAG.getNode(X86ISD::CMOV, dl, WideVT, Diff1, Diff0,
29571 DAG.getTargetConstant(CC, dl, MVT::i8),
29572 Diff1.getValue(1));
29573 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
29574 }
29575 }
29576
29577 // Default to expand.
29578 return SDValue();
29579}
29580
29581static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29582 SelectionDAG &DAG) {
29583 SDLoc dl(Op);
29584 MVT VT = Op.getSimpleValueType();
29585
29586 // Decompose 256-bit ops into 128-bit ops.
29587 if (VT.is256BitVector() && !Subtarget.hasInt256())
29588 return splitVectorIntBinary(Op, DAG, dl);
29589
29590 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29591 return splitVectorIntBinary(Op, DAG, dl);
29592
29593 SDValue A = Op.getOperand(0);
29594 SDValue B = Op.getOperand(1);
29595
29596 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29597 // vector pairs, multiply and truncate.
29598 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29599 unsigned NumElts = VT.getVectorNumElements();
29600 unsigned NumLanes = VT.getSizeInBits() / 128;
29601 unsigned NumEltsPerLane = NumElts / NumLanes;
29602
29603 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29604 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29605 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29606 return DAG.getNode(
29607 ISD::TRUNCATE, dl, VT,
29608 DAG.getNode(ISD::MUL, dl, ExVT,
29609 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29610 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29611 }
29612
29613 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29614
29615 // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
29616 // Don't do this if we only need to unpack one half.
29617 if (Subtarget.hasSSSE3()) {
29618 bool BIsBuildVector = isa<BuildVectorSDNode>(B);
29619 bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
29620 bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
29621 if (BIsBuildVector) {
29622 for (auto [Idx, Val] : enumerate(B->ops())) {
29623 if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
29624 IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29625 else
29626 IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
29627 }
29628 }
29629 if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
29630 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
29631 SDValue BLo = DAG.getNode(ISD::AND, dl, VT, Mask, B);
29632 SDValue BHi = DAG.getNode(X86ISD::ANDNP, dl, VT, Mask, B);
29633 SDValue RLo = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BLo);
29634 SDValue RHi = DAG.getNode(X86ISD::VPMADDUBSW, dl, ExVT, A, BHi);
29635 RLo = DAG.getNode(ISD::AND, dl, VT, DAG.getBitcast(VT, RLo), Mask);
29636 RHi = DAG.getNode(X86ISD::VSHLI, dl, ExVT, RHi,
29637 DAG.getTargetConstant(8, dl, MVT::i8));
29638 return DAG.getNode(ISD::OR, dl, VT, RLo, DAG.getBitcast(VT, RHi));
29639 }
29640 }
29641
29642 // Extract the lo/hi parts to any extend to i16.
29643 // We're going to mask off the low byte of each result element of the
29644 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29645 // element.
29646 SDValue Undef = DAG.getUNDEF(VT);
29647 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29648 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29649
29650 SDValue BLo, BHi;
29651 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29652 // If the RHS is a constant, manually unpackl/unpackh.
29653 SmallVector<SDValue, 16> LoOps, HiOps;
29654 for (unsigned i = 0; i != NumElts; i += 16) {
29655 for (unsigned j = 0; j != 8; ++j) {
29656 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29657 MVT::i16));
29658 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29659 MVT::i16));
29660 }
29661 }
29662
29663 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29664 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29665 } else {
29666 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29667 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29668 }
29669
29670 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29671 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29672 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29673 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29674 }
29675
29676 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29677 if (VT == MVT::v4i32) {
29678 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29679 "Should not custom lower when pmulld is available!");
29680
29681 // Extract the odd parts.
29682 static const int UnpackMask[] = {1, 1, 3, 3};
29683 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29684 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29685
29686 // Multiply the even parts.
29687 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29688 DAG.getBitcast(MVT::v2i64, A),
29689 DAG.getBitcast(MVT::v2i64, B));
29690 // Now multiply odd parts.
29691 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29692 DAG.getBitcast(MVT::v2i64, Aodds),
29693 DAG.getBitcast(MVT::v2i64, Bodds));
29694
29695 Evens = DAG.getBitcast(VT, Evens);
29696 Odds = DAG.getBitcast(VT, Odds);
29697
29698 // Merge the two vectors back together with a shuffle. This expands into 2
29699 // shuffles.
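// E.g. with A = <a0,a1,a2,a3> and B = <b0,b1,b2,b3>, Evens (bitcast back to
// v4i32) is <lo(a0*b0), hi(a0*b0), lo(a2*b2), hi(a2*b2)> and Odds is
// <lo(a1*b1), hi(a1*b1), lo(a3*b3), hi(a3*b3)>, so the {0,4,2,6} shuffle picks
// out the four low halves in order.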
29700 static const int ShufMask[] = { 0, 4, 2, 6 };
29701 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29702 }
29703
29704 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29705 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29706 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29707
29708 // Ahi = psrlqi(a, 32);
29709 // Bhi = psrlqi(b, 32);
29710 //
29711 // AloBlo = pmuludq(a, b);
29712 // AloBhi = pmuludq(a, Bhi);
29713 // AhiBlo = pmuludq(Ahi, b);
29714 //
29715 // Hi = psllqi(AloBhi + AhiBlo, 32);
29716 // return AloBlo + Hi;
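//
// This is the usual 32x32->64 decomposition: with a = Ahi*2^32 + Alo and
// b = Bhi*2^32 + Blo, a*b mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32);
// the Ahi*Bhi term is shifted out of the low 64 bits entirely.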
29717 KnownBits AKnown = DAG.computeKnownBits(A);
29718 KnownBits BKnown = DAG.computeKnownBits(B);
29719
29720 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29721 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29722 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29723
29724 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29725 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29726 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29727
29728 SDValue Zero = DAG.getConstant(0, dl, VT);
29729
29730 // Only multiply lo/hi halves that aren't known to be zero.
29731 SDValue AloBlo = Zero;
29732 if (!ALoIsZero && !BLoIsZero)
29733 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29734
29735 SDValue AloBhi = Zero;
29736 if (!ALoIsZero && !BHiIsZero) {
29737 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29738 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29739 }
29740
29741 SDValue AhiBlo = Zero;
29742 if (!AHiIsZero && !BLoIsZero) {
29743 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29744 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29745 }
29746
29747 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29748 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29749
29750 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29751}
29752
29753 static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29754 MVT VT, bool IsSigned,
29755 const X86Subtarget &Subtarget,
29756 SelectionDAG &DAG,
29757 SDValue *Low = nullptr) {
29758 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29759 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29760 // lane results back together.
29761
29762 // We'll take different approaches for signed and unsigned.
29763 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
29764 // words and use pmullw to calculate the full 16-bit product.
29765 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29766 // shift them left into the upper byte of each word. This allows us to use
29767 // pmulhw to calculate the full 16-bit product. This trick means we don't
29768 // need to sign extend the bytes to use pmullw.
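// This works because placing a byte x in the high half of a word gives x*256,
// so (x*256)*(y*256) = (x*y)*65536 and pmulhw's upper 16 bits of the 32-bit
// product are exactly the 16-bit product x*y.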
29769 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29770 SDValue Zero = DAG.getConstant(0, dl, VT);
29771
29772 SDValue ALo, AHi, BLo, BHi;
29773 if (IsSigned) {
29774 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29775 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29776 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29777 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29778 } else {
29779 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29780 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29781 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29782 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29783 }
29784
29785 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29786 // pack back to vXi8.
29787 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29788 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29789 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29790
29791 if (Low)
29792 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29793
29794 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
29795}
29796
29797static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29798 SelectionDAG &DAG) {
29799 SDLoc dl(Op);
29800 MVT VT = Op.getSimpleValueType();
29801 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29802 unsigned NumElts = VT.getVectorNumElements();
29803 SDValue A = Op.getOperand(0);
29804 SDValue B = Op.getOperand(1);
29805
29806 // Decompose 256-bit ops into 128-bit ops.
29807 if (VT.is256BitVector() && !Subtarget.hasInt256())
29808 return splitVectorIntBinary(Op, DAG, dl);
29809
29810 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29811 return splitVectorIntBinary(Op, DAG, dl);
29812
29813 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29814 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29815 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29816 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29817
29818 // PMULxD operations multiply each even value (starting at 0) of LHS with
29819 // the related value of RHS and produce a widened result.
29820 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29821 // => <2 x i64> <ae|cg>
29822 //
29823 // In other words, to have all the results, we need to perform two PMULxD:
29824 // 1. one with the even values.
29825 // 2. one with the odd values.
29826 // To achieve #2, we need to place the odd values at an even position.
29827 //
29828 // Place the odd value at an even position (basically, shift all values 1
29829 // step to the left):
29830 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29831 9, -1, 11, -1, 13, -1, 15, -1};
29832 // <a|b|c|d> => <b|undef|d|undef>
29833 SDValue Odd0 =
29834 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29835 // <e|f|g|h> => <f|undef|h|undef>
29836 SDValue Odd1 =
29837 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29838
29839 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29840 // ints.
29841 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29842 unsigned Opcode =
29843 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29844 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29845 // => <2 x i64> <ae|cg>
29846 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29847 DAG.getBitcast(MulVT, A),
29848 DAG.getBitcast(MulVT, B)));
29849 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29850 // => <2 x i64> <bf|dh>
29851 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29852 DAG.getBitcast(MulVT, Odd0),
29853 DAG.getBitcast(MulVT, Odd1)));
29854
29855 // Shuffle it back into the right order.
29856 SmallVector<int, 16> ShufMask(NumElts);
29857 for (int i = 0; i != (int)NumElts; ++i)
29858 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
29859
29860 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29861
29862 // If we have a signed multiply but no PMULDQ fix up the result of an
29863 // unsigned multiply.
29864 if (IsSigned && !Subtarget.hasSSE41()) {
29865 SDValue Zero = DAG.getConstant(0, dl, VT);
29866 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29867 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29868 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29869 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29870
29871 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29872 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29873 }
29874
29875 return Res;
29876 }
29877
29878 // Only i8 vectors should need custom lowering after this.
29879 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29880 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29881 "Unsupported vector type");
29882
29883 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29884 // logical shift down the upper half and pack back to i8.
29885
29886 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29887 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29888
29889 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29890 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29891 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29892 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29893 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29894 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29895 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29896 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29897 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29898 }
29899
29900 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29901}
29902
29903// Custom lowering for SMULO/UMULO.
29904static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29905 SelectionDAG &DAG) {
29906 MVT VT = Op.getSimpleValueType();
29907
29908 // Scalars defer to LowerXALUO.
29909 if (!VT.isVector())
29910 return LowerXALUO(Op, DAG);
29911
29912 SDLoc dl(Op);
29913 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29914 SDValue A = Op.getOperand(0);
29915 SDValue B = Op.getOperand(1);
29916 EVT OvfVT = Op->getValueType(1);
29917
29918 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29919 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29920 // Extract the LHS Lo/Hi vectors
29921 SDValue LHSLo, LHSHi;
29922 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29923
29924 // Extract the RHS Lo/Hi vectors
29925 SDValue RHSLo, RHSHi;
29926 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29927
29928 EVT LoOvfVT, HiOvfVT;
29929 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29930 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29931 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29932
29933 // Issue the split operations.
29934 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29935 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29936
29937 // Join the separate data results and the overflow results.
29938 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29939 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29940 Hi.getValue(1));
29941
29942 return DAG.getMergeValues({Res, Ovf}, dl);
29943 }
29944
29945 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29946 EVT SetccVT =
29947 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29948
29949 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29950 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29951 unsigned NumElts = VT.getVectorNumElements();
29952 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29953 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29954 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29955 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29956 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29957
29958 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29959
29960 SDValue Ovf;
29961 if (IsSigned) {
29962 SDValue High, LowSign;
29963 if (OvfVT.getVectorElementType() == MVT::i1 &&
29964 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29965 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29966 // Shift the high down filling with sign bits.
29967 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29968 // Fill all 16 bits with the sign bit from the low.
29969 LowSign =
29970 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29971 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29972 15, DAG);
29973 SetccVT = OvfVT;
29974 if (!Subtarget.hasBWI()) {
29975 // We can't do a vXi16 compare so sign extend to v16i32.
29976 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29977 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29978 }
29979 } else {
29980 // Otherwise do the compare at vXi8.
29981 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29982 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29983 LowSign =
29984 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29985 }
29986
29987 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
29988 } else {
29989 SDValue High =
29990 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29991 if (OvfVT.getVectorElementType() == MVT::i1 &&
29992 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29993 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29994 SetccVT = OvfVT;
29995 if (!Subtarget.hasBWI()) {
29996 // We can't do a vXi16 compare so sign extend to v16i32.
29997 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
29998 }
29999 } else {
30000 // Otherwise do the compare at vXi8.
30001 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30002 }
30003
30004 Ovf =
30005 DAG.getSetCC(dl, SetccVT, High,
30006 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30007 }
30008
30009 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30010
30011 return DAG.getMergeValues({Low, Ovf}, dl);
30012 }
30013
30014 SDValue Low;
30015 SDValue High =
30016 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30017
30018 SDValue Ovf;
30019 if (IsSigned) {
30020 // SMULO overflows if the high bits don't match the sign of the low.
30021 SDValue LowSign =
30022 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30023 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30024 } else {
30025 // UMULO overflows if the high bits are non-zero.
30026 Ovf =
30027 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30028 }
30029
30030 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30031
30032 return DAG.getMergeValues({Low, Ovf}, dl);
30033}
30034
30035SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30036 assert(Subtarget.isTargetWin64() && "Unexpected target");
30037 EVT VT = Op.getValueType();
30038 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30039 "Unexpected return type for lowering");
30040
30041 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30042 SmallVector<SDValue> Result;
30043 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30044 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30045 }
30046
30047 RTLIB::Libcall LC;
30048 bool isSigned;
30049 switch (Op->getOpcode()) {
30050 // clang-format off
30051 default: llvm_unreachable("Unexpected request for libcall!");
30052 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30053 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30054 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30055 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30056 // clang-format on
30057 }
30058
30059 SDLoc dl(Op);
30060 SDValue InChain = DAG.getEntryNode();
30061
30062 TargetLowering::ArgListTy Args;
30063 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30064 EVT ArgVT = Op->getOperand(i).getValueType();
30065 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30066 "Unexpected argument type for lowering");
30067 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30068 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30069 MachinePointerInfo MPI =
30070 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30071 InChain =
30072 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30073 Args.emplace_back(StackPtr, PointerType::get(*DAG.getContext(), 0));
30074 }
30075
30076 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30077 getPointerTy(DAG.getDataLayout()));
30078
30079 TargetLowering::CallLoweringInfo CLI(DAG);
30080 CLI.setDebugLoc(dl)
30081 .setChain(InChain)
30082 .setLibCallee(
30083 getLibcallCallingConv(LC),
30084 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30085 std::move(Args))
30086 .setInRegister()
30087 .setSExtResult(isSigned)
30088 .setZExtResult(!isSigned);
30089
30090 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30091 return DAG.getBitcast(VT, CallInfo.first);
30092}
30093
30094SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30095 SelectionDAG &DAG,
30096 SDValue &Chain) const {
30097 assert(Subtarget.isTargetWin64() && "Unexpected target");
30098 EVT VT = Op.getValueType();
30099 bool IsStrict = Op->isStrictFPOpcode();
30100
30101 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30102 EVT ArgVT = Arg.getValueType();
30103
30104 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30105 "Unexpected return type for lowering");
30106
30107 RTLIB::Libcall LC;
30108 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30109 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30110 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30111 else
30112 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30113 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30114
30115 SDLoc dl(Op);
30116 MakeLibCallOptions CallOptions;
30117 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30118
30119 SDValue Result;
30120 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30121 // expected VT (i128).
30122 std::tie(Result, Chain) =
30123 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30124 Result = DAG.getBitcast(VT, Result);
30125 return Result;
30126}
30127
30128SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30129 SelectionDAG &DAG) const {
30130 assert(Subtarget.isTargetWin64() && "Unexpected target");
30131 EVT VT = Op.getValueType();
30132 bool IsStrict = Op->isStrictFPOpcode();
30133
30134 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30135 EVT ArgVT = Arg.getValueType();
30136
30137 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30138 "Unexpected argument type for lowering");
30139
30140 RTLIB::Libcall LC;
30141 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30142 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30143 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30144 else
30145 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30146 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30147
30148 SDLoc dl(Op);
30149 MakeLibCallOptions CallOptions;
30150 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30151
30152 // Pass the i128 argument as an indirect argument on the stack.
30153 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30154 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30155 MachinePointerInfo MPI =
30156 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30157 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30158
30159 SDValue Result;
30160 std::tie(Result, Chain) =
30161 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30162 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30163}
30164
30165// Return true if the required (according to Opcode) shift-imm form is natively
30166// supported by the Subtarget
30167static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
30168 unsigned Opcode) {
30169 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30170 "Unexpected shift opcode");
30171
30172 if (!VT.isSimple())
30173 return false;
30174
30175 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30176 return false;
30177
30178 if (VT.getScalarSizeInBits() < 16)
30179 return false;
30180
30181 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30182 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30183 return true;
30184
30185 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30186 (VT.is256BitVector() && Subtarget.hasInt256());
30187
30188 bool AShift = LShift && (Subtarget.hasAVX512() ||
30189 (VT != MVT::v2i64 && VT != MVT::v4i64));
30190 return (Opcode == ISD::SRA) ? AShift : LShift;
30191}
30192
30193// The shift amount is a variable, but it is the same for all vector lanes.
30194// These instructions are defined together with shift-immediate.
30195static
30196 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
30197 unsigned Opcode) {
30198 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30199}
30200
30201// Return true if the required (according to Opcode) variable-shift form is
30202// natively supported by the Subtarget
30203static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
30204 unsigned Opcode) {
30205 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
30206 "Unexpected shift opcode");
30207
30208 if (!VT.isSimple())
30209 return false;
30210
30211 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30212 return false;
30213
30214 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30215 return false;
30216
30217 // vXi16 supported only on AVX-512, BWI
30218 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30219 return false;
30220
30221 if (Subtarget.hasAVX512() &&
30222 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30223 return true;
30224
30225 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30226 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30227 return (Opcode == ISD::SRA) ? AShift : LShift;
30228}
30229
30230 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30231 const X86Subtarget &Subtarget) {
30232 MVT VT = Op.getSimpleValueType();
30233 SDLoc dl(Op);
30234 SDValue R = Op.getOperand(0);
30235 SDValue Amt = Op.getOperand(1);
30236 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30237 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30238
30239 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30240 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30241 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30242 SDValue Ex = DAG.getBitcast(ExVT, R);
30243
30244 // ashr(R, 63) === cmp_slt(R, 0)
30245 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30246 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30247 "Unsupported PCMPGT op");
30248 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30249 }
30250
30251 if (ShiftAmt >= 32) {
30252 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
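// e.g. for a v2i64 ashr by 40: the high i32 of each i64 is arithmetically
// shifted by 8 to produce the new low i32, a 31-bit arithmetic shift of the
// same high i32 splats the sign to produce the new high i32, and the shuffle
// below interleaves the two results back into i64 lanes.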
30253 SDValue Upper =
30254 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30255 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30256 ShiftAmt - 32, DAG);
30257 if (VT == MVT::v2i64)
30258 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30259 if (VT == MVT::v4i64)
30260 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30261 {9, 1, 11, 3, 13, 5, 15, 7});
30262 } else {
30263 // SRA upper i32, SRL whole i64 and select lower i32.
30264 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30265 ShiftAmt, DAG);
30266 SDValue Lower =
30267 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30268 Lower = DAG.getBitcast(ExVT, Lower);
30269 if (VT == MVT::v2i64)
30270 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30271 if (VT == MVT::v4i64)
30272 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30273 {8, 1, 10, 3, 12, 5, 14, 7});
30274 }
30275 return DAG.getBitcast(VT, Ex);
30276 };
30277
30278 // Optimize shl/srl/sra with constant shift amount.
30279 APInt APIntShiftAmt;
30280 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30281 return SDValue();
30282
30283 // If the shift amount is out of range, return undef.
30284 if (APIntShiftAmt.uge(EltSizeInBits))
30285 return DAG.getUNDEF(VT);
30286
30287 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30288
30289 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
30290 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30291
30292 // i64 SRA needs to be performed as partial shifts.
30293 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30294 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30295 Op.getOpcode() == ISD::SRA)
30296 return ArithmeticShiftRight64(ShiftAmt);
30297
30298 // If we're logical shifting an all-signbits value then we can just perform
30299 // this as a mask.
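// Each element of R is then 0 or all-ones, so shifting R equals shifting an
// all-ones constant and ANDing the result back with R (e.g. for vXi8,
// shl(0xFF, 3) = 0xF8 and R & 0xF8 == shl(R, 3) when R is 0x00 or 0xFF).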
30300 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
30301 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
30302 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
30303 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
30304 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
30305 }
30306
30307 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30308 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30309 unsigned NumElts = VT.getVectorNumElements();
30310 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30311
30312 // Simple i8 add case
30313 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30314 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30315 // must be 0). (add undef, undef) however can be any value. To make this
30316 // safe, we must freeze R to ensure that register allocation uses the same
30317 // register for an undefined value. This ensures that the result will
30318 // still be even and preserves the original semantics.
30319 R = DAG.getFreeze(R);
30320 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30321 }
30322
30323 // ashr(R, 7) === cmp_slt(R, 0)
30324 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30325 SDValue Zeros = DAG.getConstant(0, dl, VT);
30326 if (VT.is512BitVector()) {
30327 assert(VT == MVT::v64i8 && "Unexpected element type!");
30328 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30329 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30330 }
30331 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30332 }
30333
30334 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30335 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30336 return SDValue();
30337
30338 if (Subtarget.hasGFNI()) {
30339 SDValue Mask = getGFNICtrlMask(Op.getOpcode(), DAG, dl, VT, ShiftAmt);
30340 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
30341 DAG.getTargetConstant(0, dl, MVT::i8));
30342 }
30343
30344 if (Op.getOpcode() == ISD::SHL) {
30345 // Make a large shift.
30346 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30347 ShiftAmt, DAG);
30348 SHL = DAG.getBitcast(VT, SHL);
30349 // Zero out the rightmost bits.
30350 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30351 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30352 }
30353 if (Op.getOpcode() == ISD::SRL) {
30354 // Make a large shift.
30355 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30356 ShiftAmt, DAG);
30357 SRL = DAG.getBitcast(VT, SRL);
30358 // Zero out the leftmost bits.
30359 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30360 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30361 }
30362 if (Op.getOpcode() == ISD::SRA) {
30363 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
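// With Mask = 128 >> ShiftAmt the shifted-down sign bit is flipped by the XOR
// and then propagated upwards by the SUB, e.g. for ShiftAmt = 3 and R = 0xF0:
// lshr gives 0x1E, xor 0x10 gives 0x0E, sub 0x10 gives 0xFE == ashr(0xF0, 3).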
30364 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30365
30366 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30367 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30368 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30369 return Res;
30370 }
30371 llvm_unreachable("Unknown shift opcode.");
30372 }
30373
30374 return SDValue();
30375}
30376
30377 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30378 const X86Subtarget &Subtarget) {
30379 MVT VT = Op.getSimpleValueType();
30380 SDLoc dl(Op);
30381 SDValue R = Op.getOperand(0);
30382 SDValue Amt = Op.getOperand(1);
30383 unsigned Opcode = Op.getOpcode();
30384 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30385
30386 int BaseShAmtIdx = -1;
30387 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30388 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30389 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30390 Subtarget, DAG);
30391
30392 // vXi8 shifts - shift as v8i16 + mask result.
30393 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30394 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30395 VT == MVT::v64i8) &&
30396 !Subtarget.hasXOP()) {
30397 unsigned NumElts = VT.getVectorNumElements();
30398 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30399 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30400 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30401 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30402
30403 // Create the mask using vXi16 shifts. For shift-rights we need to move
30404 // the upper byte down before splatting the vXi8 mask.
30405 SDValue BitMask = DAG.getAllOnesConstant(dl, ExtVT);
30406 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30407 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30408 if (Opcode != ISD::SHL)
30409 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30410 8, DAG);
30411 BitMask = DAG.getBitcast(VT, BitMask);
30412 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30413 SmallVector<int, 64>(NumElts, 0));
30414
30415 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30416 DAG.getBitcast(ExtVT, R), BaseShAmt,
30417 BaseShAmtIdx, Subtarget, DAG);
30418 Res = DAG.getBitcast(VT, Res);
30419 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30420
30421 if (Opcode == ISD::SRA) {
30422 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30423 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30424 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30425 SignMask =
30426 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30427 BaseShAmtIdx, Subtarget, DAG);
30428 SignMask = DAG.getBitcast(VT, SignMask);
30429 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30430 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30431 }
30432 return Res;
30433 }
30434 }
30435 }
30436
30437 return SDValue();
30438}
30439
30440// Convert a shift/rotate left amount to a multiplication scale factor.
30441 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30442 const X86Subtarget &Subtarget,
30443 SelectionDAG &DAG) {
30444 MVT VT = Amt.getSimpleValueType();
30445 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30446 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30447 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30448 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30449 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30450 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30451 return SDValue();
30452
30453 MVT SVT = VT.getVectorElementType();
30454 unsigned SVTBits = SVT.getSizeInBits();
30455 unsigned NumElems = VT.getVectorNumElements();
30456
30457 APInt UndefElts;
30458 SmallVector<APInt> EltBits;
30459 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30460 APInt One(SVTBits, 1);
30461 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30462 for (unsigned I = 0; I != NumElems; ++I) {
30463 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30464 continue;
30465 uint64_t ShAmt = EltBits[I].getZExtValue();
30466 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30467 }
30468 return DAG.getBuildVector(VT, dl, Elts);
30469 }
30470
30471 // If the target doesn't support variable shifts, use either FP conversion
30472 // or integer multiplication to avoid shifting each element individually.
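// For v4i32 the code below builds the float 2^Amt directly in its IEEE-754 bit
// pattern: (Amt << 23) places Amt in the exponent field and adding 0x3f800000
// (1.0f, biased exponent 127) yields 2^Amt, which FP_TO_SINT then converts to
// the integer scale factor.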
30473 if (VT == MVT::v4i32) {
30474 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30475 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30476 DAG.getConstant(0x3f800000U, dl, VT));
30477 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30478 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30479 }
30480
30481 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30482 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30483 SDValue Z = DAG.getConstant(0, dl, VT);
30484 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30485 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30486 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30487 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30488 if (Subtarget.hasSSE41())
30489 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30490 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30491 }
30492
30493 return SDValue();
30494}
30495
30496static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30497 SelectionDAG &DAG) {
30498 MVT VT = Op.getSimpleValueType();
30499 SDLoc dl(Op);
30500 SDValue R = Op.getOperand(0);
30501 SDValue Amt = Op.getOperand(1);
30502 unsigned NumElts = VT.getVectorNumElements();
30503 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30504 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30505
30506 unsigned Opc = Op.getOpcode();
30507 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30508 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30509
30510 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30511 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30512
30513 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30514 return V;
30515
30516 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30517 return V;
30518
30519 if (supportedVectorVarShift(VT, Subtarget, Opc))
30520 return Op;
30521
30522 // i64 vector arithmetic shift can be emulated with the transform:
30523 // M = lshr(SIGN_MASK, Amt)
30524 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30525 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30526 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30527 Opc == ISD::SRA) {
30528 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30529 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30530 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30531 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30532 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30533 return R;
30534 }
30535
30536 // XOP has 128-bit variable logical/arithmetic shifts.
30537 // +ve/-ve Amt = shift left/right.
30538 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30539 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30540 if (Opc == ISD::SRL || Opc == ISD::SRA)
30541 Amt = DAG.getNegative(Amt, dl, VT);
30542 if (Opc == ISD::SHL || Opc == ISD::SRL)
30543 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30544 if (Opc == ISD::SRA)
30545 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30546 }
30547
30548 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
30549 // shifts per-lane and then shuffle the partial results back together.
30550 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30551 // Splat the shift amounts so the scalar shifts above will catch it.
30552 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30553 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30554 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30555 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30556 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30557 }
30558
30559 // Build a map of in-range constant amounts with element mask where they occur.
30560 SmallDenseMap<unsigned, APInt, 8> UniqueCstAmt;
30561 if (ConstantAmt) {
30562 for (unsigned I = 0; I != NumElts; ++I) {
30563 SDValue A = Amt.getOperand(I);
30564 if (A.isUndef() || A->getAsAPIntVal().uge(EltSizeInBits))
30565 continue;
30566 unsigned CstAmt = A->getAsAPIntVal().getZExtValue();
30567 auto [It, Inserted] = UniqueCstAmt.try_emplace(CstAmt);
30568 if (!Inserted) {
30569 It->second.setBit(I);
30570 continue;
30571 }
30572 It->second = APInt::getOneBitSet(NumElts, I);
30573 }
30574 assert(!UniqueCstAmt.empty() && "Illegal constant shift amounts");
30575 }
30576
30577 // If possible, lower this shift as a sequence of two shifts by
30578 // constant plus a BLENDing shuffle instead of scalarizing it.
30579 // Example:
30580 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30581 //
30582 // Could be rewritten as:
30583 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30584 //
30585 // The advantage is that the two shifts from the example would be
30586 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30587 if (UniqueCstAmt.size() == 2 &&
30588 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30589 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30590 unsigned AmtA = UniqueCstAmt.begin()->first;
30591 unsigned AmtB = std::next(UniqueCstAmt.begin())->first;
30592 const APInt &MaskA = UniqueCstAmt.begin()->second;
30593 const APInt &MaskB = std::next(UniqueCstAmt.begin())->second;
30594 SmallVector<int, 8> ShuffleMask(NumElts, SM_SentinelUndef);
30595 for (unsigned I = 0; I != NumElts; ++I) {
30596 if (MaskA[I])
30597 ShuffleMask[I] = I;
30598 if (MaskB[I])
30599 ShuffleMask[I] = I + NumElts;
30600 }
30601
30602 // Only perform this blend if we can perform it without loading a mask.
30603 if ((VT != MVT::v16i16 ||
30604 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30605 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30606 canWidenShuffleElements(ShuffleMask))) {
30607 SDValue Shift1 =
30608 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
30609 SDValue Shift2 =
30610 DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
30611 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30612 }
30613 }
30614
30615 // Constant ISD::SRA/SRL/SHL can be performed efficiently on vXiN vectors by
30616 // using vYiM vector operations where X*N == Y*M and M > N.
30617 if (ConstantAmt &&
30618 (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
30619 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) &&
30620 !Subtarget.hasXOP()) {
30621 MVT NarrowScalarVT = VT.getScalarType();
30622 // We can do this extra fast if each pair of narrow elements is shifted by
30623 // the same amount, SWAR style: use a single wide shift to move the valid
30624 // bits into position, then mask out any bits that crossed from one narrow
30625 // element into the next.
30626 // This optimized lowering is only valid if the elements in a pair can
30627 // be treated identically.
30628 SmallVector<SDValue, 32> AmtWideElts(Amt->ops());
30629 SmallVector<SDValue, 32> TmpAmtWideElts;
30630 int WideEltSizeInBits = EltSizeInBits;
30631 while (WideEltSizeInBits < 32) {
30632 // AVX1 does not have psrlvd, etc. which makes interesting 32-bit shifts
30633 // unprofitable.
30634 if (WideEltSizeInBits >= 16 && !Subtarget.hasAVX2()) {
30635 break;
30636 }
30637 TmpAmtWideElts.resize(AmtWideElts.size() / 2);
30638 bool SameShifts = true;
30639 for (unsigned SrcI = 0, E = AmtWideElts.size(); SrcI != E; SrcI += 2) {
30640 unsigned DstI = SrcI / 2;
30641 // Both elements are undef? Make a note and keep going.
30642 if (AmtWideElts[SrcI].isUndef() && AmtWideElts[SrcI + 1].isUndef()) {
30643 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30644 continue;
30645 }
30646 // Even element is undef? We will shift it by the same shift amount as
30647 // the odd element.
30648 if (AmtWideElts[SrcI].isUndef()) {
30649 TmpAmtWideElts[DstI] = AmtWideElts[SrcI + 1];
30650 continue;
30651 }
30652 // Odd element is undef? We will shift it by the same shift amount as
30653 // the even element.
30654 if (AmtWideElts[SrcI + 1].isUndef()) {
30655 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30656 continue;
30657 }
30658 // Both elements are equal.
30659 if (AmtWideElts[SrcI].getNode()->getAsAPIntVal() ==
30660 AmtWideElts[SrcI + 1].getNode()->getAsAPIntVal()) {
30661 TmpAmtWideElts[DstI] = AmtWideElts[SrcI];
30662 continue;
30663 }
30664 // One of the provisional wide elements will not have the same shift
30665 // amount. Let's bail.
30666 SameShifts = false;
30667 break;
30668 }
30669 if (!SameShifts) {
30670 break;
30671 }
30672 WideEltSizeInBits *= 2;
30673 std::swap(TmpAmtWideElts, AmtWideElts);
30674 }
30675 APInt APIntShiftAmt;
30676 bool IsConstantSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
30677 bool Profitable = WidenShift;
30678 // AVX512BW brings support for vpsllvw.
30679 if (WideEltSizeInBits * AmtWideElts.size() >= 512 &&
30680 WideEltSizeInBits < 32 && !Subtarget.hasBWI()) {
30681 Profitable = false;
30682 }
30683 // Leave AVX512 uniform arithmetic shifts alone, they can be implemented
30684 // fairly cheaply in other ways.
30685 if (WideEltSizeInBits * AmtWideElts.size() >= 512 && IsConstantSplat) {
30686 Profitable = false;
30687 }
30688 // Leave it up to GFNI if we have it around.
30689 // TODO: gf2p8affine is usually higher latency and more port restricted. It
30690 // is probably a win to use other strategies in some cases.
30691 if (EltSizeInBits == 8 && Subtarget.hasGFNI()) {
30692 Profitable = false;
30693 }
30694
30695 // AVX1 does not have vpand which makes our masking impractical. It does
30696 // have vandps but that is an FP instruction and crossing FP<->int typically
30697 // has some cost.
30698 if (WideEltSizeInBits * AmtWideElts.size() >= 256 &&
30699 (WideEltSizeInBits < 32 || IsConstantSplat) && !Subtarget.hasAVX2()) {
30700 Profitable = false;
30701 }
30702 unsigned WideNumElts = AmtWideElts.size();
30703 // We are only dealing with identical pairs.
30704 if (Profitable && WideNumElts != NumElts) {
30705 MVT WideScalarVT = MVT::getIntegerVT(WideEltSizeInBits);
30706 MVT WideVT = MVT::getVectorVT(WideScalarVT, WideNumElts);
30707 // Cast the operand to vXiM.
30708 SDValue RWide = DAG.getBitcast(WideVT, R);
30709 // Create our new vector of shift amounts.
30710 SDValue AmtWide = DAG.getBuildVector(
30711 MVT::getVectorVT(NarrowScalarVT, WideNumElts), dl, AmtWideElts);
30712 AmtWide = DAG.getZExtOrTrunc(AmtWide, dl, WideVT);
30713 // Perform the actual shift.
30714 unsigned LogicalOpc = Opc == ISD::SRA ? (unsigned)ISD::SRL : Opc;
30715 SDValue ShiftedR = DAG.getNode(LogicalOpc, dl, WideVT, RWide, AmtWide);
30716 // Now we need to construct a mask which will "drop" bits that get
30717 // shifted past the LSB/MSB. For a logical shift left, it will look
30718 // like:
30719 // FullMask = (1 << EltSizeInBits) - 1
30720 // Mask = FullMask << Amt
30721 //
30722 // This masking ensures that bits cannot migrate from one narrow lane to
30723 // another. The construction of this mask will be constant folded.
30724 // The mask for a logical right shift is nearly identical, the only
30725 // difference is that the all ones mask is shifted right instead of left.
30726 SDValue SplatFullMask = DAG.getAllOnesConstant(dl, VT);
30727 SDValue Mask = DAG.getNode(LogicalOpc, dl, VT, SplatFullMask, Amt);
30728 Mask = DAG.getBitcast(WideVT, Mask);
30729 // Finally, we mask the shifted vector with the SWAR mask.
30730 SDValue Masked = DAG.getNode(ISD::AND, dl, WideVT, ShiftedR, Mask);
30731 Masked = DAG.getBitcast(VT, Masked);
30732 if (Opc != ISD::SRA) {
30733 // Logical shifts are complete at this point.
30734 return Masked;
30735 }
30736 // At this point, we have done a *logical* shift right. We now need to
30737 // sign extend the result so that we get behavior equivalent to an
30738 // arithmetic shift right. Post-shifting by AmtWide, our narrow elements
30739 // are `EltSizeInBits-AmtWide` bits wide.
30740 //
30741 // To convert our `EltSizeInBits-AmtWide` bit unsigned numbers to signed
30742 // numbers as wide as `EltSizeInBits`, we need to replicate the bit at
30743 // position `EltSizeInBits-AmtWide` into the MSBs of each narrow lane. We
30744 // can use the following trick to accomplish this:
30745 // SignBitMask = 1 << (EltSizeInBits-AmtWide-1)
30746 // (Masked ^ SignBitMask) - SignBitMask
30747 //
30748 // When the sign bit is already clear, this will compute:
30749 // Masked + SignBitMask - SignBitMask
30750 //
30751 // This is equal to Masked which is what we want: the sign bit was clear
30752 // so sign extending should be a no-op.
30753 //
30754 // When the sign bit is set, this will compute:
30755 // Masked - SignBitmask - SignBitMask
30756 //
30757 // This is equal to Masked - 2*SignBitMask which will correctly sign
30758 // extend our result.
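// For example with EltSizeInBits = 8, a per-lane shift amount of 3 and a lane
// value of 0xF0: the logical shift and mask give 0x1E, SignBitMask = 0x10, and
// (0x1E ^ 0x10) - 0x10 = 0xFE, i.e. the arithmetic shift of -16 by 3.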
30759 SDValue SplatHighBit =
30760 DAG.getConstant(APInt::getSignMask(EltSizeInBits), dl, VT);
30761 // This does not induce recursion, all operands are constants.
30762 SDValue SignBitMask = DAG.getNode(LogicalOpc, dl, VT, SplatHighBit, Amt);
30763 SDValue FlippedSignBit =
30764 DAG.getNode(ISD::XOR, dl, VT, Masked, SignBitMask);
30765 SDValue Subtraction =
30766 DAG.getNode(ISD::SUB, dl, VT, FlippedSignBit, SignBitMask);
30767 return Subtraction;
30768 }
30769 }
30770
30771 // If possible, lower this packed shift into a vector multiply instead of
30772 // expanding it into a sequence of scalar shifts.
30773 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30774 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30775 Subtarget.canExtendTo512BW())))
30776 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30777 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30778
30779 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30780 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
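// i.e. srl(x, Amt) == mulhu(x, 1 << (16 - Amt)) for 16-bit elements, e.g.
// srl(x, 3) == (x * 0x2000) >> 16. Amt == 0 would need a 2^16 scale, so that
// case is handled by the select below.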
30781 if (Opc == ISD::SRL && ConstantAmt &&
30782 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30783 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30784 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30785 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30786 SDValue Zero = DAG.getConstant(0, dl, VT);
30787 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30788 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30789 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30790 }
30791 }
30792
30793 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30794 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30795 // TODO: Special case handling for shift by 0/1, really we can afford either
30796 // of these cases in pre-SSE41/XOP/AVX512 but not both.
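// Shifts by 0 and 1 would need scale factors of 2^16 and 2^15, neither of
// which is representable as a positive i16 multiplicand for MULHS, hence the
// two selects below.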
30797 if (Opc == ISD::SRA && ConstantAmt &&
30798 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30799 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30800 !Subtarget.hasAVX512()) ||
30801 DAG.isKnownNeverZero(Amt))) {
30802 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30803 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30804 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30805 SDValue Amt0 =
30806 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30807 SDValue Amt1 =
30808 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30809 SDValue Sra1 =
30810 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30811 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30812 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30813 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30814 }
30815 }
30816
30817 // v4i32 Non Uniform Shifts.
30818 // If the shift amount is constant we can shift each lane using the SSE2
30819 // immediate shifts, else we need to zero-extend each lane to the lower i64
30820 // and shift using the SSE2 variable shifts.
30821 // The separate results can then be blended together.
30822 if (VT == MVT::v4i32) {
30823 SDValue Amt0, Amt1, Amt2, Amt3;
30824 if (ConstantAmt) {
30825 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30826 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30827 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30828 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30829 } else {
30830 // The SSE2 shifts use the lower i64 as the same shift amount for
30831 // all lanes and the upper i64 is ignored. On AVX we're better off
30832 // just zero-extending, but for SSE just duplicating the top 16-bits is
30833 // cheaper and has the same effect for out of range values.
30834 if (Subtarget.hasAVX()) {
30835 SDValue Z = DAG.getConstant(0, dl, VT);
30836 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30837 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30838 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30839 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30840 } else {
30841 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30842 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30843 {4, 5, 6, 7, -1, -1, -1, -1});
30844 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30845 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30846 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30847 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30848 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30849 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30850 }
30851 }
30852
30853 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30854 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30855 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30856 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30857 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30858
30859 // Merge the shifted lane results optimally with/without PBLENDW.
30860 // TODO - ideally shuffle combining would handle this.
30861 if (Subtarget.hasSSE41()) {
30862 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30863 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30864 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30865 }
30866 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30867 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30868 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30869 }
30870
30871 // If we're shifting (per-lane) uniform vXi8 constants, we can use PSHUFB to
30872 // look up the pre-computed shift values.
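// PSHUFB is used with the precomputed table as the first operand and the shift
// amounts as the byte indices: each 16-byte lane of the table holds LaneSplat
// shifted by 0..7 (then zeros), so an in-range amount selects the
// corresponding pre-shifted constant.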
30873 if ((VT == MVT::v16i8 && Subtarget.hasSSSE3()) ||
30874 (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30875 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30876 unsigned NumLanes = VT.getSizeInBits() / 128u;
30877 unsigned NumEltsPerLane = NumElts / NumLanes;
30878 SmallVector<APInt, 64> LUT;
30879 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
30880 unsigned LoElt = Lane * NumEltsPerLane;
30881 APInt EltMask = APInt::getBitsSet(NumElts, LoElt, LoElt + NumEltsPerLane);
30882 KnownBits KnownLane = DAG.computeKnownBits(R, EltMask);
30883 if (!KnownLane.isConstant())
30884 break;
30885 const APInt &LaneSplat = KnownLane.getConstant();
30886 for (unsigned I = 0; I != 8; ++I) {
30887 if (Opc == ISD::SHL)
30888 LUT.push_back(LaneSplat.shl(I));
30889 else if (Opc == ISD::SRL)
30890 LUT.push_back(LaneSplat.lshr(I));
30891 else if (Opc == ISD::SRA)
30892 LUT.push_back(LaneSplat.ashr(I));
30893 }
30894 LUT.append(8, APInt::getZero(8));
30895 }
30896 if (LUT.size() == NumElts) {
30897 APInt Undefs = APInt::getSplat(NumElts, APInt(16, 0xFF00));
30898 SDValue Mask = getConstVector(LUT, Undefs, VT, DAG, dl);
30899 return DAG.getNode(X86ISD::PSHUFB, dl, VT, Mask, Amt);
30900 }
30901 }
30902
30903 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30904 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30905 // make the existing SSE solution better.
30906 // NOTE: We honor preferred vector width before promoting to 512-bits.
30907 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30908 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30909 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30910 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30911 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30912 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30913 "Unexpected vector type");
30914 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30915 MVT ExtVT = MVT::getVectorVT(EvtSVT, NumElts);
30916 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30917 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30918 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30919 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30920 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30921 }
30922
30923 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30924 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
30925 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30926 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30927 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30928 !Subtarget.hasXOP()) {
30929 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30930 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30931
30932 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30933 // isn't legal).
30934 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30935 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30936 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30937 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30939 "Constant build vector expected");
30940
30941 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30942 bool IsSigned = Opc == ISD::SRA;
30943 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
30944 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30945 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30946 return DAG.getZExtOrTrunc(R, dl, VT);
30947 }
30948
30949 SmallVector<SDValue, 16> LoAmt, HiAmt;
30950 for (unsigned i = 0; i != NumElts; i += 16) {
30951 for (int j = 0; j != 8; ++j) {
30952 LoAmt.push_back(Amt.getOperand(i + j));
30953 HiAmt.push_back(Amt.getOperand(i + j + 8));
30954 }
30955 }
30956
30957 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30958 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30959
30960 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30961 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30962 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30963 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30964 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30965 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30966 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30967 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30968 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30969 }
30970
30971 if (VT == MVT::v16i8 ||
30972 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30973 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30974 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30975
30976 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30977 if (VT.is512BitVector()) {
30978 // On AVX512BW targets we make use of the fact that VSELECT lowers
30979 // to a masked blend which selects bytes based just on the sign bit
30980 // extracted to a mask.
30981 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
30982 V0 = DAG.getBitcast(VT, V0);
30983 V1 = DAG.getBitcast(VT, V1);
30984 Sel = DAG.getBitcast(VT, Sel);
30985 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
30986 ISD::SETGT);
30987 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
30988 } else if (Subtarget.hasSSE41()) {
30989 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30990 // on the sign bit.
30991 V0 = DAG.getBitcast(VT, V0);
30992 V1 = DAG.getBitcast(VT, V1);
30993 Sel = DAG.getBitcast(VT, Sel);
30994 return DAG.getBitcast(SelVT,
30995 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
30996 }
30997 // On pre-SSE41 targets we test for the sign bit by comparing to
30998 // zero - a negative value will set all bits of the lanes to true
30999 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31000 SDValue Z = DAG.getConstant(0, dl, SelVT);
31001 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31002 return DAG.getSelect(dl, SelVT, C, V0, V1);
31003 };
31004
31005 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31006 // We can safely do this using i16 shifts as we're only interested in
31007 // the 3 lower bits of each byte.
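// After the <<5, bit 2 of each original amount sits in the byte's sign bit;
// each 'a += a' below promotes the next lower bit, so the three selects
// conditionally apply shifts of 4, 2 and 1 to build any amount 0-7.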
31008 Amt = DAG.getBitcast(ExtVT, Amt);
31009 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31010 Amt = DAG.getBitcast(VT, Amt);
31011
31012 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31013 // r = VSELECT(r, shift(r, 4), a);
31014 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31015 R = SignBitSelect(VT, Amt, M, R);
31016
31017 // a += a
31018 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31019
31020 // r = VSELECT(r, shift(r, 2), a);
31021 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31022 R = SignBitSelect(VT, Amt, M, R);
31023
31024 // a += a
31025 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31026
31027 // return VSELECT(r, shift(r, 1), a);
31028 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31029 R = SignBitSelect(VT, Amt, M, R);
31030 return R;
31031 }
31032
31033 if (Opc == ISD::SRA) {
31034 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31035 // so we can correctly sign extend. We don't care what happens to the
31036 // lower byte.
31037 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31038 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31039 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31040 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31041 ALo = DAG.getBitcast(ExtVT, ALo);
31042 AHi = DAG.getBitcast(ExtVT, AHi);
31043 RLo = DAG.getBitcast(ExtVT, RLo);
31044 RHi = DAG.getBitcast(ExtVT, RHi);
31045
31046 // r = VSELECT(r, shift(r, 4), a);
31047 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31048 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31049 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31050 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31051
31052 // a += a
31053 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31054 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31055
31056 // r = VSELECT(r, shift(r, 2), a);
31057 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31058 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31059 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31060 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31061
31062 // a += a
31063 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31064 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31065
31066 // r = VSELECT(r, shift(r, 1), a);
31067 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31068 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31069 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31070 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31071
31072 // Logical shift the result back to the lower byte, leaving a zero upper
31073 // byte meaning that we can safely pack with PACKUSWB.
31074 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31075 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31076 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31077 }
31078 }
31079
31080 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31081 MVT ExtVT = MVT::v8i32;
31082 SDValue Z = DAG.getConstant(0, dl, VT);
31083 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31084 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31085 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31086 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31087 ALo = DAG.getBitcast(ExtVT, ALo);
31088 AHi = DAG.getBitcast(ExtVT, AHi);
31089 RLo = DAG.getBitcast(ExtVT, RLo);
31090 RHi = DAG.getBitcast(ExtVT, RHi);
31091 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31092 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31093 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31094 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31095 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31096 }
31097
31098 if (VT == MVT::v8i16) {
31099 // If we have a constant shift amount, the non-SSE41 path is best as
31100 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
31101 bool UseSSE41 = Subtarget.hasSSE41() &&
31102 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31103
31104 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31105 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31106 // the sign bit.
31107 if (UseSSE41) {
31108 MVT ExtVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
31109 V0 = DAG.getBitcast(ExtVT, V0);
31110 V1 = DAG.getBitcast(ExtVT, V1);
31111 Sel = DAG.getBitcast(ExtVT, Sel);
31112 return DAG.getBitcast(
31113 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31114 }
31115 // On pre-SSE41 targets we splat the sign bit - a negative value will
31116 // set all bits of the lanes to true and VSELECT uses that in
31117 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31118 SDValue C =
31119 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31120 return DAG.getSelect(dl, VT, C, V0, V1);
31121 };
31122
31123 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31124 if (UseSSE41) {
31125 // On SSE41 targets we need to replicate the shift mask in both
31126 // bytes for PBLENDVB.
31127 Amt = DAG.getNode(
31128 ISD::OR, dl, VT,
31129 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31130 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31131 } else {
31132 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31133 }
31134
31135 // r = VSELECT(r, shift(r, 8), a);
31136 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31137 R = SignBitSelect(Amt, M, R);
31138
31139 // a += a
31140 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31141
31142 // r = VSELECT(r, shift(r, 4), a);
31143 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31144 R = SignBitSelect(Amt, M, R);
31145
31146 // a += a
31147 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31148
31149 // r = VSELECT(r, shift(r, 2), a);
31150 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31151 R = SignBitSelect(Amt, M, R);
31152
31153 // a += a
31154 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31155
31156 // return VSELECT(r, shift(r, 1), a);
31157 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31158 R = SignBitSelect(Amt, M, R);
31159 return R;
31160 }
31161
31162 // Decompose 256-bit shifts into 128-bit shifts.
31163 if (VT.is256BitVector())
31164 return splitVectorIntBinary(Op, DAG, dl);
31165
31166 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31167 return splitVectorIntBinary(Op, DAG, dl);
31168
31169 return SDValue();
31170}
31171
31172 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31173 SelectionDAG &DAG) {
31174 MVT VT = Op.getSimpleValueType();
31175 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31176 "Unexpected funnel shift opcode!");
31177
31178 SDLoc DL(Op);
31179 SDValue Op0 = Op.getOperand(0);
31180 SDValue Op1 = Op.getOperand(1);
31181 SDValue Amt = Op.getOperand(2);
31182 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31183 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31184
31185 if (VT.isVector()) {
31186 APInt APIntShiftAmt;
31187 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31188 unsigned NumElts = VT.getVectorNumElements();
31189
31190 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31191
31192 if (IsCstSplat) {
31193 if (IsFSHR)
31194 std::swap(Op0, Op1);
31195 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31196 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31197 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31198 {Op0, Op1, Imm}, DAG, Subtarget);
31199 }
31200 return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
31201 {Op0, Op1, Amt}, DAG, Subtarget);
31202 }
31203 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31204 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31205 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31206 "Unexpected funnel shift type!");
31207
31208 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
31209 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
31210 if (IsCstSplat) {
31211 // TODO: Can't use generic expansion as UNDEF amt elements can be
31212 // converted to other values when folded to shift amounts, losing the
31213 // splat.
31214 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31215 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
31216 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
31217 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
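// e.g. vXi8 fshl(x, y, 3): ShXAmt = 3, ShYAmt = 5, producing (x << 3) | (y >> 5).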
31218 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31219
31220 if (EltSizeInBits == 8 &&
31221 (Subtarget.hasXOP() ||
31222 (useVPTERNLOG(Subtarget, VT) &&
31223 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
31224 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
31225 // bit-select - lower using vXi16 shifts and then perform the bitmask at
31226 // the original vector width to handle cases where we split.
31227 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
31228 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
31229 SDValue ShX =
31230 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
31231 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
31232 SDValue ShY =
31233 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
31234 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
31235 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
31236 DAG.getConstant(MaskX, DL, VT));
31237 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
31238 DAG.getConstant(MaskY, DL, VT));
31239 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31240 }
31241
31242 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
31243 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
31244 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
31245 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
31246 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
31247 }
31248
31249 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31250 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31251 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31252
31253 // Constant vXi16 funnel shifts can be efficiently handled by default.
31254 if (IsCst && EltSizeInBits == 16)
31255 return SDValue();
31256
31257 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31258 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31259 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31260
31261 // Split 256-bit integers on XOP/pre-AVX2 targets.
31262 // Split 512-bit integers on non 512-bit BWI targets.
31263 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31264 !Subtarget.hasAVX2())) ||
31265 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31266 EltSizeInBits < 32)) {
31267 // Pre-mask the amount modulo using the wider vector.
31268 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31269 return splitVectorOp(Op, DAG, DL);
31270 }
31271
31272 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31273 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31274 int ScalarAmtIdx = -1;
31275 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31276 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31277 if (EltSizeInBits == 16)
31278 return SDValue();
31279
31280 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31281 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31282 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31283 ScalarAmtIdx, Subtarget, DAG);
31284 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31285 ScalarAmtIdx, Subtarget, DAG);
31286 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31287 }
31288 }
31289
31290 MVT WideSVT = MVT::getIntegerVT(
31291 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31292 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31293
31294 // If per-element shifts are legal, fallback to generic expansion.
31295 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31296 return SDValue();
31297
31298 // Attempt to fold as:
31299 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31300 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31301 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31302 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31303 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31304 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31305 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31306 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31307 EltSizeInBits, DAG);
31308 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31309 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31310 if (!IsFSHR)
31311 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31312 EltSizeInBits, DAG);
31313 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31314 }
31315
31316 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31317 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31318 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31319 SDValue Z = DAG.getConstant(0, DL, VT);
31320 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31321 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31322 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31323 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31324 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31325 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31326 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31327 }
31328
31329 // Fallback to generic expansion.
31330 return SDValue();
31331 }
31332 assert(
31333 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31334 "Unexpected funnel shift type!");
31335
31336 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31337 bool OptForSize = DAG.shouldOptForSize();
31338 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31339
31340 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31341 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31342 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31343 !isa<ConstantSDNode>(Amt)) {
31344 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31345 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31346 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31347 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31348 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31349 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31350 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31351 if (IsFSHR) {
31352 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31353 } else {
31354 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31355 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31356 }
31357 return DAG.getZExtOrTrunc(Res, DL, VT);
31358 }
31359
31360 if (VT == MVT::i8 || ExpandFunnel)
31361 return SDValue();
31362
31363 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31364 if (VT == MVT::i16) {
31365 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31366 DAG.getConstant(15, DL, Amt.getValueType()));
31367 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31368 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31369 }
31370
31371 return Op;
31372}
31373
31374static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31375 SelectionDAG &DAG) {
31376 MVT VT = Op.getSimpleValueType();
31377 assert(VT.isVector() && "Custom lowering only for vector rotates!");
31378
31379 SDLoc DL(Op);
31380 SDValue R = Op.getOperand(0);
31381 SDValue Amt = Op.getOperand(1);
31382 unsigned Opcode = Op.getOpcode();
31383 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31384 int NumElts = VT.getVectorNumElements();
31385 bool IsROTL = Opcode == ISD::ROTL;
31386
31387 // Check for constant splat rotation amount.
31388 APInt CstSplatValue;
31389 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31390
31391 // Check for splat rotate by zero.
31392 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31393 return R;
31394
31395 // AVX512 implicitly uses modulo rotation amounts.
31396 if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
31397 // Attempt to rotate by immediate.
31398 if (IsCstSplat) {
31399 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31400 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31401 return DAG.getNode(RotOpc, DL, VT, R,
31402 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31403 }
31404
31405 // Else, fall-back on VPROLV/VPRORV.
31406 return Op;
31407 }
31408
31409 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31410 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31411 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31412 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31413 }
31414
31415 SDValue Z = DAG.getConstant(0, DL, VT);
31416
31417 if (!IsROTL) {
31418 // If the ISD::ROTR amount is constant, we're always better converting to
31419 // ISD::ROTL.
31420 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31421 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31422
31424 // XOP targets always prefer ISD::ROTL.
31424 if (Subtarget.hasXOP())
31425 return DAG.getNode(ISD::ROTL, DL, VT, R,
31426 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31427 }
31428
31429 // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
31430 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
31431 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
31432 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31433 SDValue Mask = getGFNICtrlMask(Opcode, DAG, DL, VT, RotAmt);
31434 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
31435 DAG.getTargetConstant(0, DL, MVT::i8));
31436 }
31437
31438 // Split 256-bit integers on XOP/pre-AVX2 targets.
31439 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31440 return splitVectorIntBinary(Op, DAG, DL);
31441
31442 // XOP has 128-bit vector variable + immediate rotates.
31443 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31444 // XOP implicitly uses modulo rotation amounts.
31445 if (Subtarget.hasXOP()) {
31446 assert(IsROTL && "Only ROTL expected");
31447 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31448
31449 // Attempt to rotate by immediate.
31450 if (IsCstSplat) {
31451 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31452 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31453 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31454 }
31455
31456 // Use general rotate by variable (per-element).
31457 return Op;
31458 }
31459
31460 // Rotate by a uniform constant - expand back to shifts.
31461 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
31462 // to other values when folded to shift amounts, losing the splat.
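// e.g. for vXi32, rotl(x,7) expands to (x << 7) | (x >> 25) and
// rotr(x,7) to (x >> 7) | (x << 25).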
31463 if (IsCstSplat) {
31464 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31465 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
31466 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
31467 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
31468 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
31469 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
31470 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
31471 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
31472 }
31473
31474 // Split 512-bit integers on non 512-bit BWI targets.
31475 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31476 return splitVectorIntBinary(Op, DAG, DL);
31477
31478 assert(
31479 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31480 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31481 Subtarget.hasAVX2()) ||
31482 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31483 "Only vXi32/vXi16/vXi8 vector rotates supported");
31484
31485 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31486 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31487
31488 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31489 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31490
31491 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31492 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31493 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
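// Illustrative: for a byte x, unpack(x,x) forms the i16 value (x << 8) | x;
// shifting that left by y and keeping the high byte gives rotl(x,y), while
// shifting it right by y and keeping the low byte gives rotr(x,y).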
31494 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31495 int BaseRotAmtIdx = -1;
31496 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31497 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31498 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31499 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31500 }
31501 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31502 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31503 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31504 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31505 BaseRotAmtIdx, Subtarget, DAG);
31506 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31507 BaseRotAmtIdx, Subtarget, DAG);
31508 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31509 }
31510 }
31511
31512 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31513 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31514
31515 // Attempt to fold as unpack(x,x) << zext(y):
31516 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31517 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31518 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
31519 if (!(ConstantAmt && EltSizeInBits != 8) &&
31520 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
31521 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
31522 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31523 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31524 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31525 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31526 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31527 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31528 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31529 }
31530
31531 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31532 // the amount bit.
31533 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
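// Illustrative example: for amount 5 (0b101), the rot-by-4 and rot-by-1
// stages are selected (amount bits 2 and 0) and rot-by-2 is skipped, giving a
// total rotate of 5. 'Amt << 5' places bit 2 of the amount in each byte's
// sign bit, which is what the blend/compare below tests; 'Amt += Amt' then
// exposes the next lower amount bit for the following stage.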
31534 if (EltSizeInBits == 8) {
31535 MVT WideVT =
31536 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31537
31538 // Attempt to fold as:
31539 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31540 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31541 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31542 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31543 // If we're rotating by constant, just use default promotion.
31544 if (ConstantAmt)
31545 return SDValue();
31546 // See if we can perform this by widening to vXi16 or vXi32.
31547 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31548 R = DAG.getNode(
31549 ISD::OR, DL, WideVT, R,
31550 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31551 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31552 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31553 if (IsROTL)
31554 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31555 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31556 }
31557
31558 // We don't need ModuloAmt here as we just peek at individual bits.
31559 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31560 if (Subtarget.hasSSE41()) {
31561 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31562 // on the sign bit.
31563 V0 = DAG.getBitcast(VT, V0);
31564 V1 = DAG.getBitcast(VT, V1);
31565 Sel = DAG.getBitcast(VT, Sel);
31566 return DAG.getBitcast(SelVT,
31567 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31568 }
31569 // On pre-SSE41 targets we test for the sign bit by comparing to
31570 // zero - a negative value will set all bits of the lanes to true
31571 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31572 SDValue Z = DAG.getConstant(0, DL, SelVT);
31573 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31574 return DAG.getSelect(DL, SelVT, C, V0, V1);
31575 };
31576
31577 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31578 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31579 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31580 IsROTL = true;
31581 }
31582
31583 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31584 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31585
31586 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31587 // We can safely do this using i16 shifts as we're only interested in
31588 // the 3 lower bits of each byte.
31589 Amt = DAG.getBitcast(ExtVT, Amt);
31590 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31591 Amt = DAG.getBitcast(VT, Amt);
31592
31593 // r = VSELECT(r, rot(r, 4), a);
31594 SDValue M;
31595 M = DAG.getNode(
31596 ISD::OR, DL, VT,
31597 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31598 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31599 R = SignBitSelect(VT, Amt, M, R);
31600
31601 // a += a
31602 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31603
31604 // r = VSELECT(r, rot(r, 2), a);
31605 M = DAG.getNode(
31606 ISD::OR, DL, VT,
31607 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31608 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31609 R = SignBitSelect(VT, Amt, M, R);
31610
31611 // a += a
31612 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31613
31614 // return VSELECT(r, rot(r, 1), a);
31615 M = DAG.getNode(
31616 ISD::OR, DL, VT,
31617 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31618 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31619 return SignBitSelect(VT, Amt, M, R);
31620 }
31621
31622 bool IsSplatAmt = DAG.isSplatValue(Amt);
31623 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31624 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31625
31626 // Fall back to shifts for splat amounts or when all variable shifts are supported.
31627 // Also fall back for non-constant amounts on AVX2 vXi16.
31628 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31629 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31630 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31631 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31632 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31633 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31634 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31635 }
31636
31637 // Everything below assumes ISD::ROTL.
31638 if (!IsROTL) {
31639 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31640 IsROTL = true;
31641 }
31642
31643 // ISD::ROT* uses modulo rotate amounts.
31644 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31645
31646 assert(IsROTL && "Only ROTL supported");
31647
31648 // As with shifts, attempt to convert the rotation amount to a multiplication
31649 // factor, falling back to general expansion otherwise.
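// Illustrative example (vXi16): for a rotate amount of 3 the scale is
// 1 << 3 = 8, so MUL yields x << 3 (the low bits) and MULHU yields
// x >> 13 (the wrapped-around bits); OR'ing them gives rotl(x,3).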
31650 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31651 if (!Scale)
31652 return SDValue();
31653
31654 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31655 if (EltSizeInBits == 16) {
31656 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31657 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31658 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31659 }
31660
31661 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31662 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31663 // that can then be OR'd with the lower 32-bits.
31664 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31665 static const int OddMask[] = {1, 1, 3, 3};
31666 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31667 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31668
31669 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31670 DAG.getBitcast(MVT::v2i64, R),
31671 DAG.getBitcast(MVT::v2i64, Scale));
31672 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31673 DAG.getBitcast(MVT::v2i64, R13),
31674 DAG.getBitcast(MVT::v2i64, Scale13));
31675 Res02 = DAG.getBitcast(VT, Res02);
31676 Res13 = DAG.getBitcast(VT, Res13);
31677
31678 return DAG.getNode(ISD::OR, DL, VT,
31679 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31680 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31681}
31682
31683/// Returns true if the operand type is exactly twice the native width, and
31684/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31685/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31686/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31687bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31688 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31689
31690 if (OpWidth == 64)
31691 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31692 if (OpWidth == 128)
31693 return Subtarget.canUseCMPXCHG16B();
31694
31695 return false;
31696}
31697
31698TargetLowering::AtomicExpansionKind
31699X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31700 Type *MemType = SI->getValueOperand()->getType();
31701
31702 if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31703 !Subtarget.useSoftFloat()) {
31704 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31705 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31706 return AtomicExpansionKind::None;
31707
31708 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31709 Subtarget.hasAVX())
31710 return AtomicExpansionKind::None;
31711
31712
31713 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31714 : AtomicExpansionKind::None;
31715}
31716
31717// Note: this turns large loads into lock cmpxchg8b/16b.
31718TargetLowering::AtomicExpansionKind
31719X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31720 Type *MemType = LI->getType();
31721
31722 if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
31723 !Subtarget.useSoftFloat()) {
31724 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31725 // can use movq to do the load. If we have X87 we can load into an 80-bit
31726 // X87 register and store it to a stack temporary.
31727 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31728 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31729 return AtomicExpansionKind::None;
31730
31731 // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
31732 if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
31733 Subtarget.hasAVX())
31734 return AtomicExpansionKind::None;
31735
31736
31737 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31738 : AtomicExpansionKind::None;
31739}
31740
31741enum BitTestKind : unsigned {
31742 UndefBit,
31743 ConstantBit,
31744 NotConstantBit,
31745 ShiftBit,
31746 NotShiftBit
31747};
31748
31749static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31750 using namespace llvm::PatternMatch;
31751 BitTestKind BTK = UndefBit;
31752 if (auto *C = dyn_cast<ConstantInt>(V)) {
31753 // Check if V is a power of 2 or NOT power of 2.
31754 if (isPowerOf2_64(C->getZExtValue()))
31755 BTK = ConstantBit;
31756 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31757 BTK = NotConstantBit;
31758 return {V, BTK};
31759 }
31760
31761 // Check if V is some power of 2 pattern known to be non-zero
31762 if (auto *I = dyn_cast<Instruction>(V)) {
31763 bool Not = false;
31764 // Check if we have a NOT
31765 Value *PeekI;
31766 if (match(I, m_Not(m_Value(PeekI))) ||
31767 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31768 Not = true;
31769 I = dyn_cast<Instruction>(PeekI);
31770
31771 // If I is constant, it will fold and we can evaluate later. If it's an
31772 // argument or something of that nature, we can't analyze.
31773 if (I == nullptr)
31774 return {nullptr, UndefBit};
31775 }
31776 // We can only use 1 << X without more sophisticated analysis. C << X where
31777 // C is a power of 2 but not 1 can result in zero which cannot be translated
31778 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31779 if (I->getOpcode() == Instruction::Shl) {
31780 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31781 // -X` and some other provable power of 2 patterns that we can use CTZ on
31782 // may be profitable.
31783 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31784 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31785 // be provably a non-zero power of 2.
31786 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31787 // transformable to bittest.
31788 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31789 if (!ShiftVal)
31790 return {nullptr, UndefBit};
31791 if (ShiftVal->equalsInt(1))
31792 BTK = Not ? NotShiftBit : ShiftBit;
31793
31794 if (BTK == UndefBit)
31795 return {nullptr, UndefBit};
31796
31797 Value *BitV = I->getOperand(1);
31798
31799 // Read past a shiftmask instruction to find count
31800 Value *AndOp;
31801 uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1;
31802 if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask))))
31803 BitV = AndOp;
31804
31805 return {BitV, BTK};
31806 }
31807 }
31808 return {nullptr, UndefBit};
31809}
31810
31811TargetLowering::AtomicExpansionKind
31812X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31813 using namespace llvm::PatternMatch;
31814 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31815 // prefix to a normal instruction for these operations.
31816 if (AI->use_empty())
31817 return AtomicExpansionKind::None;
31818
31819 if (AI->getOperation() == AtomicRMWInst::Xor) {
31820 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
31821 // preferable to both `cmpxchg` and `btc`.
31822 if (match(AI->getOperand(1), m_SignMask()))
31823 return AtomicExpansionKind::None;
31824 }
31825
31826 // If the atomicrmw's result is used by a single bit AND, we may use
31827 // bts/btr/btc instruction for these operations.
31828 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31829 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31830 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31831 // detect it.
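// Illustrative IR that takes this path (with %bit == shl i32 1, %n):
//   %old = atomicrmw or ptr %p, i32 %bit seq_cst
//   %t   = and i32 %old, %bit
// which is emitted as a single `lock bts` via the x86_atomic_bts_rm intrinsic
// in emitBitTestAtomicRMWIntrinsic below.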
31832 Instruction *I = AI->user_back();
31833 auto BitChange = FindSingleBitChange(AI->getValOperand());
31834 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31835 I->getOpcode() != Instruction::And ||
31836 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31837 AI->getParent() != I->getParent())
31838 return AtomicExpansionKind::CmpXChg;
31839
31840 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31841
31842 // This is a redundant AND, it should get cleaned up elsewhere.
31843 if (AI == I->getOperand(OtherIdx))
31844 return AtomicExpansionKind::CmpXChg;
31845
31846 // The following instruction must be an AND with a single bit.
31847 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31848 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31849 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31850 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31851 return AtomicExpansionKind::CmpXChg;
31852 }
31853 if (AI->getOperation() == AtomicRMWInst::And) {
31854 return ~C1->getValue() == C2->getValue()
31855 ? AtomicExpansionKind::BitTestIntrinsic
31856 : AtomicExpansionKind::CmpXChg;
31857 }
31858 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31859 : AtomicExpansionKind::CmpXChg;
31860 }
31861
31862 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31863
31864 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31865 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31866 return AtomicExpansionKind::CmpXChg;
31867
31868 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31869
31870 // If shift amounts are not the same we can't use BitTestIntrinsic.
31871 if (BitChange.first != BitTested.first)
31872 return AtomicExpansionKind::CmpXChg;
31873
31874 // For an atomic AND, the value must clear exactly one bit and the following
31875 // AND must test the bit that is unset in that mask.
31876 if (AI->getOperation() == AtomicRMWInst::And)
31877 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31878 ? AtomicExpansionKind::BitTestIntrinsic
31879 : AtomicExpansionKind::CmpXChg;
31880
31881 // For an atomic XOR/OR, the value must set a single bit and the following AND must test that same bit.
31882 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31883 ? AtomicExpansionKind::BitTestIntrinsic
31884 : AtomicExpansionKind::CmpXChg;
31885}
31886
31887void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31888 IRBuilder<> Builder(AI);
31889 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
31890 Intrinsic::ID IID_C;
31891 Intrinsic::ID IID_I;
31892 switch (AI->getOperation()) {
31893 default:
31894 llvm_unreachable("Unknown atomic operation");
31895 case AtomicRMWInst::Or:
31896 IID_C = Intrinsic::x86_atomic_bts;
31897 IID_I = Intrinsic::x86_atomic_bts_rm;
31898 break;
31899 case AtomicRMWInst::Xor:
31900 IID_C = Intrinsic::x86_atomic_btc;
31901 IID_I = Intrinsic::x86_atomic_btc_rm;
31902 break;
31903 case AtomicRMWInst::And:
31904 IID_C = Intrinsic::x86_atomic_btr;
31905 IID_I = Intrinsic::x86_atomic_btr_rm;
31906 break;
31907 }
31908 Instruction *I = AI->user_back();
31909 LLVMContext &Ctx = AI->getContext();
31910 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31911 PointerType::getUnqual(Ctx));
31912 Value *Result = nullptr;
31913 auto BitTested = FindSingleBitChange(AI->getValOperand());
31914 assert(BitTested.first != nullptr);
31915
31916 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31917 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31918
31919 unsigned Imm = llvm::countr_zero(C->getZExtValue());
31920 Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
31921 {Addr, Builder.getInt8(Imm)});
31922 } else {
31923 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31924
31925 Value *SI = BitTested.first;
31926 assert(SI != nullptr);
31927
31928 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
31929 // to mask it.
31930 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31931 Value *BitPos =
31932 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31933 // Todo(1): In many cases it may be provable that SI is less than
31934 // ShiftBits, in which case this mask is unnecessary.
31935 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31936 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31937 // favor of just a raw BT{S|R|C}.
31938
31939 Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
31940 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31941
31942 // If the result is only used for zero/non-zero status then we don't need to
31943 // shift the value back. Otherwise do so.
31944 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31945 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31946 if (ICmp->isEquality()) {
31947 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31948 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31949 if (C0 || C1) {
31950 assert(C0 == nullptr || C1 == nullptr);
31951 if ((C0 ? C0 : C1)->isZero())
31952 continue;
31953 }
31954 }
31955 }
31956 Result = Builder.CreateShl(Result, BitPos);
31957 break;
31958 }
31959 }
31960
31961 I->replaceAllUsesWith(Result);
31962 I->eraseFromParent();
31963 AI->eraseFromParent();
31964}
31965
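// Illustrative example of the pattern recognized below: an atomicrmw whose
// only use is a comparison that can be answered directly from EFLAGS, e.g.
//   %old = atomicrmw sub ptr %p, i32 %v seq_cst
//   %cmp = icmp eq i32 %old, %v
// can be lowered to `lock sub` + SETE (via Intrinsic::x86_atomic_sub_cc)
// without materializing %old.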
31966static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31967 using namespace llvm::PatternMatch;
31968 if (!AI->hasOneUse())
31969 return false;
31970
31971 Value *Op = AI->getOperand(1);
31972 CmpPredicate Pred;
31973 Instruction *I = AI->user_back();
31974 AtomicRMWInst::BinOp Opc = AI->getOperation();
31975 if (Opc == AtomicRMWInst::Add) {
31976 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31977 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31978 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31979 if (match(I->user_back(),
31980 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
31981 return true;
31982 if (match(I->user_back(),
31983 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
31984 return true;
31985 }
31986 return false;
31987 }
31988 if (Opc == AtomicRMWInst::Sub) {
31989 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31990 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31991 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
31992 if (match(I->user_back(),
31993 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
31994 return true;
31995 if (match(I->user_back(),
31996 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
31997 return true;
31998 }
31999 return false;
32000 }
32001 if ((Opc == AtomicRMWInst::Or &&
32002 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32003 (Opc == AtomicRMWInst::And &&
32004 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32005 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32006 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32007 Pred == CmpInst::ICMP_SLT;
32008 if (match(I->user_back(),
32009 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32010 return true;
32011 return false;
32012 }
32013 if (Opc == AtomicRMWInst::Xor) {
32014 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32015 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32016 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32017 if (match(I->user_back(),
32018 m_SpecificICmp(CmpInst::ICMP_SLT, m_Value(), m_ZeroInt())))
32019 return true;
32020 if (match(I->user_back(),
32021 m_SpecificICmp(CmpInst::ICMP_SGT, m_Value(), m_AllOnes())))
32022 return true;
32023 }
32024 return false;
32025 }
32026
32027 return false;
32028}
32029
32030void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32031 AtomicRMWInst *AI) const {
32032 IRBuilder<> Builder(AI);
32033 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32034 Instruction *TempI = nullptr;
32035 LLVMContext &Ctx = AI->getContext();
32036 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32037 if (!ICI) {
32038 TempI = AI->user_back();
32039 assert(TempI->hasOneUse() && "Must have one use");
32040 ICI = cast<ICmpInst>(TempI->user_back());
32041 }
32042 X86::CondCode CC;
32043 ICmpInst::Predicate Pred = ICI->getPredicate();
32044 switch (Pred) {
32045 default:
32046 llvm_unreachable("Not supported Pred");
32047 case CmpInst::ICMP_EQ:
32048 CC = X86::COND_E;
32049 break;
32050 case CmpInst::ICMP_NE:
32051 CC = X86::COND_NE;
32052 break;
32053 case CmpInst::ICMP_SLT:
32054 CC = X86::COND_S;
32055 break;
32056 case CmpInst::ICMP_SGT:
32057 CC = X86::COND_NS;
32058 break;
32059 }
32060 Intrinsic::ID IID;
32061 switch (AI->getOperation()) {
32062 default:
32063 llvm_unreachable("Unknown atomic operation");
32064 case AtomicRMWInst::Add:
32065 IID = Intrinsic::x86_atomic_add_cc;
32066 break;
32067 case AtomicRMWInst::Sub:
32068 IID = Intrinsic::x86_atomic_sub_cc;
32069 break;
32070 case AtomicRMWInst::Or:
32071 IID = Intrinsic::x86_atomic_or_cc;
32072 break;
32073 case AtomicRMWInst::And:
32074 IID = Intrinsic::x86_atomic_and_cc;
32075 break;
32076 case AtomicRMWInst::Xor:
32077 IID = Intrinsic::x86_atomic_xor_cc;
32078 break;
32079 }
32080 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32081 PointerType::getUnqual(Ctx));
32082 Value *Call = Builder.CreateIntrinsic(
32083 IID, AI->getType(),
32084 {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32085 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32086 ICI->replaceAllUsesWith(Result);
32087 ICI->eraseFromParent();
32088 if (TempI)
32089 TempI->eraseFromParent();
32090 AI->eraseFromParent();
32091}
32092
32093TargetLowering::AtomicExpansionKind
32094X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32095 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32096 Type *MemType = AI->getType();
32097
32098 // If the operand is too big, we must see if cmpxchg8/16b is available
32099 // and default to library calls otherwise.
32100 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32101 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32102 : AtomicExpansionKind::None;
32103 }
32104
32105 AtomicRMWInst::BinOp Op = AI->getOperation();
32106 switch (Op) {
32107 case AtomicRMWInst::Xchg:
32108 return AtomicExpansionKind::None;
32109 case AtomicRMWInst::Add:
32110 case AtomicRMWInst::Sub:
32111 if (shouldExpandCmpArithRMWInIR(AI))
32112 return AtomicExpansionKind::CmpArithIntrinsic;
32113 // It's better to use xadd, xsub or xchg for these in other cases.
32114 return AtomicExpansionKind::None;
32115 case AtomicRMWInst::Or:
32116 case AtomicRMWInst::And:
32117 case AtomicRMWInst::Xor:
32118 if (shouldExpandCmpArithRMWInIR(AI))
32119 return AtomicExpansionKind::CmpArithIntrinsic;
32120 return shouldExpandLogicAtomicRMWInIR(AI);
32121 case AtomicRMWInst::Nand:
32122 case AtomicRMWInst::Max:
32123 case AtomicRMWInst::Min:
32134 default:
32135 // These always require a non-trivial set of data operations on x86. We must
32136 // use a cmpxchg loop.
32137 return AtomicExpansionKind::CmpXChg;
32138 }
32139}
32140
32141LoadInst *
32142X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32143 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32144 Type *MemType = AI->getType();
32145 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32146 // there is no benefit in turning such RMWs into loads, and it is actually
32147 // harmful as it introduces an mfence.
32148 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32149 return nullptr;
32150
32151 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32152 // lowering available in lowerAtomicArith.
32153 // TODO: push more cases through this path.
32154 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32155 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32156 AI->use_empty())
32157 return nullptr;
32158
32159 IRBuilder<> Builder(AI);
32160 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32161 auto SSID = AI->getSyncScopeID();
32162 // We must restrict the ordering to avoid generating loads with Release or
32163 // ReleaseAcquire orderings.
32164 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32165
32166 // Before the load we need a fence. Here is an example lifted from
32167 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32168 // is required:
32169 // Thread 0:
32170 // x.store(1, relaxed);
32171 // r1 = y.fetch_add(0, release);
32172 // Thread 1:
32173 // y.fetch_add(42, acquire);
32174 // r2 = x.load(relaxed);
32175 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32176 // lowered to just a load without a fence. An mfence flushes the store buffer,
32177 // making the optimization clearly correct.
32178 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32179 // otherwise; we might be able to be more aggressive on relaxed idempotent
32180 // rmw. In practice, they do not look useful, so we don't try to be
32181 // especially clever.
32182
32183 // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
32184 // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
32185 Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
32186
32187 // Finally we can emit the atomic load.
32188 LoadInst *Loaded = Builder.CreateAlignedLoad(
32189 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32190 Loaded->setAtomic(Order, SSID);
32191 AI->replaceAllUsesWith(Loaded);
32192 AI->eraseFromParent();
32193 return Loaded;
32194}
32195
32196/// Emit a locked operation on a stack location which does not change any
32197/// memory location, but does involve a lock prefix. Location is chosen to be
32198/// a) very likely accessed only by a single thread to minimize cache traffic,
32199/// and b) definitely dereferenceable. Returns the new Chain result.
32200static SDValue emitLockedStackOp(SelectionDAG &DAG,
32201 const X86Subtarget &Subtarget, SDValue Chain,
32202 const SDLoc &DL) {
32203 // Implementation notes:
32204 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32205 // operations issued by the current processor. As such, the location
32206 // referenced is not relevant for the ordering properties of the instruction.
32207 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32208 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32209 // 2) Using an immediate operand appears to be the best encoding choice
32210 // here since it doesn't require an extra register.
32211 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32212 // is small enough it might just be measurement noise.)
32213 // 4) When choosing offsets, there are several contributing factors:
32214 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32215 // line aligned stack object to improve this case.)
32216 // b) To minimize our chances of introducing a false dependence, we prefer
32217 // to offset the stack usage from TOS slightly.
32218 // c) To minimize concerns about cross thread stack usage - in particular,
32219 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32220 // captures state in the TOS frame and accesses it from many threads -
32221 // we want to use an offset such that the offset is in a distinct cache
32222 // line from the TOS frame.
32223 //
32224 // For a general discussion of the tradeoffs and benchmark results, see:
32225 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32226
32227 auto &MF = DAG.getMachineFunction();
32228 auto &TFL = *Subtarget.getFrameLowering();
32229 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32230
32231 if (Subtarget.is64Bit()) {
32232 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32233 SDValue Ops[] = {
32234 DAG.getRegister(X86::RSP, MVT::i64), // Base
32235 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32236 DAG.getRegister(0, MVT::i64), // Index
32237 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32238 DAG.getRegister(0, MVT::i16), // Segment.
32239 Zero,
32240 Chain};
32241 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32242 MVT::Other, Ops);
32243 return SDValue(Res, 1);
32244 }
32245
32246 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32247 SDValue Ops[] = {
32248 DAG.getRegister(X86::ESP, MVT::i32), // Base
32249 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32250 DAG.getRegister(0, MVT::i32), // Index
32251 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32252 DAG.getRegister(0, MVT::i16), // Segment.
32253 Zero,
32254 Chain
32255 };
32256 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32257 MVT::Other, Ops);
32258 return SDValue(Res, 1);
32259}
32260
32261static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32262 SelectionDAG &DAG) {
32263 SDLoc dl(Op);
32264 AtomicOrdering FenceOrdering =
32265 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32266 SyncScope::ID FenceSSID =
32267 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32268
32269 // The only fence that needs an instruction is a sequentially-consistent
32270 // cross-thread fence.
32271 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32272 FenceSSID == SyncScope::System) {
32273 if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
32274 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32275
32276 SDValue Chain = Op.getOperand(0);
32277 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32278 }
32279
32280 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32281 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32282}
32283
32284static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32285 SelectionDAG &DAG) {
32286 MVT T = Op.getSimpleValueType();
32287 SDLoc DL(Op);
32288 unsigned Reg = 0;
32289 unsigned size = 0;
32290 switch(T.SimpleTy) {
32291 default: llvm_unreachable("Invalid value type!");
32292 case MVT::i8: Reg = X86::AL; size = 1; break;
32293 case MVT::i16: Reg = X86::AX; size = 2; break;
32294 case MVT::i32: Reg = X86::EAX; size = 4; break;
32295 case MVT::i64:
32296 assert(Subtarget.is64Bit() && "Node not type legal!");
32297 Reg = X86::RAX; size = 8;
32298 break;
32299 }
32300 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32301 Op.getOperand(2), SDValue());
32302 SDValue Ops[] = { cpIn.getValue(0),
32303 Op.getOperand(1),
32304 Op.getOperand(3),
32305 DAG.getTargetConstant(size, DL, MVT::i8),
32306 cpIn.getValue(1) };
32307 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32308 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32309 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32310 Ops, T, MMO);
32311
32312 SDValue cpOut =
32313 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32314 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32315 MVT::i32, cpOut.getValue(2));
32316 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32317
32318 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32319 cpOut, Success, EFLAGS.getValue(1));
32320}
32321
32322// Create MOVMSKB, taking into account whether we need to split for AVX1.
32323static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32324 const X86Subtarget &Subtarget) {
32325 MVT InVT = V.getSimpleValueType();
32326
32327 if (InVT == MVT::v64i8) {
32328 SDValue Lo, Hi;
32329 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32330 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32331 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32332 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32333 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32334 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32335 DAG.getConstant(32, DL, MVT::i8));
32336 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32337 }
32338 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32339 SDValue Lo, Hi;
32340 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32341 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32342 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32343 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32344 DAG.getConstant(16, DL, MVT::i8));
32345 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32346 }
32347
32348 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32349}
32350
32351static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32352 SelectionDAG &DAG) {
32353 SDValue Src = Op.getOperand(0);
32354 MVT SrcVT = Src.getSimpleValueType();
32355 MVT DstVT = Op.getSimpleValueType();
32356
32357 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32358 // half to v32i1 and concatenating the result.
32359 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32360 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32361 assert(Subtarget.hasBWI() && "Expected BWI target");
32362 SDLoc dl(Op);
32363 SDValue Lo, Hi;
32364 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32365 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32366 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32367 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32368 }
32369
32370 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32371 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32372 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32373 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32374 SDLoc DL(Op);
32375 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32376 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32377 return DAG.getZExtOrTrunc(V, DL, DstVT);
32378 }
32379
32380 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32381 SrcVT == MVT::i64) && "Unexpected VT!");
32382
32383 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32384 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32385 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32386 // This conversion needs to be expanded.
32387 return SDValue();
32388
32389 SDLoc dl(Op);
32390 if (SrcVT.isVector()) {
32391 // Widen the input vector in the case of MVT::v2i32.
32392 // Example: from MVT::v2i32 to MVT::v4i32.
32393 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32394 SrcVT.getVectorNumElements() * 2);
32395 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32396 DAG.getUNDEF(SrcVT));
32397 } else {
32398 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32399 "Unexpected source type in LowerBITCAST");
32400 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32401 }
32402
32403 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32404 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32405
32406 if (DstVT == MVT::x86mmx)
32407 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32408
32409 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32410 DAG.getVectorIdxConstant(0, dl));
32411}
32412
32413/// Compute the horizontal sum of bytes in V for the elements of VT.
32414///
32415/// Requires V to be a byte vector and VT to be an integer vector type with
32416/// wider elements than V's type. The width of the elements of VT determines
32417/// how many bytes of V are summed horizontally to produce each element of the
32418/// result.
32419static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32420 const X86Subtarget &Subtarget,
32421 SelectionDAG &DAG) {
32422 SDLoc DL(V);
32423 MVT ByteVecVT = V.getSimpleValueType();
32424 MVT EltVT = VT.getVectorElementType();
32425 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32426 "Expected value to have byte element type.");
32427 assert(EltVT != MVT::i8 &&
32428 "Horizontal byte sum only makes sense for wider elements!");
32429 unsigned VecSize = VT.getSizeInBits();
32430 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32431
32432 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32433 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32434 if (EltVT == MVT::i64) {
32435 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32436 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32437 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32438 return DAG.getBitcast(VT, V);
32439 }
32440
32441 if (EltVT == MVT::i32) {
32442 // We unpack the low half and high half into i32s interleaved with zeros so
32443 // that we can use PSADBW to horizontally sum them. The most useful part of
32444 // this is that it lines up the results of two PSADBW instructions to be
32445 // two v2i64 vectors which concatenated are the 4 population counts. We can
32446 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32447 SDValue Zeros = DAG.getConstant(0, DL, VT);
32448 SDValue V32 = DAG.getBitcast(VT, V);
32449 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32450 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32451
32452 // Do the horizontal sums into two v2i64s.
32453 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32454 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32455 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32456 DAG.getBitcast(ByteVecVT, Low), Zeros);
32457 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32458 DAG.getBitcast(ByteVecVT, High), Zeros);
32459
32460 // Merge them together.
32461 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32462 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32463 DAG.getBitcast(ShortVecVT, Low),
32464 DAG.getBitcast(ShortVecVT, High));
32465
32466 return DAG.getBitcast(VT, V);
32467 }
32468
32469 // The only element type left is i16.
32470 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32471
32472 // To obtain pop count for each i16 element starting from the pop count for
32473 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32474 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32475 // directly supported.
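// Illustrative example: if an i16 element holds the two byte counts <hi,lo>,
// then (<hi,lo> << 8) = <lo,0>; adding as i8s gives <hi+lo,lo>; shifting right
// by 8 as i16 leaves hi+lo, the pop count of the whole i16 element.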
32476 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32477 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32478 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32479 DAG.getBitcast(ByteVecVT, V));
32480 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32481}
32482
32483static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32484 const X86Subtarget &Subtarget,
32485 SelectionDAG &DAG) {
32486 MVT VT = Op.getSimpleValueType();
32487 MVT EltVT = VT.getVectorElementType();
32488 int NumElts = VT.getVectorNumElements();
32489 (void)EltVT;
32490 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32491
32492 // Implement a lookup table in register by using an algorithm based on:
32493 // http://wm.ite.pl/articles/sse-popcount.html
32494 //
32495 // The general idea is that each byte nibble in the input vector is an
32496 // index into an in-register pre-computed pop count table. We then split up the
32497 // input vector into two new ones: (1) a vector with only the shifted-right
32498 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32499 // masked out higher ones) for each byte. PSHUFB is used separately with both
32500 // to index the in-register table. Next, both are added and the result is an
32501 // i8 vector where each element contains the pop count for its input byte.
32502 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32503 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32504 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32505 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32506
32507 SmallVector<SDValue, 64> LUTVec;
32508 for (int i = 0; i < NumElts; ++i)
32509 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32510 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32511 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32512
32513 // High nibbles
32514 SDValue FourV = DAG.getConstant(4, DL, VT);
32515 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32516
32517 // Low nibbles
32518 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32519
32520 // The input vector is used as the shuffle mask that indexes elements into the
32521 // LUT. After counting low and high nibbles, add the two results to obtain the
32522 // final pop count per i8 element.
32523 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32524 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32525 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32526}
32527
32528// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32529// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32530static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
32531 const X86Subtarget &Subtarget,
32532 SelectionDAG &DAG) {
32533 MVT VT = Op.getSimpleValueType();
32534 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32535 "Unknown CTPOP type to handle");
32536 SDValue Op0 = Op.getOperand(0);
32537
32538 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32539 if (Subtarget.hasVPOPCNTDQ()) {
32540 unsigned NumElems = VT.getVectorNumElements();
32541 assert((VT.getVectorElementType() == MVT::i8 ||
32542 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32543 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32544 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32545 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32546 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32547 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32548 }
32549 }
32550
32551 // Decompose 256-bit ops into smaller 128-bit ops.
32552 if (VT.is256BitVector() && !Subtarget.hasInt256())
32553 return splitVectorIntUnary(Op, DAG, DL);
32554
32555 // Decompose 512-bit ops into smaller 256-bit ops.
32556 if (VT.is512BitVector() && !Subtarget.hasBWI())
32557 return splitVectorIntUnary(Op, DAG, DL);
32558
32559 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32560 if (VT.getScalarType() != MVT::i8) {
32561 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32562 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32563 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32564 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32565 }
32566
32567 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32568 if (!Subtarget.hasSSSE3())
32569 return SDValue();
32570
32571 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32572}
32573
32574static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
32575 SelectionDAG &DAG) {
32576 MVT VT = N.getSimpleValueType();
32577 SDValue Op = N.getOperand(0);
32578 SDLoc DL(N);
32579
32580 if (VT.isScalarInteger()) {
32581 // Compute the lower/upper bounds of the active bits of the value,
32582 // allowing us to shift the active bits down if necessary to fit into the
32583 // special cases below.
32584 KnownBits Known = DAG.computeKnownBits(Op);
32585 if (Known.isConstant())
32586 return DAG.getConstant(Known.getConstant().popcount(), DL, VT);
32587 unsigned LZ = Known.countMinLeadingZeros();
32588 unsigned TZ = Known.countMinTrailingZeros();
32589 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
32590 unsigned ActiveBits = Known.getBitWidth() - LZ;
32591 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
32592
32593 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
32594 if (ShiftedActiveBits <= 2) {
32595 if (ActiveBits > 2)
32596 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32597 DAG.getShiftAmountConstant(TZ, VT, DL));
32598 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32599 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
32600 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32601 DAG.getShiftAmountConstant(1, VT, DL)));
32602 return DAG.getZExtOrTrunc(Op, DL, VT);
32603 }
32604
32605 // i3 CTPOP - perform LUT into i32 integer.
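// The 16-bit constant below packs popcount(0..7) as 2-bit fields:
//   popcounts 0,1,1,2,1,2,2,3 (entry 7 in the top bits) give
//   0b11'10'10'01'10'01'01'00 = 0b1110100110010100.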
32606 if (ShiftedActiveBits <= 3) {
32607 if (ActiveBits > 3)
32608 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32609 DAG.getShiftAmountConstant(TZ, VT, DL));
32610 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32611 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
32612 DAG.getShiftAmountConstant(1, VT, DL));
32613 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
32614 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
32615 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
32616 DAG.getConstant(0x3, DL, MVT::i32));
32617 return DAG.getZExtOrTrunc(Op, DL, VT);
32618 }
32619
32620 // i4 CTPOP - perform LUT into i64 integer.
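// The i64 constant below stores popcount(k) in nibble k, for k = 0..15:
//   reading from entry 15 down to entry 0: 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0,
//   which is 0x4332322132212110.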
32621 if (ShiftedActiveBits <= 4 &&
32622 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
32623 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
32624 if (ActiveBits > 4)
32625 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32626 DAG.getShiftAmountConstant(TZ, VT, DL));
32627 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32628 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32629 DAG.getConstant(4, DL, MVT::i32));
32630 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
32631 DAG.getShiftAmountOperand(MVT::i64, Op));
32632 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
32633 DAG.getConstant(0x7, DL, MVT::i64));
32634 return DAG.getZExtOrTrunc(Op, DL, VT);
32635 }
32636
32637 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
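// Sketch of the trick: multiplying the zero-extended byte x by 0x08040201
// places copies of x at bit offsets 0, 9, 18 and 27 without overlap; after the
// >> 3 and the 0x11111111 mask, each nibble of the result holds exactly one
// bit of x. Multiplying by 0x11111111 then sums those nibbles into the top
// nibble, and the final >> 28 extracts the bit count.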
32638 if (ShiftedActiveBits <= 8) {
32639 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
32640 if (ActiveBits > 8)
32641 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
32642 DAG.getShiftAmountConstant(TZ, VT, DL));
32643 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
32644 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
32645 DAG.getConstant(0x08040201U, DL, MVT::i32));
32646 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32647 DAG.getShiftAmountConstant(3, MVT::i32, DL));
32648 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
32649 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
32650 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
32651 DAG.getShiftAmountConstant(28, MVT::i32, DL));
32652 return DAG.getZExtOrTrunc(Op, DL, VT);
32653 }
32654
32655 return SDValue(); // fallback to generic expansion.
32656 }
32657
32658 assert(VT.isVector() &&
32659 "We only do custom lowering for vector population count.");
32660 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
32661}
32662
32663static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32664 MVT VT = Op.getSimpleValueType();
32665 SDValue In = Op.getOperand(0);
32666 SDLoc DL(Op);
32667
32668 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32669 // perform the BITREVERSE.
32670 if (!VT.isVector()) {
32671 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32672 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32673 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32674 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32675 DAG.getVectorIdxConstant(0, DL));
32676 }
32677
32678 int NumElts = VT.getVectorNumElements();
32679 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32680
32681 // Decompose 256-bit ops into smaller 128-bit ops.
32682 if (VT.is256BitVector())
32683 return splitVectorIntUnary(Op, DAG, DL);
32684
32685 assert(VT.is128BitVector() &&
32686 "Only 128-bit vector bitreverse lowering supported.");
32687
32688 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32689 // perform the BSWAP in the shuffle.
32690 // It's best to shuffle using the second operand as this will implicitly allow
32691 // memory folding for multiple vectors.
32692 SmallVector<SDValue, 16> MaskElts;
32693 for (int i = 0; i != NumElts; ++i) {
32694 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32695 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32696 int PermuteByte = SourceByte | (2 << 5);
32697 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32698 }
32699 }
32700
32701 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32702 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32703 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32704 Res, Mask);
32705 return DAG.getBitcast(VT, Res);
32706}
32707
32708static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32709 SelectionDAG &DAG) {
32710 MVT VT = Op.getSimpleValueType();
32711
32712 if (Subtarget.hasXOP() && !VT.is512BitVector())
32713 return LowerBITREVERSE_XOP(Op, DAG);
32714
32715 assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) &&
32716 "SSSE3 or GFNI required for BITREVERSE");
32717
32718 SDValue In = Op.getOperand(0);
32719 SDLoc DL(Op);
32720
32721 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
32722 if (VT.is512BitVector() && !Subtarget.hasBWI())
32723 return splitVectorIntUnary(Op, DAG, DL);
32724
32725 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32726 if (VT.is256BitVector() && !Subtarget.hasInt256())
32727 return splitVectorIntUnary(Op, DAG, DL);
32728
32729 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
32730 if (!VT.isVector()) {
32731 assert(
32732 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
32733 "Only tested for i8/i16/i32/i64");
32734 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32735 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32736 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
32737 DAG.getBitcast(MVT::v16i8, Res));
32738 Res =
32739 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, DAG.getBitcast(VecVT, Res),
32740 DAG.getVectorIdxConstant(0, DL));
32741 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
32742 }
32743
32744 assert(VT.isVector() && VT.getSizeInBits() >= 128);
32745
32746 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
32747 if (VT.getScalarType() != MVT::i8) {
32748 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32749 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
32750 Res = DAG.getBitcast(ByteVT, Res);
32751 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
32752 return DAG.getBitcast(VT, Res);
32753 }
32754 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
32755 "Only byte vector BITREVERSE supported");
32756
32757 unsigned NumElts = VT.getVectorNumElements();
32758
32759 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32760 if (Subtarget.hasGFNI()) {
32761 SDValue Matrix = getGFNICtrlMask(ISD::BITREVERSE, DAG, DL, VT);
32762 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32763 DAG.getTargetConstant(0, DL, MVT::i8));
32764 }
32765
32766 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32767 // two nibbles and a PSHUFB lookup to find the bitreverse of each
32768 // 0-15 value (moved to the other nibble).
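// Illustrative example: for the byte 0x2C (0b00101100), LoLUT[0xC] = 0x30
// (low nibble reversed into the high nibble) and HiLUT[0x2] = 0x04 (high
// nibble reversed into the low nibble); OR'ing gives 0x34 = 0b00110100,
// the bit-reversal of 0x2C.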
32769 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32770 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32771 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32772
32773 const int LoLUT[16] = {
32774 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32775 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32776 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32777 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32778 const int HiLUT[16] = {
32779 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32780 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32781 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32782 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32783
32784 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32785 for (unsigned i = 0; i < NumElts; ++i) {
32786 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32787 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32788 }
32789
32790 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32791 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32792 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32793 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32794 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32795}
32796
32797static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32798 SelectionDAG &DAG) {
32799 SDLoc DL(Op);
32800 SDValue X = Op.getOperand(0);
32801 MVT VT = Op.getSimpleValueType();
32802
32803 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32804 if (VT == MVT::i8 ||
32805 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32806 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32807 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32808 DAG.getConstant(0, DL, MVT::i8));
32809 // Copy the inverse of the parity flag into a register with setcc.
32810 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32811 // Extend to the original type.
32812 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32813 }
32814
32815 // If we have POPCNT, use the default expansion.
32816 if (Subtarget.hasPOPCNT())
32817 return SDValue();
32818
32819 if (VT == MVT::i64) {
32820 // Xor the high and low 16-bits together using a 32-bit operation.
32821 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32822 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32823 DAG.getConstant(32, DL, MVT::i8)));
32824 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32825 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32826 }
32827
32828 if (VT != MVT::i16) {
32829 // Xor the high and low 16-bits together using a 32-bit operation.
32830 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32831 DAG.getConstant(16, DL, MVT::i8));
32832 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32833 } else {
32834 // If the input is 16-bits, we need to extend to use an i32 shift below.
32835 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32836 }
32837
32838 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32839 // This should allow an h-reg to be used to save a shift.
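// For example, with X = 0x12345678 (13 set bits): the 32->16 fold gives
// 0x1234 ^ 0x5678 = 0x444C, and the final byte xor 0x44 ^ 0x4C = 0x08 has odd
// parity, so PF is clear and SETNP produces 1, matching parity(X) = 1.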
32840 SDValue Hi = DAG.getNode(
32841 ISD::TRUNCATE, DL, MVT::i8,
32842 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32843 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32844 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32845 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32846
32847 // Copy the inverse of the parity flag into a register with setcc.
32848 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32849 // Extend to the original type.
32850 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32851}
32852
32853 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32854 const X86Subtarget &Subtarget) {
32855 unsigned NewOpc = 0;
32856 switch (N->getOpcode()) {
32857 case ISD::ATOMIC_LOAD_ADD:
32858 NewOpc = X86ISD::LADD;
32859 break;
32860 case ISD::ATOMIC_LOAD_SUB:
32861 NewOpc = X86ISD::LSUB;
32862 break;
32863 case ISD::ATOMIC_LOAD_OR:
32864 NewOpc = X86ISD::LOR;
32865 break;
32866 case ISD::ATOMIC_LOAD_XOR:
32867 NewOpc = X86ISD::LXOR;
32868 break;
32869 case ISD::ATOMIC_LOAD_AND:
32870 NewOpc = X86ISD::LAND;
32871 break;
32872 default:
32873 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32874 }
32875
32876 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32877
32878 return DAG.getMemIntrinsicNode(
32879 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32880 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32881 /*MemVT=*/N->getSimpleValueType(0), MMO);
32882}
32883
32884/// Lower atomic_load_ops into LOCK-prefixed operations.
32885 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32886 const X86Subtarget &Subtarget) {
32887 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32888 SDValue Chain = N->getOperand(0);
32889 SDValue LHS = N->getOperand(1);
32890 SDValue RHS = N->getOperand(2);
32891 unsigned Opc = N->getOpcode();
32892 MVT VT = N->getSimpleValueType(0);
32893 SDLoc DL(N);
32894
32895 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32896 // can only be lowered when the result is unused. They should have already
32897 // been transformed into a cmpxchg loop in AtomicExpand.
32898 if (N->hasAnyUseOfValue(0)) {
32899 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32900 // select LXADD if LOCK_SUB can't be selected.
32901 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
32902 // can use LXADD as opposed to cmpxchg.
32903 if (Opc == ISD::ATOMIC_LOAD_SUB ||
32904 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
32905 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32906 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
32907
32908 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32909 "Used AtomicRMW ops other than Add should have been expanded!");
32910 return N;
32911 }
32912
32913 // Specialized lowering for the canonical form of an idempotent atomicrmw.
32914 // The core idea here is that since the memory location isn't actually
32915 // changing, all we need is a lowering for the *ordering* impacts of the
32916 // atomicrmw. As such, we can choose a different operation and memory
32917 // location to minimize impact on other code.
32918 // The above holds unless the node is marked volatile in which
32919 // case it needs to be preserved according to the langref.
32920 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
32921 // On X86, the only ordering which actually requires an instruction is
32922 // seq_cst which isn't SingleThread, everything just needs to be preserved
32923 // during codegen and then dropped. Note that we expect (but don't assume)
32924 // that orderings other than seq_cst and acq_rel have been canonicalized to
32925 // a store or load.
32926 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32927 AN->getSyncScopeID() == SyncScope::System) {
32928 // Prefer a locked operation against a stack location to minimize cache
32929 // traffic. This assumes that stack locations are very likely to be
32930 // accessed only by the owning thread.
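// In practice this materializes as an idiom along the lines of
// "lock or dword ptr [rsp], 0", which provides the sequential-consistency
// ordering required here and is typically cheaper than an explicit MFENCE.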
32931 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32932 assert(!N->hasAnyUseOfValue(0));
32933 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32934 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32935 DAG.getUNDEF(VT), NewChain);
32936 }
32937 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32938 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32939 assert(!N->hasAnyUseOfValue(0));
32940 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32941 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32942 DAG.getUNDEF(VT), NewChain);
32943 }
32944
32945 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32946 // RAUW the chain, but don't worry about the result, as it's unused.
32947 assert(!N->hasAnyUseOfValue(0));
32948 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32949 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32950 DAG.getUNDEF(VT), LockOp.getValue(1));
32951}
32952
32953 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32954 const X86Subtarget &Subtarget) {
32955 auto *Node = cast<AtomicSDNode>(Op.getNode());
32956 SDLoc dl(Node);
32957 EVT VT = Node->getMemoryVT();
32958
32959 bool IsSeqCst =
32960 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32961 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32962
32963 // If this store is not sequentially consistent and the type is legal
32964 // we can just keep it.
32965 if (!IsSeqCst && IsTypeLegal)
32966 return Op;
32967
32968 if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
32969 !DAG.getMachineFunction().getFunction().hasFnAttribute(
32970 Attribute::NoImplicitFloat)) {
32971 SDValue Chain;
32972 // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
32973 // vector store.
32974 if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
32975 SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
32976 Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
32977 Node->getMemOperand());
32978 }
32979
32980 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32981 // is enabled.
32982 if (VT == MVT::i64) {
32983 if (Subtarget.hasSSE1()) {
32984 SDValue SclToVec =
32985 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
32986 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32987 SclToVec = DAG.getBitcast(StVT, SclToVec);
32988 SDVTList Tys = DAG.getVTList(MVT::Other);
32989 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
32990 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
32991 MVT::i64, Node->getMemOperand());
32992 } else if (Subtarget.hasX87()) {
32993 // First load this into an 80-bit X87 register using a stack temporary.
32994 // This will put the whole integer into the significand.
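// The x87 extended type has a 64-bit significand, so a 64-bit integer
// round-trips exactly through FILD/FIST, giving an atomic 8-byte store on
// 32-bit targets that lack SSE2.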
32995 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32996 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32997 MachinePointerInfo MPI =
32998 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32999 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
33000 MPI, MaybeAlign(), MachineMemOperand::MOVolatile);
33001 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33002 SDValue LdOps[] = {Chain, StackPtr};
33003 SDValue Value = DAG.getMemIntrinsicNode(
33004 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33005 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33006 Chain = Value.getValue(1);
33007
33008 // Now use an FIST to do the atomic store.
33009 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33010 Chain =
33011 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33012 StoreOps, MVT::i64, Node->getMemOperand());
33013 }
33014 }
33015
33016 if (Chain) {
33017 // If this is a sequentially consistent store, also emit an appropriate
33018 // barrier.
33019 if (IsSeqCst)
33020 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33021
33022 return Chain;
33023 }
33024 }
33025
33026 // Convert seq_cst store -> xchg
33027 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33028 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33029 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
33030 Node->getOperand(0), Node->getOperand(2),
33031 Node->getOperand(1), Node->getMemOperand());
33032 return Swap.getValue(1);
33033}
33034
33035 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
33036 SDNode *N = Op.getNode();
33037 MVT VT = N->getSimpleValueType(0);
33038 unsigned Opc = Op.getOpcode();
33039
33040 // Let legalize expand this if it isn't a legal type yet.
33041 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33042 return SDValue();
33043
33044 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33045 SDLoc DL(N);
33046
33047 // Set the carry flag.
33048 SDValue Carry = Op.getOperand(2);
33049 EVT CarryVT = Carry.getValueType();
33050 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33051 Carry, DAG.getAllOnesConstant(DL, CarryVT));
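// Adding all-ones (-1) to the incoming carry wraps exactly when the carry is
// nonzero, so CF ends up equal to the boolean carry-in and can feed ADC/SBB.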
33052
33053 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
33054 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33055 Op.getOperand(0), Op.getOperand(1),
33056 Carry.getValue(1));
33057
33058 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33059 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33060 Sum.getValue(1), DL, DAG);
33061 if (N->getValueType(1) == MVT::i1)
33062 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33063
33064 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33065}
33066
33067static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33068 SelectionDAG &DAG) {
33069 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33070
33071 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33072 // which returns the values as { float, float } (in XMM0) or
33073 // { double, double } (which is returned in XMM0, XMM1).
33074 SDLoc dl(Op);
33075 SDValue Arg = Op.getOperand(0);
33076 EVT ArgVT = Arg.getValueType();
33077 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33078
33079 TargetLowering::ArgListTy Args;
33080 Args.emplace_back(Arg, ArgTy);
33081
33082 bool isF64 = ArgVT == MVT::f64;
33083 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33084 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33085 // the results are returned via SRet in memory.
33086 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33087 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33088 const char *LibcallName = TLI.getLibcallName(LC);
33089 SDValue Callee =
33090 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33091
33092 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33093 : (Type *)FixedVectorType::get(ArgTy, 4);
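// Modelling the f32 return as v4f32 matches the fact noted above that the
// { float, float } result comes back packed in xmm0; the two values are then
// pulled out of lanes 0 and 1 below.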
33094
33096 CLI.setDebugLoc(dl)
33097 .setChain(DAG.getEntryNode())
33098 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33099
33100 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33101
33102 if (isF64)
33103 // Returned in xmm0 and xmm1.
33104 return CallResult.first;
33105
33106 // Returned in bits 0:31 and 32:64 xmm0.
33107 SDValue SinVal =
33108 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33109 DAG.getVectorIdxConstant(0, dl));
33110 SDValue CosVal =
33111 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
33112 DAG.getVectorIdxConstant(1, dl));
33113 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33114 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33115}
33116
33117/// Widen a vector input to a vector of NVT. The
33118/// input vector must have the same element type as NVT.
33119 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33120 bool FillWithZeroes = false) {
33121 // Check if InOp already has the right width.
33122 MVT InVT = InOp.getSimpleValueType();
33123 if (InVT == NVT)
33124 return InOp;
33125
33126 if (InOp.isUndef())
33127 return DAG.getUNDEF(NVT);
33128
33130 "input and widen element type must match");
33131
33132 unsigned InNumElts = InVT.getVectorNumElements();
33133 unsigned WidenNumElts = NVT.getVectorNumElements();
33134 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33135 "Unexpected request for vector widening");
33136
33137 SDLoc dl(InOp);
33138 if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
33139 SDValue N1 = InOp.getOperand(1);
33140 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33141 N1.isUndef()) {
33142 InOp = InOp.getOperand(0);
33143 InVT = InOp.getSimpleValueType();
33144 InNumElts = InVT.getVectorNumElements();
33145 }
33146 }
33147 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33148 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33149 EVT EltVT = InOp.getOperand(0).getValueType();
33150 SDValue FillVal =
33151 FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT);
33152 SmallVector<SDValue, 16> Ops(InOp->op_begin(), InOp->op_end());
33153 Ops.append(WidenNumElts - InNumElts, FillVal);
33154 return DAG.getBuildVector(NVT, dl, Ops);
33155 }
33156 SDValue FillVal =
33157 FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
33158 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
33159 DAG.getVectorIdxConstant(0, dl));
33160}
33161
33162 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33163 SelectionDAG &DAG) {
33164 assert(Subtarget.hasAVX512() &&
33165 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33166
33167 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33168 SDValue Src = N->getValue();
33169 MVT VT = Src.getSimpleValueType();
33170 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33171 SDLoc dl(Op);
33172
33173 SDValue Scale = N->getScale();
33174 SDValue Index = N->getIndex();
33175 SDValue Mask = N->getMask();
33176 SDValue Chain = N->getChain();
33177 SDValue BasePtr = N->getBasePtr();
33178
33179 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33180 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33181 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33182 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33183 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33184 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33185 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33186 SDVTList VTs = DAG.getVTList(MVT::Other);
33187 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33188 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33189 N->getMemoryVT(), N->getMemOperand());
33190 }
33191 return SDValue();
33192 }
33193
33194 MVT IndexVT = Index.getSimpleValueType();
33195
33196 // If the index is v2i32, we're being called by type legalization and we
33197 // should just let the default handling take care of it.
33198 if (IndexVT == MVT::v2i32)
33199 return SDValue();
33200
33201 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
33202 // need to widen until one is.
33203 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33204 !Index.getSimpleValueType().is512BitVector()) {
33205 // Determine how much we need to widen by to get a 512-bit type.
33206 unsigned Factor = std::min(512/VT.getSizeInBits(),
33207 512/IndexVT.getSizeInBits());
33208 unsigned NumElts = VT.getVectorNumElements() * Factor;
33209
33210 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33211 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33212 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33213
33214 Src = ExtendToType(Src, VT, DAG);
33215 Index = ExtendToType(Index, IndexVT, DAG);
33216 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33217 }
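// For example, scattering v4i32 data with v4i64 indices without VLX widens by
// Factor = min(512/128, 512/256) = 2, giving v8i32 data, v8i64 indices and a
// v8i1 mask whose upper lanes are zeroed so the extra elements never store.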
33218
33219 SDVTList VTs = DAG.getVTList(MVT::Other);
33220 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33221 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33222 N->getMemoryVT(), N->getMemOperand());
33223}
33224
33225static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33226 SelectionDAG &DAG) {
33227
33228 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33229 MVT VT = Op.getSimpleValueType();
33230 MVT ScalarVT = VT.getScalarType();
33231 SDValue Mask = N->getMask();
33232 MVT MaskVT = Mask.getSimpleValueType();
33233 SDValue PassThru = N->getPassThru();
33234 SDLoc dl(Op);
33235
33236 // Handle AVX masked loads which don't support passthru other than 0.
33237 if (MaskVT.getVectorElementType() != MVT::i1) {
33238 // We also allow undef in the isel pattern.
33239 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33240 return Op;
33241
33242 SDValue NewLoad = DAG.getMaskedLoad(
33243 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33244 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33245 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33246 N->isExpandingLoad());
33247 // Emit a blend.
33248 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33249 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33250 }
33251
33252 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33253 "Expanding masked load is supported on AVX-512 target only!");
33254
33255 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33256 "Expanding masked load is supported for 32 and 64-bit types only!");
33257
33258 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33259 "Cannot lower masked load op.");
33260
33261 assert((ScalarVT.getSizeInBits() >= 32 ||
33262 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33263 ScalarVT == MVT::f16))) &&
33264 "Unsupported masked load op.");
33265
33266 // This operation is legal for targets with VLX, but without
33267 // VLX the vector should be widened to 512 bits.
33268 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33269 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33270 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33271
33272 // Mask element has to be i1.
33273 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33274 "Unexpected mask type");
33275
33276 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33277
33278 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33279 SDValue NewLoad = DAG.getMaskedLoad(
33280 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33281 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33282 N->getExtensionType(), N->isExpandingLoad());
33283
33284 SDValue Extract =
33285 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33286 DAG.getVectorIdxConstant(0, dl));
33287 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33288 return DAG.getMergeValues(RetOps, dl);
33289}
33290
33291static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33292 SelectionDAG &DAG) {
33293 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33294 SDValue DataToStore = N->getValue();
33295 MVT VT = DataToStore.getSimpleValueType();
33296 MVT ScalarVT = VT.getScalarType();
33297 SDValue Mask = N->getMask();
33298 SDLoc dl(Op);
33299
33300 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33301 "Compressing masked store is supported on AVX-512 target only!");
33302
33303 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33304 "Compressing masked store is supported for 32 and 64-bit types only!");
33305
33306 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33307 "Cannot lower masked store op.");
33308
33309 assert((ScalarVT.getSizeInBits() >= 32 ||
33310 (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16 ||
33311 ScalarVT == MVT::f16))) &&
33312 "Unsupported masked store op.");
33313
33314 // This operation is legal for targets with VLX, but without
33315 // VLX the vector should be widened to 512 bits.
33316 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33317 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33318
33319 // Mask element has to be i1.
33320 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33321 "Unexpected mask type");
33322
33323 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33324
33325 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33326 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33327 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33328 N->getOffset(), Mask, N->getMemoryVT(),
33329 N->getMemOperand(), N->getAddressingMode(),
33330 N->isTruncatingStore(), N->isCompressingStore());
33331}
33332
33333static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33334 SelectionDAG &DAG) {
33335 assert(Subtarget.hasAVX2() &&
33336 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33337
33338 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33339 SDLoc dl(Op);
33340 MVT VT = Op.getSimpleValueType();
33341 SDValue Index = N->getIndex();
33342 SDValue Mask = N->getMask();
33343 SDValue PassThru = N->getPassThru();
33344 MVT IndexVT = Index.getSimpleValueType();
33345
33346 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33347
33348 // If the index is v2i32, we're being called by type legalization.
33349 if (IndexVT == MVT::v2i32)
33350 return SDValue();
33351
33352 // If we don't have VLX and neither the passthru nor the index is 512-bits, we
33353 // need to widen until one is.
33354 MVT OrigVT = VT;
33355 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33356 !IndexVT.is512BitVector()) {
33357 // Determine how much we need to widen by to get a 512-bit type.
33358 unsigned Factor = std::min(512/VT.getSizeInBits(),
33359 512/IndexVT.getSizeInBits());
33360
33361 unsigned NumElts = VT.getVectorNumElements() * Factor;
33362
33363 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33364 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33365 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33366
33367 PassThru = ExtendToType(PassThru, VT, DAG);
33368 Index = ExtendToType(Index, IndexVT, DAG);
33369 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33370 }
33371
33372 // Break dependency on the data register.
33373 if (PassThru.isUndef())
33374 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33375
33376 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33377 N->getScale() };
33378 SDValue NewGather = DAG.getMemIntrinsicNode(
33379 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33380 N->getMemOperand());
33381 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather,
33382 DAG.getVectorIdxConstant(0, dl));
33383 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33384}
33385
33386 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33387 SDLoc dl(Op);
33388 SDValue Src = Op.getOperand(0);
33389 MVT DstVT = Op.getSimpleValueType();
33390
33391 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33392 unsigned SrcAS = N->getSrcAddressSpace();
33393
33394 assert(SrcAS != N->getDestAddressSpace() &&
33395 "addrspacecast must be between different address spaces");
33396
33397 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33398 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33399 } else if (DstVT == MVT::i64) {
33400 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33401 } else if (DstVT == MVT::i32) {
33402 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33403 } else {
33404 report_fatal_error("Bad address space in addrspacecast");
33405 }
33406 return Op;
33407}
33408
33409SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33410 SelectionDAG &DAG) const {
33411 // TODO: Eventually, the lowering of these nodes should be informed by or
33412 // deferred to the GC strategy for the function in which they appear. For
33413 // now, however, they must be lowered to something. Since they are logically
33414 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33415 // require special handling for these nodes), lower them as literal NOOPs for
33416 // the time being.
33417 SmallVector<SDValue, 2> Ops;
33418 Ops.push_back(Op.getOperand(0));
33419 if (Op->getGluedNode())
33420 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33421
33422 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33423 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33424}
33425
33426// Custom split CVTPS2PH with wide types.
33427 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33428 SDLoc dl(Op);
33429 EVT VT = Op.getValueType();
33430 SDValue Lo, Hi;
33431 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33432 EVT LoVT, HiVT;
33433 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33434 SDValue RC = Op.getOperand(1);
33435 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33436 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33437 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33438}
33439
33440 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
33441 SelectionDAG &DAG) {
33442 unsigned IsData = Op.getConstantOperandVal(4);
33443
33444 // We don't support non-data prefetch without PREFETCHI.
33445 // Just preserve the chain.
33446 if (!IsData && !Subtarget.hasPREFETCHI())
33447 return Op.getOperand(0);
33448
33449 return Op;
33450}
33451
33452 static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) {
33453 SDNode *N = Op.getNode();
33454 SDValue Operand = N->getOperand(0);
33455 EVT VT = Operand.getValueType();
33456 SDLoc dl(N);
33457
33458 SDValue One = DAG.getConstantFP(1.0, dl, VT);
33459
33460 // TODO: Fix Crash for bf16 when generating strict_fmul as it
33461 // leads to an error: SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0,
33462 // ConstantFP:bf16<APFloat(16256)>, t5 LLVM ERROR: Do not know how to soft
33463 // promote this operator's result!
33464 SDValue Chain = DAG.getEntryNode();
33465 SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
33466 {Chain, Operand, One});
33467 return StrictFmul;
33468}
33469
33470 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33471 unsigned OpNo) {
33472 const APInt Operand(32, OpNo);
33473 std::string OpNoStr = llvm::toString(Operand, 10, false);
33474 std::string Str(" $");
33475
33476 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33477 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33478
33479 auto I = StringRef::npos;
33480 for (auto &AsmStr : AsmStrs) {
33481 // Match the OpNo string exactly so that we do not match a sub-string,
33482 // e.g. "$12" contains "$1".
33483 if (AsmStr.ends_with(OpNoStr1))
33484 I = AsmStr.size() - OpNoStr1.size();
33485
33486 // Get the index of operand in AsmStr.
33487 if (I == StringRef::npos)
33488 I = AsmStr.find(OpNoStr1 + ",");
33489 if (I == StringRef::npos)
33490 I = AsmStr.find(OpNoStr2);
33491
33492 if (I == StringRef::npos)
33493 continue;
33494
33495 assert(I > 0 && "Unexpected inline asm string!");
33496 // Remove the operand string and label (if they exist).
33497 // For example:
33498 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33499 // ==>
33500 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33501 // ==>
33502 // "call dword ptr "
33503 auto TmpStr = AsmStr.substr(0, I);
33504 I = TmpStr.rfind(':');
33505 if (I != StringRef::npos)
33506 TmpStr = TmpStr.substr(I + 1);
33507 return TmpStr.take_while(llvm::isAlpha);
33508 }
33509
33510 return StringRef();
33511}
33512
33513 bool X86TargetLowering::isInlineAsmTargetBranch(
33514 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33515 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
33516 // changed from indirect TargetLowering::C_Memory to direct
33517 // TargetLowering::C_Address.
33518 // We don't need to special case LOOP* and Jcc, which cannot target a memory
33519 // location.
33520 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
33521 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
33522}
33523
33524 static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
33525 SDValue Mask) {
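// Materialize EFLAGS from the v1i1 mask: compute 0 - zext(mask) so that ZF is
// set exactly when the mask is false; callers then test COND_NE, and the flags
// are taken from result #1 of the X86ISD::SUB node.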
33526 EVT Ty = MVT::i8;
33527 auto V = DAG.getBitcast(MVT::i1, Mask);
33528 auto VE = DAG.getZExtOrTrunc(V, DL, Ty);
33529 auto Zero = DAG.getConstant(0, DL, Ty);
33530 SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
33531 auto CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
33532 return SDValue(CmpZero.getNode(), 1);
33533}
33534
33535 SDValue X86TargetLowering::visitMaskedLoad(
33536 SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
33537 SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
33538 // @llvm.masked.load.v1*(ptr, alignment, mask, passthru)
33539 // ->
33540 // _, flags = SUB 0, mask
33541 // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
33542 // bit_cast_to_vector<res>
33543 EVT VTy = PassThru.getValueType();
33544 EVT Ty = VTy.getVectorElementType();
33545 SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
33546 auto ScalarPassThru = PassThru.isUndef() ? DAG.getConstant(0, DL, Ty)
33547 : DAG.getBitcast(Ty, PassThru);
33548 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33549 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33550 SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
33551 NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
33552 return DAG.getBitcast(VTy, NewLoad);
33553}
33554
33555 SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
33556 SDValue Chain,
33557 MachineMemOperand *MMO, SDValue Ptr,
33558 SDValue Val, SDValue Mask) const {
33559 // llvm.masked.store.v1*(Src0, Ptr, alignment, Mask)
33560 // ->
33561 // _, flags = SUB 0, mask
33562 // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
33563 EVT Ty = Val.getValueType().getVectorElementType();
33564 SDVTList Tys = DAG.getVTList(MVT::Other);
33565 auto ScalarVal = DAG.getBitcast(Ty, Val);
33566 auto Flags = getFlagsOfCmpZeroFori1(DAG, DL, Mask);
33567 auto COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
33568 SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
33569 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
33570}
33571
33572/// Provide custom lowering hooks for some operations.
33573 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33574 switch (Op.getOpcode()) {
33575 // clang-format off
33576 default: llvm_unreachable("Should not custom lower this!");
33577 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33578 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33579 return LowerCMP_SWAP(Op, Subtarget, DAG);
33580 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33581 case ISD::ATOMIC_LOAD_ADD:
33582 case ISD::ATOMIC_LOAD_SUB:
33583 case ISD::ATOMIC_LOAD_OR:
33584 case ISD::ATOMIC_LOAD_XOR:
33585 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33586 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33587 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33588 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33589 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33590 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33591 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33592 case ISD::VECTOR_COMPRESS: return lowerVECTOR_COMPRESS(Op, Subtarget, DAG);
33593 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33594 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33595 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33596 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33597 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33598 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33599 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33600 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33601 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33602 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33603 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33604 case ISD::SHL_PARTS:
33605 case ISD::SRA_PARTS:
33606 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33607 case ISD::FSHL:
33608 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33609 case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG);
33610 case ISD::STRICT_SINT_TO_FP:
33611 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33612 case ISD::STRICT_UINT_TO_FP:
33613 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33614 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33615 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33616 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33617 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33620 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33621 case ISD::FP_TO_SINT:
33622 case ISD::STRICT_FP_TO_SINT:
33623 case ISD::FP_TO_UINT:
33624 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33625 case ISD::FP_TO_SINT_SAT:
33626 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33627 case ISD::FP_EXTEND:
33628 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33629 case ISD::FP_ROUND:
33630 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33631 case ISD::FP16_TO_FP:
33632 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33633 case ISD::FP_TO_FP16:
33634 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33635 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33636 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33637 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33638 case ISD::FADD:
33639 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33640 case ISD::FROUND: return LowerFROUND(Op, DAG);
33641 case ISD::FABS:
33642 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33643 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33644 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33645 case ISD::LRINT:
33646 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33647 case ISD::SETCC:
33648 case ISD::STRICT_FSETCC:
33649 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33650 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33651 case ISD::SELECT: return LowerSELECT(Op, DAG);
33652 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33653 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33654 case ISD::VASTART: return LowerVASTART(Op, DAG);
33655 case ISD::VAARG: return LowerVAARG(Op, DAG);
33656 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33657 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33658 case ISD::INTRINSIC_VOID:
33659 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33660 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33661 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33662 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33663 case ISD::FRAME_TO_ARGS_OFFSET:
33664 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33665 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33666 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33667 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33668 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33669 case ISD::EH_SJLJ_SETUP_DISPATCH:
33670 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33671 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33672 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33673 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33674 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33675 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
33676 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
33677 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
33678 case ISD::CTLZ:
33679 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33680 case ISD::CTTZ:
33681 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33682 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33683 case ISD::MULHS:
33684 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33685 case ISD::ROTL:
33686 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33687 case ISD::SRA:
33688 case ISD::SRL:
33689 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33690 case ISD::SADDO:
33691 case ISD::UADDO:
33692 case ISD::SSUBO:
33693 case ISD::USUBO: return LowerXALUO(Op, DAG);
33694 case ISD::SMULO:
33695 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33696 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33697 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33698 case ISD::SADDO_CARRY:
33699 case ISD::SSUBO_CARRY:
33700 case ISD::UADDO_CARRY:
33701 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
33702 case ISD::ADD:
33703 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33704 case ISD::UADDSAT:
33705 case ISD::SADDSAT:
33706 case ISD::USUBSAT:
33707 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33708 case ISD::SMAX:
33709 case ISD::SMIN:
33710 case ISD::UMAX:
33711 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33712 case ISD::FMINIMUM:
33713 case ISD::FMAXIMUM:
33714 case ISD::FMINIMUMNUM:
33715 case ISD::FMAXIMUMNUM:
33716 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
33717 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33718 case ISD::ABDS:
33719 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33720 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33721 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33722 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33723 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33724 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33725 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33726 case ISD::GC_TRANSITION_START:
33727 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33728 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33729 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33730 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
33731 // clang-format on
33732 }
33733}
33734
33735/// Replace a node with an illegal result type with a new node built out of
33736/// custom code.
33737 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33738 SmallVectorImpl<SDValue> &Results,
33739 SelectionDAG &DAG) const {
33740 SDLoc dl(N);
33741 unsigned Opc = N->getOpcode();
33742 switch (Opc) {
33743 default:
33744#ifndef NDEBUG
33745 dbgs() << "ReplaceNodeResults: ";
33746 N->dump(&DAG);
33747#endif
33748 llvm_unreachable("Do not know how to custom type legalize this operation!");
33749 case X86ISD::CVTPH2PS: {
33750 EVT VT = N->getValueType(0);
33751 SDValue Lo, Hi;
33752 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33753 EVT LoVT, HiVT;
33754 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33755 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33756 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33757 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33758 Results.push_back(Res);
33759 return;
33760 }
33761 case X86ISD::STRICT_CVTPH2PS: {
33762 EVT VT = N->getValueType(0);
33763 SDValue Lo, Hi;
33764 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33765 EVT LoVT, HiVT;
33766 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33767 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33768 {N->getOperand(0), Lo});
33769 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33770 {N->getOperand(0), Hi});
33771 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33772 Lo.getValue(1), Hi.getValue(1));
33773 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33774 Results.push_back(Res);
33775 Results.push_back(Chain);
33776 return;
33777 }
33778 case X86ISD::CVTPS2PH:
33779 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33780 return;
33781 case ISD::CTPOP: {
33782 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33783 // If we have at most 32 active bits, then perform as i32 CTPOP.
33784 // TODO: Perform this in generic legalizer?
33785 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
33786 unsigned LZ = Known.countMinLeadingZeros();
33787 unsigned TZ = Known.countMinTrailingZeros();
33788 if ((LZ + TZ) >= 32) {
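// e.g. for a value known to look like 0x00000000FFFFFF00 (LZ=32, TZ=8),
// shifting right by TZ and truncating to i32 keeps every set bit, so a
// 32-bit CTPOP gives the same answer.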
33789 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
33790 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
33791 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
33792 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
33793 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
33794 Results.push_back(Op);
33795 return;
33796 }
33797 // Use a v2i64 if possible.
33798 bool NoImplicitFloatOps =
33799 DAG.getMachineFunction().getFunction().hasFnAttribute(
33800 Attribute::NoImplicitFloat);
33801 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33802 SDValue Wide =
33803 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33804 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33805 // Bit count should fit in 32-bits, extract it as that and then zero
33806 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33807 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33808 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33809 DAG.getVectorIdxConstant(0, dl));
33810 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33811 Results.push_back(Wide);
33812 }
33813 return;
33814 }
33815 case ISD::MUL: {
33816 EVT VT = N->getValueType(0);
33817 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33818 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33819 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33820 // elements are needed.
33821 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33822 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33823 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33824 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33825 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33826 unsigned NumConcats = 16 / VT.getVectorNumElements();
33827 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33828 ConcatOps[0] = Res;
33829 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33830 Results.push_back(Res);
33831 return;
33832 }
33833 case ISD::SMULO:
33834 case ISD::UMULO: {
33835 EVT VT = N->getValueType(0);
33836 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33837 VT == MVT::v2i32 && "Unexpected VT!");
33838 bool IsSigned = Opc == ISD::SMULO;
33839 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33840 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33841 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33842 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33843 // Extract the high 32 bits from each result using PSHUFD.
33844 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33845 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33846 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33847 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33848 DAG.getVectorIdxConstant(0, dl));
33849
33850 // Truncate the low bits of the result. This will become PSHUFD.
33851 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33852
33853 SDValue HiCmp;
33854 if (IsSigned) {
33855 // SMULO overflows if the high bits don't match the sign of the low.
33856 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33857 } else {
33858 // UMULO overflows if the high bits are non-zero.
33859 HiCmp = DAG.getConstant(0, dl, VT);
33860 }
33861 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
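// For example, signed 0x40000000 * 4 = 0x100000000: the low 32 bits are 0,
// the high 32 bits are 1, and sra(low, 31) = 0, so Hi != HiCmp flags the
// overflow.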
33862
33863 // Widen the result by padding with undef.
33864 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33865 DAG.getUNDEF(VT));
33866 Results.push_back(Res);
33867 Results.push_back(Ovf);
33868 return;
33869 }
33870 case X86ISD::VPMADDWD: {
33871 // Legalize types for X86ISD::VPMADDWD by widening.
33872 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33873
33874 EVT VT = N->getValueType(0);
33875 EVT InVT = N->getOperand(0).getValueType();
33876 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33877 "Expected a VT that divides into 128 bits.");
33879 "Unexpected type action!");
33880 unsigned NumConcat = 128 / InVT.getSizeInBits();
33881
33882 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33883 InVT.getVectorElementType(),
33884 NumConcat * InVT.getVectorNumElements());
33885 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33886 VT.getVectorElementType(),
33887 NumConcat * VT.getVectorNumElements());
33888
33889 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33890 Ops[0] = N->getOperand(0);
33891 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33892 Ops[0] = N->getOperand(1);
33893 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33894
33895 SDValue Res = DAG.getNode(Opc, dl, WideVT, InVec0, InVec1);
33896 Results.push_back(Res);
33897 return;
33898 }
33899 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33900 case X86ISD::FMINC:
33901 case X86ISD::FMIN:
33902 case X86ISD::FMAXC:
33903 case X86ISD::FMAX:
33904 case X86ISD::STRICT_FMIN:
33905 case X86ISD::STRICT_FMAX: {
33906 EVT VT = N->getValueType(0);
33907 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33908 bool IsStrict = Opc == X86ISD::STRICT_FMIN || Opc == X86ISD::STRICT_FMAX;
33909 SDValue UNDEF = DAG.getUNDEF(VT);
33910 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33911 N->getOperand(IsStrict ? 1 : 0), UNDEF);
33912 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33913 N->getOperand(IsStrict ? 2 : 1), UNDEF);
33914 SDValue Res;
33915 if (IsStrict)
33916 Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33917 {N->getOperand(0), LHS, RHS});
33918 else
33919 Res = DAG.getNode(Opc, dl, MVT::v4f32, LHS, RHS);
33920 Results.push_back(Res);
33921 if (IsStrict)
33922 Results.push_back(Res.getValue(1));
33923 return;
33924 }
33925 case ISD::SDIV:
33926 case ISD::UDIV:
33927 case ISD::SREM:
33928 case ISD::UREM: {
33929 EVT VT = N->getValueType(0);
33930 if (VT.isVector()) {
33932 "Unexpected type action!");
33933 // If this RHS is a constant splat vector we can widen this and let
33934 // division/remainder by constant optimize it.
33935 // TODO: Can we do something for non-splat?
33936 APInt SplatVal;
33937 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33938 unsigned NumConcats = 128 / VT.getSizeInBits();
33939 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33940 Ops0[0] = N->getOperand(0);
33941 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33942 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33943 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33944 SDValue Res = DAG.getNode(Opc, dl, ResVT, N0, N1);
33945 Results.push_back(Res);
33946 }
33947 return;
33948 }
33949
33950 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33951 Results.push_back(V);
33952 return;
33953 }
33954 case ISD::TRUNCATE: {
33955 MVT VT = N->getSimpleValueType(0);
33956 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33957 return;
33958
33959 // The generic legalizer will try to widen the input type to the same
33960 // number of elements as the widened result type. But this isn't always
33961 // the best thing so do some custom legalization to avoid some cases.
33962 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33963 SDValue In = N->getOperand(0);
33964 EVT InVT = In.getValueType();
33965 EVT InEltVT = InVT.getVectorElementType();
33966 EVT EltVT = VT.getVectorElementType();
33967 unsigned MinElts = VT.getVectorNumElements();
33968 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33969 unsigned InBits = InVT.getSizeInBits();
33970
33971 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
33972 unsigned PackOpcode;
33973 if (SDValue Src = matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG,
33974 Subtarget, N->getFlags())) {
33975 if (SDValue Res =
33976 truncateVectorWithPACK(PackOpcode, VT, Src, dl, DAG, Subtarget)) {
33977 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
33978 Results.push_back(Res);
33979 return;
33980 }
33981 }
33982
33983 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
33984 // 128-bit and smaller inputs should avoid the truncate altogether and
33985 // use a shuffle.
33986 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
33987 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
33988 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
33989 for (unsigned I = 0; I < MinElts; ++I)
33990 TruncMask[I] = Scale * I;
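// e.g. truncating v4i32 to v4i8 (widened to v16i8) uses Scale = 4 and the
// mask {0, 4, 8, 12, ...}, i.e. the low byte of each dword on little-endian.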
33991 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
33992 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
33993 "Illegal vector type in truncation");
33994 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
33995 Results.push_back(
33996 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
33997 return;
33998 }
33999 }
34000
34001 // With AVX512 there are some cases that can use a target specific
34002 // truncate node to go from 256/512 to less than 128 with zeros in the
34003 // upper elements of the 128 bit result.
34004 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34005 // We can use VTRUNC directly for 256-bit inputs with VLX, or for any 512-bit input.
34006 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34007 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34008 return;
34009 }
34010 // There's one case we can widen to 512 bits and use VTRUNC.
34011 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34012 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34013 DAG.getUNDEF(MVT::v4i64));
34014 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34015 return;
34016 }
34017 }
34018 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34019 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34020 isTypeLegal(MVT::v4i64)) {
34021 // Input needs to be split and output needs to widened. Let's use two
34022 // VTRUNCs, and shuffle their results together into the wider type.
34023 SDValue Lo, Hi;
34024 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34025
34026 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34027 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34028 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34029 { 0, 1, 2, 3, 16, 17, 18, 19,
34030 -1, -1, -1, -1, -1, -1, -1, -1 });
34031 Results.push_back(Res);
34032 return;
34033 }
34034
34035 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34036 // this via type legalization.
34037 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
34038 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34039 (!Subtarget.hasSSSE3() ||
34040 (!isTypeLegal(InVT) &&
34041 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
34042 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
34043 InEltVT.getSizeInBits() * WidenNumElts);
34044 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
34045 return;
34046 }
34047
34048 return;
34049 }
34050 case ISD::ANY_EXTEND:
34051 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34052 // It's intended to custom handle the input type.
34053 assert(N->getValueType(0) == MVT::v8i8 &&
34054 "Do not know how to legalize this Node");
34055 return;
34056 case ISD::SIGN_EXTEND:
34057 case ISD::ZERO_EXTEND: {
34058 EVT VT = N->getValueType(0);
34059 SDValue In = N->getOperand(0);
34060 EVT InVT = In.getValueType();
34061 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34062 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34064 "Unexpected type action!");
34065 assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
34066 // Custom split this so we can extend i8/i16->i32 invec. This is better
34067 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34068 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34069 // we allow the sra from the extend to i32 to be shared by the split.
34070 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34071
34072 // Fill a vector with sign bits for each element.
34073 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34074 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34075
34076 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34077 // to v2i64.
34078 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34079 {0, 4, 1, 5});
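// Interleaving {x0,x1,x2,x3} with {s0,s1,s2,s3} as {0,4,1,5} yields
// {x0,s0,x1,s1}; viewed as v2i64 on little-endian this is exactly the
// 64-bit sign extension of x0 and x1.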
34080 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34081 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34082 {2, 6, 3, 7});
34083 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34084
34085 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34086 Results.push_back(Res);
34087 return;
34088 }
34089
34090 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34091 if (!InVT.is128BitVector()) {
34092 // Not a 128 bit vector, but maybe type legalization will promote
34093 // it to 128 bits.
34094 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34095 return;
34096 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34097 if (!InVT.is128BitVector())
34098 return;
34099
34100 // Promote the input to 128 bits. Type legalization will turn this into
34101 // zext_inreg/sext_inreg.
34102 In = DAG.getNode(Opc, dl, InVT, In);
34103 }
34104
34105 // Perform custom splitting instead of the two stage extend we would get
34106 // by default.
34107 EVT LoVT, HiVT;
34108 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34109 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34110
34111 SDValue Lo = getEXTEND_VECTOR_INREG(Opc, dl, LoVT, In, DAG);
34112
34113 // We need to shift the input over by half the number of elements.
34114 unsigned NumElts = InVT.getVectorNumElements();
34115 unsigned HalfNumElts = NumElts / 2;
34116 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34117 for (unsigned i = 0; i != HalfNumElts; ++i)
34118 ShufMask[i] = i + HalfNumElts;
34119
34120 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34121 Hi = getEXTEND_VECTOR_INREG(Opc, dl, HiVT, Hi, DAG);
34122
34123 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34124 Results.push_back(Res);
34125 }
34126 return;
34127 }
34128 case ISD::FP_TO_SINT_SAT:
34129 case ISD::FP_TO_UINT_SAT: {
34130 if (!Subtarget.hasAVX10_2())
34131 return;
34132
34133 bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
34134 EVT VT = N->getValueType(0);
34135 SDValue Op = N->getOperand(0);
34136 EVT OpVT = Op.getValueType();
34137 SDValue Res;
34138
34139 if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
34140 if (IsSigned)
34141 Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
34142 else
34143 Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
34144 Results.push_back(Res);
34145 }
34146 return;
34147 }
34148 case ISD::FP_TO_SINT:
34149 case ISD::STRICT_FP_TO_SINT:
34150 case ISD::FP_TO_UINT:
34151 case ISD::STRICT_FP_TO_UINT: {
34152 bool IsStrict = N->isStrictFPOpcode();
34153 bool IsSigned = Opc == ISD::FP_TO_SINT || Opc == ISD::STRICT_FP_TO_SINT;
34154 EVT VT = N->getValueType(0);
34155 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34156 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34157 EVT SrcVT = Src.getValueType();
34158
34159 SDValue Res;
34160 if (isSoftF16(SrcVT, Subtarget)) {
34161 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34162 if (IsStrict) {
34163 Res =
34164 DAG.getNode(Opc, dl, {VT, MVT::Other},
34165 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34166 {NVT, MVT::Other}, {Chain, Src})});
34167 Chain = Res.getValue(1);
34168 } else {
34169 Res =
34170 DAG.getNode(Opc, dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34171 }
34172 Results.push_back(Res);
34173 if (IsStrict)
34174 Results.push_back(Chain);
34175
34176 return;
34177 }
34178
34179 if (VT.isVector() && Subtarget.hasFP16() && Subtarget.hasVLX() &&
34180 SrcVT.getVectorElementType() == MVT::f16) {
34181 EVT EleVT = VT.getVectorElementType();
34182 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34183
34184 if (SrcVT != MVT::v8f16) {
34185 SDValue Tmp =
34186 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34187 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34188 Ops[0] = Src;
34189 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34190 }
34191
34192 if (IsStrict) {
34193 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34194 Res =
34195 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34196 Chain = Res.getValue(1);
34197 } else {
34198 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34199 Res = DAG.getNode(Opc, dl, ResVT, Src);
34200 }
34201
34202 // TODO: Need to add exception check code for strict FP.
34203 if (EleVT.getSizeInBits() < 16) {
34204 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34205 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34206
34207 // Now widen to 128 bits.
34208 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34209 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34210 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34211 ConcatOps[0] = Res;
34212 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34213 }
34214
34215 Results.push_back(Res);
34216 if (IsStrict)
34217 Results.push_back(Chain);
34218
34219 return;
34220 }
34221
34222 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34223      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34224 "Unexpected type action!");
34225
34226 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34227 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34228 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34229                                       VT.getVectorNumElements());
34230 SDValue Res;
34231 SDValue Chain;
34232 if (IsStrict) {
34233 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34234 {N->getOperand(0), Src});
34235 Chain = Res.getValue(1);
34236 } else
34237 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34238
34239 // Preserve what we know about the size of the original result. If the
34240 // result is v2i32, we have to manually widen the assert.
34241 if (PromoteVT == MVT::v2i32)
34242 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34243 DAG.getUNDEF(MVT::v2i32));
34244
34245 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34246 Res.getValueType(), Res,
34247                        DAG.getValueType(VT.getVectorElementType()));
34248
34249 if (PromoteVT == MVT::v2i32)
34250 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34251 DAG.getVectorIdxConstant(0, dl));
34252
34253 // Truncate back to the original width.
34254 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34255
34256 // Now widen to 128 bits.
34257 unsigned NumConcats = 128 / VT.getSizeInBits();
34258      MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34259 VT.getVectorNumElements() * NumConcats);
34260 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34261 ConcatOps[0] = Res;
34262 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34263 Results.push_back(Res);
34264 if (IsStrict)
34265 Results.push_back(Chain);
34266 return;
34267 }
34268
34269
34270 if (VT == MVT::v2i32) {
34271 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34272 "Strict unsigned conversion requires AVX512");
34273 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34274      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34275 "Unexpected type action!");
34276 if (Src.getValueType() == MVT::v2f64) {
34277 if (!IsSigned && !Subtarget.hasAVX512()) {
34278 SDValue Res =
34279 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34280 Results.push_back(Res);
34281 return;
34282 }
34283
34284 if (IsStrict)
34285          Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34286 else
34287 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34288
34289        // If we have VLX we can emit a target specific FP_TO_UINT node.
34290 if (!IsSigned && !Subtarget.hasVLX()) {
34291 // Otherwise we can defer to the generic legalizer which will widen
34292 // the input as well. This will be further widened during op
34293 // legalization to v8i32<-v8f64.
34294 // For strict nodes we'll need to widen ourselves.
34295 // FIXME: Fix the type legalizer to safely widen strict nodes?
34296 if (!IsStrict)
34297 return;
34298 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34299 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34300 Opc = N->getOpcode();
34301 }
34302 SDValue Res;
34303 SDValue Chain;
34304 if (IsStrict) {
34305 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34306 {N->getOperand(0), Src});
34307 Chain = Res.getValue(1);
34308 } else {
34309 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34310 }
34311 Results.push_back(Res);
34312 if (IsStrict)
34313 Results.push_back(Chain);
34314 return;
34315 }
34316
34317 // Custom widen strict v2f32->v2i32 by padding with zeros.
34318 // FIXME: Should generic type legalizer do this?
34319 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34320 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34321 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34322 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34323 {N->getOperand(0), Src});
34324 Results.push_back(Res);
34325 Results.push_back(Res.getValue(1));
34326 return;
34327 }
34328
34329 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34330 // so early out here.
34331 return;
34332 }
34333
34334 assert(!VT.isVector() && "Vectors should have been handled above!");
34335
34336 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34337 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34338 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34339 assert(!Subtarget.is64Bit() && "i64 should be legal");
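    // An i64 result is not legal on this 32-bit target, but the packed
    // AVX512DQ/FP16 converters can produce i64 elements, so round-trip the
    // scalar through a vector register.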
34340 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34341 // If we use a 128-bit result we might need to use a target specific node.
34342 unsigned SrcElts =
34343 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34344 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34345 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34346 if (NumElts != SrcElts) {
34347 if (IsStrict)
34348        Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34349 else
34350 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34351 }
34352
34353 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
34354 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34355 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34356 ZeroIdx);
34357 SDValue Chain;
34358 if (IsStrict) {
34359 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34360 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34361 Chain = Res.getValue(1);
34362 } else
34363 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34364 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34365 Results.push_back(Res);
34366 if (IsStrict)
34367 Results.push_back(Chain);
34368 return;
34369 }
34370
34371 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34372 SDValue Chain;
34373 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34374 Results.push_back(V);
34375 if (IsStrict)
34376 Results.push_back(Chain);
34377 return;
34378 }
34379
34380 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34381 Results.push_back(V);
34382 if (IsStrict)
34383 Results.push_back(Chain);
34384 }
34385 return;
34386 }
34387 case ISD::LRINT:
34388 if (N->getValueType(0) == MVT::v2i32) {
34389 SDValue Src = N->getOperand(0);
34390 if (Subtarget.hasFP16() && Src.getValueType() == MVT::v2f16) {
34391 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src,
34392 DAG.getUNDEF(MVT::v2f16));
34393 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Src,
34394 DAG.getUNDEF(MVT::v4f16));
34395 } else if (Src.getValueType() != MVT::v2f64) {
34396 return;
34397 }
34398 Results.push_back(DAG.getNode(X86ISD::CVTP2SI, dl, MVT::v4i32, Src));
34399 return;
34400 }
34401 [[fallthrough]];
34402 case ISD::LLRINT: {
34403 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34404 Results.push_back(V);
34405 return;
34406 }
34407
34408 case ISD::SINT_TO_FP:
34409  case ISD::STRICT_SINT_TO_FP:
34410 case ISD::UINT_TO_FP:
34411  case ISD::STRICT_UINT_TO_FP: {
34412 bool IsStrict = N->isStrictFPOpcode();
34413 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
34414 EVT VT = N->getValueType(0);
34415 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34416 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34417 Subtarget.hasVLX()) {
34418 if (Src.getValueType().getVectorElementType() == MVT::i16)
34419 return;
34420
34421 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34422 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34423 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34424 : DAG.getUNDEF(MVT::v2i32));
34425 if (IsStrict) {
34426 unsigned Opc =
34427            IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34428 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34429 {N->getOperand(0), Src});
34430 Results.push_back(Res);
34431 Results.push_back(Res.getValue(1));
34432 } else {
34433 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34434 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34435 }
34436 return;
34437 }
34438 if (VT != MVT::v2f32)
34439 return;
34440 EVT SrcVT = Src.getValueType();
34441 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34442 if (IsStrict) {
34443 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34444                                : X86ISD::STRICT_CVTUI2P;
34445 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34446 {N->getOperand(0), Src});
34447 Results.push_back(Res);
34448 Results.push_back(Res.getValue(1));
34449 } else {
34450 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34451 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34452 }
34453 return;
34454 }
34455 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34456 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
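      // No unsigned 64-bit converter is available here, so use the classic
      // halving trick: for inputs with the sign bit set, convert
      // (Src >> 1) | (Src & 1) as a signed value and double the result; other
      // inputs are converted directly. A final select picks per element.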
34457 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34458 SDValue One = DAG.getConstant(1, dl, SrcVT);
34459 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34460 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34461 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34462 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34463 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34464 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34465 for (int i = 0; i != 2; ++i) {
34466 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34467 SignSrc, DAG.getVectorIdxConstant(i, dl));
34468 if (IsStrict)
34469 SignCvts[i] =
34470 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34471 {N->getOperand(0), Elt});
34472 else
34473 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34474 };
34475 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34476 SDValue Slow, Chain;
34477 if (IsStrict) {
34478 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34479 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34480 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34481 {Chain, SignCvt, SignCvt});
34482 Chain = Slow.getValue(1);
34483 } else {
34484 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34485 }
34486 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34487 IsNeg =
34488 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34489 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34490 Results.push_back(Cvt);
34491 if (IsStrict)
34492 Results.push_back(Chain);
34493 return;
34494 }
34495
34496 if (SrcVT != MVT::v2i32)
34497 return;
34498
34499 if (IsSigned || Subtarget.hasAVX512()) {
34500 if (!IsStrict)
34501 return;
34502
34503 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34504 // FIXME: Should generic type legalizer do this?
34505 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34506 DAG.getConstant(0, dl, MVT::v2i32));
34507 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34508 {N->getOperand(0), Src});
34509 Results.push_back(Res);
34510 Results.push_back(Res.getValue(1));
34511 return;
34512 }
34513
34514 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
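    // Classic u32 -> f64 trick: OR the zero-extended value into the mantissa
    // of the double 2^52 (bit pattern 0x4330000000000000), giving 2^52 + x;
    // subtracting 2^52 then yields x exactly, which is rounded to f32 below.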
34515 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34516 SDValue VBias = DAG.getConstantFP(
34517 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34518 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34519 DAG.getBitcast(MVT::v2i64, VBias));
34520 Or = DAG.getBitcast(MVT::v2f64, Or);
34521 if (IsStrict) {
34522 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34523 {N->getOperand(0), Or, VBias});
34524      SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34525 {MVT::v4f32, MVT::Other},
34526 {Sub.getValue(1), Sub});
34527 Results.push_back(Res);
34528 Results.push_back(Res.getValue(1));
34529 } else {
34530 // TODO: Are there any fast-math-flags to propagate here?
34531 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34532 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34533 }
34534 return;
34535 }
34536  case ISD::STRICT_FP_ROUND:
34537 case ISD::FP_ROUND: {
34538 bool IsStrict = N->isStrictFPOpcode();
34539 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34540 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34541 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34542 EVT SrcVT = Src.getValueType();
34543 EVT VT = N->getValueType(0);
34544 SDValue V;
34545 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34546 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34547 : DAG.getUNDEF(MVT::v2f32);
34548 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34549 }
34550 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
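      // Without native FP16, produce the f16 result with F16C's CVTPS2PH,
      // which yields a v8i16 that is bitcast back to v8f16.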
34551 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34552 if (SrcVT.getVectorElementType() != MVT::f32)
34553 return;
34554
34555 if (IsStrict)
34556 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34557 {Chain, Src, Rnd});
34558 else
34559 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34560
34561 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34562 if (IsStrict)
34563 Results.push_back(V.getValue(1));
34564 return;
34565 }
34566 if (!isTypeLegal(Src.getValueType()))
34567 return;
34568 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34569 if (IsStrict)
34570 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34571 {Chain, Src});
34572 else
34573 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34574 Results.push_back(V);
34575 if (IsStrict)
34576 Results.push_back(V.getValue(1));
34577 return;
34578 }
34579 case ISD::FP_EXTEND:
34580 case ISD::STRICT_FP_EXTEND: {
34581 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34582 // No other ValueType for FP_EXTEND should reach this point.
34583 assert(N->getValueType(0) == MVT::v2f32 &&
34584 "Do not know how to legalize this Node");
34585 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34586 return;
34587 bool IsStrict = N->isStrictFPOpcode();
34588 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34589 if (Src.getValueType().getVectorElementType() != MVT::f16)
34590 return;
34591 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34592 : DAG.getUNDEF(MVT::v2f16);
34593 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34594 if (IsStrict)
34595 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34596 {N->getOperand(0), V});
34597 else
34598 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34599 Results.push_back(V);
34600 if (IsStrict)
34601 Results.push_back(V.getValue(1));
34602 return;
34603 }
34604  case ISD::INTRINSIC_W_CHAIN: {
34605 unsigned IntNo = N->getConstantOperandVal(1);
34606 switch (IntNo) {
34607 default : llvm_unreachable("Do not know how to custom type "
34608 "legalize this intrinsic operation!");
34609 case Intrinsic::x86_rdtsc:
34610 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34611 Results);
34612 case Intrinsic::x86_rdtscp:
34613 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34614 Results);
34615 case Intrinsic::x86_rdpmc:
34616 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34617 Results);
34618 return;
34619 case Intrinsic::x86_rdpru:
34620 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34621 Results);
34622 return;
34623 case Intrinsic::x86_xgetbv:
34624 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34625 Results);
34626 return;
34627 }
34628 }
34629 case ISD::READCYCLECOUNTER: {
34630 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34631 }
34632 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34633 EVT T = N->getValueType(0);
34634 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34635 bool Regs64bit = T == MVT::i128;
34636 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34637 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34638 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
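    // CMPXCHG8B/16B convention: expected value in EDX:EAX (RDX:RAX), desired
    // value in ECX:EBX (RCX:RBX); the old value comes back in EDX:EAX and ZF
    // reports success.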
34639 SDValue cpInL, cpInH;
34640 std::tie(cpInL, cpInH) =
34641 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34642 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34643 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34644 cpInH =
34645 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34646 cpInH, cpInL.getValue(1));
34647 SDValue swapInL, swapInH;
34648 std::tie(swapInL, swapInH) =
34649 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34650 swapInH =
34651 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34652 swapInH, cpInH.getValue(1));
34653
34654 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34655 // until later. So we keep the RBX input in a vreg and use a custom
34656 // inserter.
34657    // Since RBX will be a reserved register, the register allocator will not
34658    // make sure its value is properly saved and restored around this
34659    // live-range.
34660 SDValue Result;
34661 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34662 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34663 if (Regs64bit) {
34664 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34665 swapInH.getValue(1)};
34666 Result =
34667 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34668 } else {
34669 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34670 swapInH.getValue(1));
34671 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34672 swapInL.getValue(1)};
34673 Result =
34674 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34675 }
34676
34677 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34678 Regs64bit ? X86::RAX : X86::EAX,
34679 HalfT, Result.getValue(1));
34680 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34681 Regs64bit ? X86::RDX : X86::EDX,
34682 HalfT, cpOutL.getValue(2));
34683 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34684
34685 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34686 MVT::i32, cpOutH.getValue(2));
34687 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34688 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34689
34690 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34691 Results.push_back(Success);
34692 Results.push_back(EFLAGS.getValue(1));
34693 return;
34694 }
34695 case ISD::ATOMIC_LOAD: {
34696 assert(
34697 (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
34698 "Unexpected VT!");
34699 bool NoImplicitFloatOps =
34700        DAG.getMachineFunction().getFunction().hasFnAttribute(
34701 Attribute::NoImplicitFloat);
34702 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34703 auto *Node = cast<AtomicSDNode>(N);
34704
34705 if (N->getValueType(0) == MVT::i128) {
34706 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
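          // With AVX an aligned 16-byte vector load is treated as a single
          // atomic access, so load the i128 as v2i64 and split it into two
          // i64 halves.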
34707 SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
34708 Node->getBasePtr(), Node->getMemOperand());
34709 SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34710 DAG.getVectorIdxConstant(0, dl));
34711 SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34712 DAG.getVectorIdxConstant(1, dl));
34713 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
34714 {ResL, ResH}));
34715 Results.push_back(Ld.getValue(1));
34716 return;
34717 }
34718 break;
34719 }
34720 if (Subtarget.hasSSE1()) {
34721 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34722 // Then extract the lower 64-bits.
34723 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34724 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34725 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34726        SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34727 MVT::i64, Node->getMemOperand());
34728 if (Subtarget.hasSSE2()) {
34729 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34730 DAG.getVectorIdxConstant(0, dl));
34731 Results.push_back(Res);
34732 Results.push_back(Ld.getValue(1));
34733 return;
34734 }
34735 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34736 // then casts to i64. This avoids a 128-bit stack temporary being
34737 // created by type legalization if we were to cast v4f32->v2i64.
34738 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34739 DAG.getVectorIdxConstant(0, dl));
34740 Res = DAG.getBitcast(MVT::i64, Res);
34741 Results.push_back(Res);
34742 Results.push_back(Ld.getValue(1));
34743 return;
34744 }
34745 if (Subtarget.hasX87()) {
34746 // First load this into an 80-bit X87 register. This will put the whole
34747 // integer into the significand.
34748 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34749 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34750        SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34751 dl, Tys, Ops, MVT::i64,
34752 Node->getMemOperand());
34753 SDValue Chain = Result.getValue(1);
34754
34755 // Now store the X87 register to a stack temporary and convert to i64.
34756 // This store is not atomic and doesn't need to be.
34757 // FIXME: We don't need a stack temporary if the result of the load
34758 // is already being stored. We could just directly store there.
34759 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34760 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34761 MachinePointerInfo MPI =
34762          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34763 SDValue StoreOps[] = { Chain, Result, StackPtr };
34764 Chain = DAG.getMemIntrinsicNode(
34765 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34766 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34767
34768 // Finally load the value back from the stack temporary and return it.
34769 // This load is not atomic and doesn't need to be.
34770 // This load will be further type legalized.
34771 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34772 Results.push_back(Result);
34773 Results.push_back(Result.getValue(1));
34774 return;
34775 }
34776 }
34777 // TODO: Use MOVLPS when SSE1 is available?
34778 // Delegate to generic TypeLegalization. Situations we can really handle
34779 // should have already been dealt with by AtomicExpandPass.cpp.
34780 break;
34781 }
34782 case ISD::ATOMIC_SWAP:
34783 case ISD::ATOMIC_LOAD_ADD:
34784 case ISD::ATOMIC_LOAD_SUB:
34785 case ISD::ATOMIC_LOAD_AND:
34786 case ISD::ATOMIC_LOAD_OR:
34787 case ISD::ATOMIC_LOAD_XOR:
34788 case ISD::ATOMIC_LOAD_NAND:
34789 case ISD::ATOMIC_LOAD_MIN:
34790 case ISD::ATOMIC_LOAD_MAX:
34791 case ISD::ATOMIC_LOAD_UMIN:
34792 case ISD::ATOMIC_LOAD_UMAX:
34793 // Delegate to generic TypeLegalization. Situations we can really handle
34794 // should have already been dealt with by AtomicExpandPass.cpp.
34795 break;
34796
34797 case ISD::BITCAST: {
34798 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34799 EVT DstVT = N->getValueType(0);
34800 EVT SrcVT = N->getOperand(0).getValueType();
34801
34802 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34803 // we can split using the k-register rather than memory.
34804 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34805 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34806 SDValue Lo, Hi;
34807 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34808 Lo = DAG.getBitcast(MVT::i32, Lo);
34809 Hi = DAG.getBitcast(MVT::i32, Hi);
34810 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34811 Results.push_back(Res);
34812 return;
34813 }
34814
34815 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34816 // FIXME: Use v4f32 for SSE1?
34817 assert(Subtarget.hasSSE2() && "Requires SSE2");
34818 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34819 "Unexpected type action!");
34820 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34821 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34822 N->getOperand(0));
34823 Res = DAG.getBitcast(WideVT, Res);
34824 Results.push_back(Res);
34825 return;
34826 }
34827
34828 return;
34829 }
34830 case ISD::MGATHER: {
34831 EVT VT = N->getValueType(0);
34832 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34833 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
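      // Widen the v2 gather to the v4 form the instruction supports; the
      // memory VT stays v2, so only the low two mask lanes are ever used.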
34834 auto *Gather = cast<MaskedGatherSDNode>(N);
34835 SDValue Index = Gather->getIndex();
34836 if (Index.getValueType() != MVT::v2i64)
34837 return;
34838      assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34839 "Unexpected type action!");
34840 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34841 SDValue Mask = Gather->getMask();
34842 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34843 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34844 Gather->getPassThru(),
34845 DAG.getUNDEF(VT));
34846 if (!Subtarget.hasVLX()) {
34847 // We need to widen the mask, but the instruction will only use 2
34848 // of its elements. So we can use undef.
34849 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34850 DAG.getUNDEF(MVT::v2i1));
34851 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34852 }
34853 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34854 Gather->getBasePtr(), Index, Gather->getScale() };
34855 SDValue Res = DAG.getMemIntrinsicNode(
34856 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34857 Gather->getMemoryVT(), Gather->getMemOperand());
34858 Results.push_back(Res);
34859 Results.push_back(Res.getValue(1));
34860 return;
34861 }
34862 return;
34863 }
34864 case ISD::LOAD: {
34865 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34866    // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34867 // cast since type legalization will try to use an i64 load.
34868 MVT VT = N->getSimpleValueType(0);
34869 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34870    assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34871 "Unexpected type action!");
34872 if (!ISD::isNON_EXTLoad(N))
34873 return;
34874 auto *Ld = cast<LoadSDNode>(N);
34875 if (Subtarget.hasSSE2()) {
34876 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34877 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34878 Ld->getPointerInfo(), Ld->getBaseAlign(),
34879 Ld->getMemOperand()->getFlags());
34880 SDValue Chain = Res.getValue(1);
34881 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34882 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34883 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34884 Res = DAG.getBitcast(WideVT, Res);
34885 Results.push_back(Res);
34886 Results.push_back(Chain);
34887 return;
34888 }
34889 assert(Subtarget.hasSSE1() && "Expected SSE");
34890 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34891 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34892    SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34893 MVT::i64, Ld->getMemOperand());
34894 Results.push_back(Res);
34895 Results.push_back(Res.getValue(1));
34896 return;
34897 }
34898 case ISD::ADDRSPACECAST: {
34899 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34900 Results.push_back(V);
34901 return;
34902 }
34903 case ISD::BITREVERSE: {
34904 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34905 assert((Subtarget.hasXOP() || Subtarget.hasGFNI()) && "Expected XOP/GFNI");
34906 // We can use VPPERM/GF2P8AFFINEQB by copying to a vector register and back.
34907 // We'll need to move the scalar in two i32 pieces.
34908 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34909 return;
34910 }
34911  case ISD::EXTRACT_VECTOR_ELT: {
34912 // f16 = extract vXf16 %vec, i64 %idx
34913 assert(N->getSimpleValueType(0) == MVT::f16 &&
34914 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34915 assert(Subtarget.hasFP16() && "Expected FP16");
34916 SDValue VecOp = N->getOperand(0);
34917    EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34918 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34919 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34920 N->getOperand(1));
34921 Split = DAG.getBitcast(MVT::f16, Split);
34922 Results.push_back(Split);
34923 return;
34924 }
34925 }
34926}
34927
34928const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34929 switch ((X86ISD::NodeType)Opcode) {
34930 case X86ISD::FIRST_NUMBER: break;
34931#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34932 NODE_NAME_CASE(BSF)
34933 NODE_NAME_CASE(BSR)
34934 NODE_NAME_CASE(FSHL)
34935 NODE_NAME_CASE(FSHR)
34936 NODE_NAME_CASE(FAND)
34937 NODE_NAME_CASE(FANDN)
34938 NODE_NAME_CASE(FOR)
34939 NODE_NAME_CASE(FXOR)
34940 NODE_NAME_CASE(FILD)
34941 NODE_NAME_CASE(FIST)
34942 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34943 NODE_NAME_CASE(FLD)
34944 NODE_NAME_CASE(FST)
34945 NODE_NAME_CASE(CALL)
34946 NODE_NAME_CASE(CALL_RVMARKER)
34947 NODE_NAME_CASE(IMP_CALL)
34948  NODE_NAME_CASE(BT)
34949 NODE_NAME_CASE(CMP)
34950 NODE_NAME_CASE(FCMP)
34951 NODE_NAME_CASE(STRICT_FCMP)
34952 NODE_NAME_CASE(STRICT_FCMPS)
34953  NODE_NAME_CASE(COMI)
34954 NODE_NAME_CASE(UCOMI)
34955 NODE_NAME_CASE(COMX)
34956 NODE_NAME_CASE(UCOMX)
34957 NODE_NAME_CASE(CMPM)
34958 NODE_NAME_CASE(CMPMM)
34959 NODE_NAME_CASE(STRICT_CMPM)
34960 NODE_NAME_CASE(CMPMM_SAE)
34961 NODE_NAME_CASE(SETCC)
34962 NODE_NAME_CASE(SETCC_CARRY)
34963 NODE_NAME_CASE(FSETCC)
34964 NODE_NAME_CASE(FSETCCM)
34965 NODE_NAME_CASE(FSETCCM_SAE)
34966 NODE_NAME_CASE(CMOV)
34967 NODE_NAME_CASE(BRCOND)
34968 NODE_NAME_CASE(RET_GLUE)
34969 NODE_NAME_CASE(IRET)
34970 NODE_NAME_CASE(REP_STOS)
34971 NODE_NAME_CASE(REP_MOVS)
34972 NODE_NAME_CASE(GlobalBaseReg)
34973  NODE_NAME_CASE(Wrapper)
34974 NODE_NAME_CASE(WrapperRIP)
34975 NODE_NAME_CASE(MOVQ2DQ)
34976 NODE_NAME_CASE(MOVDQ2Q)
34977 NODE_NAME_CASE(MMX_MOVD2W)
34978 NODE_NAME_CASE(MMX_MOVW2D)
34979 NODE_NAME_CASE(PEXTRB)
34980 NODE_NAME_CASE(PEXTRW)
34981 NODE_NAME_CASE(INSERTPS)
34982 NODE_NAME_CASE(PINSRB)
34983 NODE_NAME_CASE(PINSRW)
34984 NODE_NAME_CASE(PSHUFB)
34985 NODE_NAME_CASE(ANDNP)
34986 NODE_NAME_CASE(BLENDI)
34987  NODE_NAME_CASE(BLENDV)
34988 NODE_NAME_CASE(HADD)
34989 NODE_NAME_CASE(HSUB)
34990 NODE_NAME_CASE(FHADD)
34991 NODE_NAME_CASE(FHSUB)
34992 NODE_NAME_CASE(CONFLICT)
34993 NODE_NAME_CASE(FMAX)
34994 NODE_NAME_CASE(FMAXS)
34995 NODE_NAME_CASE(FMAX_SAE)
34996 NODE_NAME_CASE(FMAXS_SAE)
34997 NODE_NAME_CASE(STRICT_FMAX)
34998 NODE_NAME_CASE(FMIN)
34999 NODE_NAME_CASE(FMINS)
35000 NODE_NAME_CASE(FMIN_SAE)
35001 NODE_NAME_CASE(FMINS_SAE)
35002 NODE_NAME_CASE(STRICT_FMIN)
35003 NODE_NAME_CASE(FMAXC)
35004 NODE_NAME_CASE(FMINC)
35005 NODE_NAME_CASE(FRSQRT)
35006 NODE_NAME_CASE(FRCP)
35007 NODE_NAME_CASE(EXTRQI)
35008 NODE_NAME_CASE(INSERTQI)
35009 NODE_NAME_CASE(TLSADDR)
35010 NODE_NAME_CASE(TLSBASEADDR)
35011 NODE_NAME_CASE(TLSCALL)
35012 NODE_NAME_CASE(TLSDESC)
35013 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35014 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35015 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35016 NODE_NAME_CASE(EH_RETURN)
35017 NODE_NAME_CASE(TC_RETURN)
35018 NODE_NAME_CASE(FNSTCW16m)
35019 NODE_NAME_CASE(FLDCW16m)
35020 NODE_NAME_CASE(FNSTENVm)
35021 NODE_NAME_CASE(FLDENVm)
35022 NODE_NAME_CASE(LCMPXCHG_DAG)
35023 NODE_NAME_CASE(LCMPXCHG8_DAG)
35024 NODE_NAME_CASE(LCMPXCHG16_DAG)
35025 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35026 NODE_NAME_CASE(LADD)
35027 NODE_NAME_CASE(LSUB)
35028 NODE_NAME_CASE(LOR)
35029 NODE_NAME_CASE(LXOR)
35030 NODE_NAME_CASE(LAND)
35031 NODE_NAME_CASE(LBTS)
35032 NODE_NAME_CASE(LBTC)
35033 NODE_NAME_CASE(LBTR)
35034 NODE_NAME_CASE(LBTS_RM)
35035 NODE_NAME_CASE(LBTC_RM)
35036 NODE_NAME_CASE(LBTR_RM)
35037 NODE_NAME_CASE(AADD)
35038 NODE_NAME_CASE(AOR)
35039 NODE_NAME_CASE(AXOR)
35040 NODE_NAME_CASE(AAND)
35041 NODE_NAME_CASE(VZEXT_MOVL)
35042 NODE_NAME_CASE(VZEXT_LOAD)
35043 NODE_NAME_CASE(VEXTRACT_STORE)
35044 NODE_NAME_CASE(VTRUNC)
35045 NODE_NAME_CASE(VTRUNCS)
35046 NODE_NAME_CASE(VTRUNCUS)
35047 NODE_NAME_CASE(VMTRUNC)
35048 NODE_NAME_CASE(VMTRUNCS)
35049 NODE_NAME_CASE(VMTRUNCUS)
35050 NODE_NAME_CASE(VTRUNCSTORES)
35051 NODE_NAME_CASE(VTRUNCSTOREUS)
35052 NODE_NAME_CASE(VMTRUNCSTORES)
35053 NODE_NAME_CASE(VMTRUNCSTOREUS)
35054 NODE_NAME_CASE(VFPEXT)
35055 NODE_NAME_CASE(STRICT_VFPEXT)
35056 NODE_NAME_CASE(VFPEXT_SAE)
35057 NODE_NAME_CASE(VFPEXTS)
35058 NODE_NAME_CASE(VFPEXTS_SAE)
35059 NODE_NAME_CASE(VFPROUND)
35060 NODE_NAME_CASE(VFPROUND2)
35061 NODE_NAME_CASE(VFPROUND2_RND)
35062 NODE_NAME_CASE(STRICT_VFPROUND)
35063 NODE_NAME_CASE(VMFPROUND)
35064 NODE_NAME_CASE(VFPROUND_RND)
35065 NODE_NAME_CASE(VFPROUNDS)
35066 NODE_NAME_CASE(VFPROUNDS_RND)
35067 NODE_NAME_CASE(VSHLDQ)
35068 NODE_NAME_CASE(VSRLDQ)
35069 NODE_NAME_CASE(VSHL)
35070 NODE_NAME_CASE(VSRL)
35071 NODE_NAME_CASE(VSRA)
35072 NODE_NAME_CASE(VSHLI)
35073 NODE_NAME_CASE(VSRLI)
35074 NODE_NAME_CASE(VSRAI)
35075 NODE_NAME_CASE(VSHLV)
35076 NODE_NAME_CASE(VSRLV)
35077 NODE_NAME_CASE(VSRAV)
35078 NODE_NAME_CASE(VROTLI)
35079 NODE_NAME_CASE(VROTRI)
35080 NODE_NAME_CASE(VPPERM)
35081 NODE_NAME_CASE(CMPP)
35082 NODE_NAME_CASE(STRICT_CMPP)
35083 NODE_NAME_CASE(PCMPEQ)
35084 NODE_NAME_CASE(PCMPGT)
35085 NODE_NAME_CASE(PHMINPOS)
35086 NODE_NAME_CASE(ADD)
35087 NODE_NAME_CASE(SUB)
35088 NODE_NAME_CASE(ADC)
35089 NODE_NAME_CASE(SBB)
35090 NODE_NAME_CASE(SMUL)
35091 NODE_NAME_CASE(UMUL)
35092 NODE_NAME_CASE(OR)
35093 NODE_NAME_CASE(XOR)
35094 NODE_NAME_CASE(AND)
35095 NODE_NAME_CASE(BEXTR)
35096  NODE_NAME_CASE(BEXTRI)
35097 NODE_NAME_CASE(BZHI)
35098 NODE_NAME_CASE(PDEP)
35099 NODE_NAME_CASE(PEXT)
35100 NODE_NAME_CASE(MUL_IMM)
35101 NODE_NAME_CASE(MOVMSK)
35102 NODE_NAME_CASE(PTEST)
35103 NODE_NAME_CASE(TESTP)
35104 NODE_NAME_CASE(KORTEST)
35105 NODE_NAME_CASE(KTEST)
35106 NODE_NAME_CASE(KADD)
35107 NODE_NAME_CASE(KSHIFTL)
35108 NODE_NAME_CASE(KSHIFTR)
35109 NODE_NAME_CASE(PACKSS)
35110 NODE_NAME_CASE(PACKUS)
35111 NODE_NAME_CASE(PALIGNR)
35112 NODE_NAME_CASE(VALIGN)
35113 NODE_NAME_CASE(VSHLD)
35114 NODE_NAME_CASE(VSHRD)
35115 NODE_NAME_CASE(PSHUFD)
35116 NODE_NAME_CASE(PSHUFHW)
35117 NODE_NAME_CASE(PSHUFLW)
35118 NODE_NAME_CASE(SHUFP)
35119 NODE_NAME_CASE(SHUF128)
35120 NODE_NAME_CASE(MOVLHPS)
35121 NODE_NAME_CASE(MOVHLPS)
35122 NODE_NAME_CASE(MOVDDUP)
35123 NODE_NAME_CASE(MOVSHDUP)
35124 NODE_NAME_CASE(MOVSLDUP)
35125 NODE_NAME_CASE(MOVSD)
35126 NODE_NAME_CASE(MOVSS)
35127 NODE_NAME_CASE(MOVSH)
35128 NODE_NAME_CASE(UNPCKL)
35129 NODE_NAME_CASE(UNPCKH)
35130 NODE_NAME_CASE(VBROADCAST)
35131 NODE_NAME_CASE(VBROADCAST_LOAD)
35132 NODE_NAME_CASE(VBROADCASTM)
35133 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35134 NODE_NAME_CASE(VPERMILPV)
35135 NODE_NAME_CASE(VPERMILPI)
35136 NODE_NAME_CASE(VPERM2X128)
35137 NODE_NAME_CASE(VPERMV)
35138 NODE_NAME_CASE(VPERMV3)
35139 NODE_NAME_CASE(VPERMI)
35140 NODE_NAME_CASE(VPTERNLOG)
35141 NODE_NAME_CASE(FP_TO_SINT_SAT)
35142 NODE_NAME_CASE(FP_TO_UINT_SAT)
35143 NODE_NAME_CASE(VFIXUPIMM)
35144 NODE_NAME_CASE(VFIXUPIMM_SAE)
35145 NODE_NAME_CASE(VFIXUPIMMS)
35146 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35147 NODE_NAME_CASE(VRANGE)
35148 NODE_NAME_CASE(VRANGE_SAE)
35149 NODE_NAME_CASE(VRANGES)
35150 NODE_NAME_CASE(VRANGES_SAE)
35151 NODE_NAME_CASE(PMULUDQ)
35152 NODE_NAME_CASE(PMULDQ)
35153 NODE_NAME_CASE(PSADBW)
35154 NODE_NAME_CASE(DBPSADBW)
35155 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35156 NODE_NAME_CASE(VAARG_64)
35157 NODE_NAME_CASE(VAARG_X32)
35158 NODE_NAME_CASE(DYN_ALLOCA)
35159 NODE_NAME_CASE(MFENCE)
35160 NODE_NAME_CASE(SEG_ALLOCA)
35161 NODE_NAME_CASE(PROBED_ALLOCA)
35162  NODE_NAME_CASE(RDRAND)
35163  NODE_NAME_CASE(RDSEED)
35164 NODE_NAME_CASE(RDPKRU)
35165 NODE_NAME_CASE(WRPKRU)
35166 NODE_NAME_CASE(VPMADDUBSW)
35167 NODE_NAME_CASE(VPMADDWD)
35168 NODE_NAME_CASE(VPSHA)
35169 NODE_NAME_CASE(VPSHL)
35170 NODE_NAME_CASE(VPCOM)
35171 NODE_NAME_CASE(VPCOMU)
35172 NODE_NAME_CASE(VPERMIL2)
35173  NODE_NAME_CASE(FMSUB)
35174 NODE_NAME_CASE(STRICT_FMSUB)
35175  NODE_NAME_CASE(FNMADD)
35176 NODE_NAME_CASE(STRICT_FNMADD)
35177  NODE_NAME_CASE(FNMSUB)
35178 NODE_NAME_CASE(STRICT_FNMSUB)
35179 NODE_NAME_CASE(FMADDSUB)
35180 NODE_NAME_CASE(FMSUBADD)
35181 NODE_NAME_CASE(FMADD_RND)
35182 NODE_NAME_CASE(FNMADD_RND)
35183 NODE_NAME_CASE(FMSUB_RND)
35184 NODE_NAME_CASE(FNMSUB_RND)
35185 NODE_NAME_CASE(FMADDSUB_RND)
35186 NODE_NAME_CASE(FMSUBADD_RND)
35187 NODE_NAME_CASE(VFMADDC)
35188 NODE_NAME_CASE(VFMADDC_RND)
35189 NODE_NAME_CASE(VFCMADDC)
35190 NODE_NAME_CASE(VFCMADDC_RND)
35191 NODE_NAME_CASE(VFMULC)
35192 NODE_NAME_CASE(VFMULC_RND)
35193 NODE_NAME_CASE(VFCMULC)
35194 NODE_NAME_CASE(VFCMULC_RND)
35195 NODE_NAME_CASE(VFMULCSH)
35196 NODE_NAME_CASE(VFMULCSH_RND)
35197 NODE_NAME_CASE(VFCMULCSH)
35198 NODE_NAME_CASE(VFCMULCSH_RND)
35199 NODE_NAME_CASE(VFMADDCSH)
35200 NODE_NAME_CASE(VFMADDCSH_RND)
35201 NODE_NAME_CASE(VFCMADDCSH)
35202 NODE_NAME_CASE(VFCMADDCSH_RND)
35203 NODE_NAME_CASE(VPMADD52H)
35204 NODE_NAME_CASE(VPMADD52L)
35205 NODE_NAME_CASE(VRNDSCALE)
35206 NODE_NAME_CASE(STRICT_VRNDSCALE)
35207 NODE_NAME_CASE(VRNDSCALE_SAE)
35208 NODE_NAME_CASE(VRNDSCALES)
35209 NODE_NAME_CASE(VRNDSCALES_SAE)
35210 NODE_NAME_CASE(VREDUCE)
35211 NODE_NAME_CASE(VREDUCE_SAE)
35212 NODE_NAME_CASE(VREDUCES)
35213 NODE_NAME_CASE(VREDUCES_SAE)
35214 NODE_NAME_CASE(VGETMANT)
35215 NODE_NAME_CASE(VGETMANT_SAE)
35216 NODE_NAME_CASE(VGETMANTS)
35217 NODE_NAME_CASE(VGETMANTS_SAE)
35218 NODE_NAME_CASE(PCMPESTR)
35219 NODE_NAME_CASE(PCMPISTR)
35220  NODE_NAME_CASE(XTEST)
35221 NODE_NAME_CASE(COMPRESS)
35222  NODE_NAME_CASE(EXPAND)
35223 NODE_NAME_CASE(SELECTS)
35224 NODE_NAME_CASE(ADDSUB)
35225 NODE_NAME_CASE(RCP14)
35226 NODE_NAME_CASE(RCP14S)
35227 NODE_NAME_CASE(RSQRT14)
35228 NODE_NAME_CASE(RSQRT14S)
35229 NODE_NAME_CASE(FADD_RND)
35230 NODE_NAME_CASE(FADDS)
35231 NODE_NAME_CASE(FADDS_RND)
35232 NODE_NAME_CASE(FSUB_RND)
35233 NODE_NAME_CASE(FSUBS)
35234 NODE_NAME_CASE(FSUBS_RND)
35235 NODE_NAME_CASE(FMUL_RND)
35236 NODE_NAME_CASE(FMULS)
35237 NODE_NAME_CASE(FMULS_RND)
35238 NODE_NAME_CASE(FDIV_RND)
35239 NODE_NAME_CASE(FDIVS)
35240 NODE_NAME_CASE(FDIVS_RND)
35241 NODE_NAME_CASE(FSQRT_RND)
35242 NODE_NAME_CASE(FSQRTS)
35243 NODE_NAME_CASE(FSQRTS_RND)
35244 NODE_NAME_CASE(FGETEXP)
35245 NODE_NAME_CASE(FGETEXP_SAE)
35246 NODE_NAME_CASE(FGETEXPS)
35247 NODE_NAME_CASE(FGETEXPS_SAE)
35248 NODE_NAME_CASE(SCALEF)
35249 NODE_NAME_CASE(SCALEF_RND)
35250 NODE_NAME_CASE(SCALEFS)
35251 NODE_NAME_CASE(SCALEFS_RND)
35252 NODE_NAME_CASE(MULHRS)
35253 NODE_NAME_CASE(SINT_TO_FP_RND)
35254 NODE_NAME_CASE(UINT_TO_FP_RND)
35255 NODE_NAME_CASE(CVTTP2SI)
35256 NODE_NAME_CASE(CVTTP2UI)
35257 NODE_NAME_CASE(STRICT_CVTTP2SI)
35258 NODE_NAME_CASE(STRICT_CVTTP2UI)
35259 NODE_NAME_CASE(MCVTTP2SI)
35260 NODE_NAME_CASE(MCVTTP2UI)
35261 NODE_NAME_CASE(CVTTP2SI_SAE)
35262 NODE_NAME_CASE(CVTTP2UI_SAE)
35263 NODE_NAME_CASE(CVTTS2SI)
35264 NODE_NAME_CASE(CVTTS2UI)
35265 NODE_NAME_CASE(CVTTS2SI_SAE)
35266 NODE_NAME_CASE(CVTTS2UI_SAE)
35267 NODE_NAME_CASE(CVTSI2P)
35268 NODE_NAME_CASE(CVTUI2P)
35269 NODE_NAME_CASE(STRICT_CVTSI2P)
35270 NODE_NAME_CASE(STRICT_CVTUI2P)
35271 NODE_NAME_CASE(MCVTSI2P)
35272 NODE_NAME_CASE(MCVTUI2P)
35273 NODE_NAME_CASE(VFPCLASS)
35274 NODE_NAME_CASE(VFPCLASSS)
35275 NODE_NAME_CASE(MULTISHIFT)
35276 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35277 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35278 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35279 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35280 NODE_NAME_CASE(CVTPS2PH)
35281 NODE_NAME_CASE(STRICT_CVTPS2PH)
35282 NODE_NAME_CASE(CVTPS2PH_SAE)
35283 NODE_NAME_CASE(MCVTPS2PH)
35284 NODE_NAME_CASE(MCVTPS2PH_SAE)
35285 NODE_NAME_CASE(CVTPH2PS)
35286 NODE_NAME_CASE(STRICT_CVTPH2PS)
35287 NODE_NAME_CASE(CVTPH2PS_SAE)
35288 NODE_NAME_CASE(CVTP2SI)
35289 NODE_NAME_CASE(CVTP2UI)
35290 NODE_NAME_CASE(MCVTP2SI)
35291 NODE_NAME_CASE(MCVTP2UI)
35292 NODE_NAME_CASE(CVTP2SI_RND)
35293 NODE_NAME_CASE(CVTP2UI_RND)
35294 NODE_NAME_CASE(CVTS2SI)
35295 NODE_NAME_CASE(CVTS2UI)
35296 NODE_NAME_CASE(CVTS2SI_RND)
35297 NODE_NAME_CASE(CVTS2UI_RND)
35298 NODE_NAME_CASE(CVTNEPS2BF16)
35299 NODE_NAME_CASE(MCVTNEPS2BF16)
35300 NODE_NAME_CASE(DPBF16PS)
35301 NODE_NAME_CASE(DPFP16PS)
35302 NODE_NAME_CASE(MPSADBW)
35303 NODE_NAME_CASE(LWPINS)
35304 NODE_NAME_CASE(MGATHER)
35305 NODE_NAME_CASE(MSCATTER)
35306 NODE_NAME_CASE(VPDPBUSD)
35307 NODE_NAME_CASE(VPDPBUSDS)
35308 NODE_NAME_CASE(VPDPWSSD)
35309 NODE_NAME_CASE(VPDPWSSDS)
35310 NODE_NAME_CASE(VPSHUFBITQMB)
35311 NODE_NAME_CASE(GF2P8MULB)
35312 NODE_NAME_CASE(GF2P8AFFINEQB)
35313 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35314 NODE_NAME_CASE(NT_CALL)
35315 NODE_NAME_CASE(NT_BRIND)
35316 NODE_NAME_CASE(UMWAIT)
35317 NODE_NAME_CASE(TPAUSE)
35318 NODE_NAME_CASE(ENQCMD)
35319 NODE_NAME_CASE(ENQCMDS)
35320 NODE_NAME_CASE(VP2INTERSECT)
35321 NODE_NAME_CASE(VPDPBSUD)
35322 NODE_NAME_CASE(VPDPBSUDS)
35323 NODE_NAME_CASE(VPDPBUUD)
35324 NODE_NAME_CASE(VPDPBUUDS)
35325 NODE_NAME_CASE(VPDPBSSD)
35326 NODE_NAME_CASE(VPDPBSSDS)
35327 NODE_NAME_CASE(VPDPWSUD)
35328 NODE_NAME_CASE(VPDPWSUDS)
35329 NODE_NAME_CASE(VPDPWUSD)
35330 NODE_NAME_CASE(VPDPWUSDS)
35331 NODE_NAME_CASE(VPDPWUUD)
35332 NODE_NAME_CASE(VPDPWUUDS)
35333 NODE_NAME_CASE(VMINMAX)
35334 NODE_NAME_CASE(VMINMAX_SAE)
35335 NODE_NAME_CASE(VMINMAXS)
35336 NODE_NAME_CASE(VMINMAXS_SAE)
35337 NODE_NAME_CASE(CVTP2IBS)
35338 NODE_NAME_CASE(CVTP2IUBS)
35339 NODE_NAME_CASE(CVTP2IBS_RND)
35340 NODE_NAME_CASE(CVTP2IUBS_RND)
35341 NODE_NAME_CASE(CVTTP2IBS)
35342 NODE_NAME_CASE(CVTTP2IUBS)
35343 NODE_NAME_CASE(CVTTP2IBS_SAE)
35344 NODE_NAME_CASE(CVTTP2IUBS_SAE)
35345 NODE_NAME_CASE(VCVT2PH2BF8)
35346 NODE_NAME_CASE(VCVT2PH2BF8S)
35347 NODE_NAME_CASE(VCVT2PH2HF8)
35348 NODE_NAME_CASE(VCVT2PH2HF8S)
35349 NODE_NAME_CASE(VCVTBIASPH2BF8)
35350 NODE_NAME_CASE(VCVTBIASPH2BF8S)
35351 NODE_NAME_CASE(VCVTBIASPH2HF8)
35352 NODE_NAME_CASE(VCVTBIASPH2HF8S)
35353 NODE_NAME_CASE(VCVTPH2BF8)
35354 NODE_NAME_CASE(VCVTPH2BF8S)
35355 NODE_NAME_CASE(VCVTPH2HF8)
35356 NODE_NAME_CASE(VCVTPH2HF8S)
35357 NODE_NAME_CASE(VMCVTBIASPH2BF8)
35358 NODE_NAME_CASE(VMCVTBIASPH2BF8S)
35359 NODE_NAME_CASE(VMCVTBIASPH2HF8)
35360 NODE_NAME_CASE(VMCVTBIASPH2HF8S)
35361 NODE_NAME_CASE(VMCVTPH2BF8)
35362 NODE_NAME_CASE(VMCVTPH2BF8S)
35363 NODE_NAME_CASE(VMCVTPH2HF8)
35364 NODE_NAME_CASE(VMCVTPH2HF8S)
35365 NODE_NAME_CASE(VCVTHF82PH)
35366 NODE_NAME_CASE(AESENC128KL)
35367 NODE_NAME_CASE(AESDEC128KL)
35368 NODE_NAME_CASE(AESENC256KL)
35369 NODE_NAME_CASE(AESDEC256KL)
35370 NODE_NAME_CASE(AESENCWIDE128KL)
35371 NODE_NAME_CASE(AESDECWIDE128KL)
35372 NODE_NAME_CASE(AESENCWIDE256KL)
35373 NODE_NAME_CASE(AESDECWIDE256KL)
35374 NODE_NAME_CASE(CMPCCXADD)
35375 NODE_NAME_CASE(TESTUI)
35376 NODE_NAME_CASE(FP80_ADD)
35377 NODE_NAME_CASE(STRICT_FP80_ADD)
35378 NODE_NAME_CASE(CCMP)
35379 NODE_NAME_CASE(CTEST)
35380 NODE_NAME_CASE(CLOAD)
35381 NODE_NAME_CASE(CSTORE)
35382 NODE_NAME_CASE(CVTTS2SIS)
35383 NODE_NAME_CASE(CVTTS2UIS)
35384 NODE_NAME_CASE(CVTTS2SIS_SAE)
35385 NODE_NAME_CASE(CVTTS2UIS_SAE)
35386 NODE_NAME_CASE(CVTTP2SIS)
35387 NODE_NAME_CASE(MCVTTP2SIS)
35388 NODE_NAME_CASE(CVTTP2UIS_SAE)
35389 NODE_NAME_CASE(CVTTP2SIS_SAE)
35390 NODE_NAME_CASE(CVTTP2UIS)
35391 NODE_NAME_CASE(MCVTTP2UIS)
35392 NODE_NAME_CASE(POP_FROM_X87_REG)
35393 }
35394 return nullptr;
35395#undef NODE_NAME_CASE
35396}
35397
35398/// Return true if the addressing mode represented by AM is legal for this
35399/// target, for a load/store of the specified type.
35400bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35401 const AddrMode &AM, Type *Ty,
35402 unsigned AS,
35403 Instruction *I) const {
35404 // X86 supports extremely general addressing modes.
35405  CodeModel::Model M = getTargetMachine().getCodeModel();
35406
35407 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35408 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35409 return false;
35410
35411 if (AM.BaseGV) {
35412 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35413
35414 // If a reference to this global requires an extra load, we can't fold it.
35415 if (isGlobalStubReference(GVFlags))
35416 return false;
35417
35418 // If BaseGV requires a register for the PIC base, we cannot also have a
35419 // BaseReg specified.
35420 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35421 return false;
35422
35423 // If lower 4G is not available, then we must use rip-relative addressing.
35424 if ((M != CodeModel::Small || isPositionIndependent()) &&
35425 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35426 return false;
35427 }
35428
35429 switch (AM.Scale) {
35430 case 0:
35431 case 1:
35432 case 2:
35433 case 4:
35434 case 8:
35435 // These scales always work.
35436 break;
35437 case 3:
35438 case 5:
35439 case 9:
35440 // These scales are formed with basereg+scalereg. Only accept if there is
35441 // no basereg yet.
35442 if (AM.HasBaseReg)
35443 return false;
35444 break;
35445 default: // Other stuff never works.
35446 return false;
35447 }
35448
35449 return true;
35450}
35451
35452bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35453 switch (Opcode) {
35454 // These are non-commutative binops.
35455 // TODO: Add more X86ISD opcodes once we have test coverage.
35456 case X86ISD::ANDNP:
35457 case X86ISD::PCMPGT:
35458 case X86ISD::FMAX:
35459 case X86ISD::FMIN:
35460 case X86ISD::FANDN:
35461 case X86ISD::VPSHA:
35462 case X86ISD::VPSHL:
35463 case X86ISD::VSHLV:
35464 case X86ISD::VSRLV:
35465 case X86ISD::VSRAV:
35466 return true;
35467 }
35468
35469 return TargetLoweringBase::isBinOp(Opcode);
35470}
35471
35472bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35473 switch (Opcode) {
35474 // TODO: Add more X86ISD opcodes once we have test coverage.
35475 case X86ISD::PCMPEQ:
35476 case X86ISD::PMULDQ:
35477 case X86ISD::PMULUDQ:
35478 case X86ISD::FMAXC:
35479 case X86ISD::FMINC:
35480 case X86ISD::FAND:
35481 case X86ISD::FOR:
35482 case X86ISD::FXOR:
35483 return true;
35484 }
35485
35486  return TargetLoweringBase::isCommutativeBinOp(Opcode);
35487}
35488
35489bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35490 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35491 return false;
35492 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35493 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35494 return NumBits1 > NumBits2;
35495}
35496
35497bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35498 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35499 return false;
35500
35501 if (!isTypeLegal(EVT::getEVT(Ty1)))
35502 return false;
35503
35504 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35505
35506 // Assuming the caller doesn't have a zeroext or signext return parameter,
35507 // truncation all the way down to i1 is valid.
35508 return true;
35509}
35510
35511bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35512 return isInt<32>(Imm);
35513}
35514
35515bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35516 // Can also use sub to handle negated immediates.
35517 return isInt<32>(Imm);
35518}
35519
35520bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35521 return isInt<32>(Imm);
35522}
35523
35524bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35525 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35526 return false;
35527 unsigned NumBits1 = VT1.getSizeInBits();
35528 unsigned NumBits2 = VT2.getSizeInBits();
35529 return NumBits1 > NumBits2;
35530}
35531
35532bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35533 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35534 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35535}
35536
35537bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35538 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35539 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35540}
35541
35542bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35543 EVT VT1 = Val.getValueType();
35544 if (isZExtFree(VT1, VT2))
35545 return true;
35546
35547 if (Val.getOpcode() != ISD::LOAD)
35548 return false;
35549
35550 if (!VT1.isSimple() || !VT1.isInteger() ||
35551 !VT2.isSimple() || !VT2.isInteger())
35552 return false;
35553
35554 switch (VT1.getSimpleVT().SimpleTy) {
35555 default: break;
35556 case MVT::i8:
35557 case MVT::i16:
35558 case MVT::i32:
35559 // X86 has 8, 16, and 32-bit zero-extending loads.
35560 return true;
35561 }
35562
35563 return false;
35564}
35565
35566bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35567 if (!Subtarget.is64Bit())
35568 return false;
35569 return TargetLowering::shouldConvertPhiType(From, To);
35570}
35571
35572bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35573 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35574 return false;
35575
35576 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35577
35578 // There is no extending load for vXi1.
35579 if (SrcVT.getScalarType() == MVT::i1)
35580 return false;
35581
35582 return true;
35583}
35584
35585bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35586 EVT VT) const {
35587 if (Subtarget.useSoftFloat())
35588 return false;
35589
35590 if (!Subtarget.hasAnyFMA())
35591 return false;
35592
35593 VT = VT.getScalarType();
35594
35595 if (!VT.isSimple())
35596 return false;
35597
35598 switch (VT.getSimpleVT().SimpleTy) {
35599 case MVT::f16:
35600 return Subtarget.hasFP16();
35601 case MVT::f32:
35602 case MVT::f64:
35603 return true;
35604 default:
35605 break;
35606 }
35607
35608 return false;
35609}
35610
35611bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
35612 EVT DestVT) const {
35613 // i16 instructions are longer (0x66 prefix) and potentially slower.
35614 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35615}
35616
35617bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
35618 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
35619 SDValue Y) const {
35620 if (SelectOpcode == ISD::SELECT) {
35621 if (VT.isVector())
35622 return false;
35623 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
35624 return false;
35625 using namespace llvm::SDPatternMatch;
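    // BMI1 candidates that fold a zero-identity select:
    //   BLSI   = x & -x       (isolate lowest set bit)
    //   BLSR   = x & (x - 1)  (clear lowest set bit)
    //   BLSMSK = x ^ (x - 1)  (mask up to and including lowest set bit)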
35626 // BLSI
35627 if (BinOpcode == ISD::AND && (sd_match(Y, m_Neg(m_Specific(X))) ||
35628                                  sd_match(X, m_Neg(m_Specific(Y)))))
35629 return true;
35630 // BLSR
35631 if (BinOpcode == ISD::AND &&
35632        (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35633         sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35634 return true;
35635 // BLSMSK
35636 if (BinOpcode == ISD::XOR &&
35637        (sd_match(Y, m_Add(m_Specific(X), m_AllOnes())) ||
35638         sd_match(X, m_Add(m_Specific(Y), m_AllOnes()))))
35639 return true;
35640
35641 return false;
35642 }
35643 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35644 // benefit. The transform may also be profitable for scalar code.
35645 if (!Subtarget.hasAVX512())
35646 return false;
35647 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35648 return false;
35649 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35650 return false;
35651
35652 return true;
35653}
35654
35655/// Targets can use this to indicate that they only support *some*
35656/// VECTOR_SHUFFLE operations, those with specific masks.
35657/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35658/// are assumed to be legal.
35659bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35660 if (!VT.isSimple())
35661 return false;
35662
35663 // Not for i1 vectors
35664 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35665 return false;
35666
35667 // Very little shuffling can be done for 64-bit vectors right now.
35668 if (VT.getSimpleVT().getSizeInBits() == 64)
35669 return false;
35670
35671 // We only care that the types being shuffled are legal. The lowering can
35672 // handle any possible shuffle mask that results.
35673 return isTypeLegal(VT.getSimpleVT());
35674}
35675
35676bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35677 EVT VT) const {
35678 // Don't convert an 'and' into a shuffle that we don't directly support.
35679 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35680 if (!Subtarget.hasAVX2())
35681 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35682 return false;
35683
35684 // Just delegate to the generic legality, clear masks aren't special.
35685 return isShuffleMaskLegal(Mask, VT);
35686}
35687
35688bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35689 // If the subtarget is using thunks, we need to not generate jump tables.
35690 if (Subtarget.useIndirectThunkBranches())
35691 return false;
35692
35693  // Otherwise, fall back on the generic logic.
35694  return TargetLowering::areJTsAllowed(Fn);
35695}
35696
35697MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35698 EVT ConditionVT) const {
35699 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35700 // zero-extensions.
35701 if (ConditionVT.getSizeInBits() < 32)
35702 return MVT::i32;
35703  return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35704 ConditionVT);
35705}
35706
35707//===----------------------------------------------------------------------===//
35708// X86 Scheduler Hooks
35709//===----------------------------------------------------------------------===//
35710
35711/// Utility function to emit xbegin specifying the start of an RTM region.
35712static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35713 const TargetInstrInfo *TII) {
35714 const MIMetadata MIMD(MI);
35715
35716 const BasicBlock *BB = MBB->getBasicBlock();
35717 MachineFunction::iterator I = ++MBB->getIterator();
35718
35719 // For the v = xbegin(), we generate
35720 //
35721 // thisMBB:
35722  //    xbegin fallMBB
35723 //
35724 // mainMBB:
35725 // s0 = -1
35726 //
35727 // fallBB:
35728 // eax = # XABORT_DEF
35729 // s1 = eax
35730 //
35731 // sinkMBB:
35732 // v = phi(s0/mainBB, s1/fallBB)
35733
35734 MachineBasicBlock *thisMBB = MBB;
35735 MachineFunction *MF = MBB->getParent();
35736 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35737 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35738 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35739 MF->insert(I, mainMBB);
35740 MF->insert(I, fallMBB);
35741 MF->insert(I, sinkMBB);
35742
35743 if (isPhysRegUsedAfter(X86::EFLAGS, MI)) {
35744 mainMBB->addLiveIn(X86::EFLAGS);
35745 fallMBB->addLiveIn(X86::EFLAGS);
35746 sinkMBB->addLiveIn(X86::EFLAGS);
35747 }
35748
35749 // Transfer the remainder of BB and its successor edges to sinkMBB.
35750 sinkMBB->splice(sinkMBB->begin(), MBB,
35751 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35752  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35753
35754  MachineRegisterInfo &MRI = MF->getRegInfo();
35755 Register DstReg = MI.getOperand(0).getReg();
35756 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35757 Register mainDstReg = MRI.createVirtualRegister(RC);
35758 Register fallDstReg = MRI.createVirtualRegister(RC);
35759
35760 // thisMBB:
35761 // xbegin fallMBB
35762 // # fallthrough to mainMBB
35763  //    # on abort, branch to fallMBB
35764 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35765 thisMBB->addSuccessor(mainMBB);
35766 thisMBB->addSuccessor(fallMBB);
35767
35768 // mainMBB:
35769 // mainDstReg := -1
35770 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35771 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35772 mainMBB->addSuccessor(sinkMBB);
35773
35774 // fallMBB:
35775 // ; pseudo instruction to model hardware's definition from XABORT
35776 // EAX := XABORT_DEF
35777 // fallDstReg := EAX
35778 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
35779 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
35780 .addReg(X86::EAX);
35781 fallMBB->addSuccessor(sinkMBB);
35782
35783 // sinkMBB:
35784 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35785 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35786 .addReg(mainDstReg).addMBB(mainMBB)
35787 .addReg(fallDstReg).addMBB(fallMBB);
35788
35789 MI.eraseFromParent();
35790 return sinkMBB;
35791}
35792
35793MachineBasicBlock *
35794X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35795 MachineBasicBlock *MBB) const {
35796 // Emit va_arg instruction on X86-64.
35797
35798 // Operands to this pseudo-instruction:
35799 // 0 ) Output : destination address (reg)
35800 // 1-5) Input : va_list address (addr, i64mem)
35801 // 6 ) ArgSize : Size (in bytes) of vararg type
35802 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35803 // 8 ) Align : Alignment of type
35804 // 9 ) EFLAGS (implicit-def)
35805
35806 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35807 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35808
35809 Register DestReg = MI.getOperand(0).getReg();
35810 MachineOperand &Base = MI.getOperand(1);
35811 MachineOperand &Scale = MI.getOperand(2);
35812 MachineOperand &Index = MI.getOperand(3);
35813 MachineOperand &Disp = MI.getOperand(4);
35814 MachineOperand &Segment = MI.getOperand(5);
35815 unsigned ArgSize = MI.getOperand(6).getImm();
35816 unsigned ArgMode = MI.getOperand(7).getImm();
35817 Align Alignment = Align(MI.getOperand(8).getImm());
35818
35819 MachineFunction *MF = MBB->getParent();
35820
35821 // Memory Reference
35822 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35823
35824 MachineMemOperand *OldMMO = MI.memoperands().front();
35825
35826 // Clone the MMO into two separate MMOs for loading and storing
35827 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35828 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35829 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35830 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35831
35832 // Machine Information
35833 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35834 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35835 const TargetRegisterClass *AddrRegClass =
35836      getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35837 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35838 const MIMetadata MIMD(MI);
35839
35840 // struct va_list {
35841 // i32 gp_offset
35842 // i32 fp_offset
35843 // i64 overflow_area (address)
35844 // i64 reg_save_area (address)
35845 // }
35846 // sizeof(va_list) = 24
35847 // alignment(va_list) = 8
35848
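  // System V x86-64: the reg_save_area holds the six integer argument
  // registers (8 bytes each) followed by the eight XMM argument registers
  // (16 bytes each); gp_offset and fp_offset are byte offsets into that area.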
35849 unsigned TotalNumIntRegs = 6;
35850 unsigned TotalNumXMMRegs = 8;
35851 bool UseGPOffset = (ArgMode == 1);
35852 bool UseFPOffset = (ArgMode == 2);
35853 unsigned MaxOffset = TotalNumIntRegs * 8 +
35854 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35855
35856 /* Align ArgSize to a multiple of 8 */
35857 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35858 bool NeedsAlign = (Alignment > 8);
35859
35860 MachineBasicBlock *thisMBB = MBB;
35861 MachineBasicBlock *overflowMBB;
35862 MachineBasicBlock *offsetMBB;
35863 MachineBasicBlock *endMBB;
35864
35865 Register OffsetDestReg; // Argument address computed by offsetMBB
35866 Register OverflowDestReg; // Argument address computed by overflowMBB
35867 Register OffsetReg;
35868
35869 if (!UseGPOffset && !UseFPOffset) {
35870 // If we only pull from the overflow region, we don't create a branch.
35871 // We don't need to alter control flow.
35872 OffsetDestReg = Register(); // unused
35873 OverflowDestReg = DestReg;
35874
35875 offsetMBB = nullptr;
35876 overflowMBB = thisMBB;
35877 endMBB = thisMBB;
35878 } else {
35879 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35880 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35881 // If not, pull from overflow_area. (branch to overflowMBB)
35882 //
35883 // thisMBB
35884 // | .
35885 // | .
35886 // offsetMBB overflowMBB
35887 // | .
35888 // | .
35889 // endMBB
35890
35891 // Registers for the PHI in endMBB
35892 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35893 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35894
35895 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35896 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35897 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35898 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35899
35900 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35901
35902 // Insert the new basic blocks
35903 MF->insert(MBBIter, offsetMBB);
35904 MF->insert(MBBIter, overflowMBB);
35905 MF->insert(MBBIter, endMBB);
35906
35907 // Transfer the remainder of MBB and its successor edges to endMBB.
35908 endMBB->splice(endMBB->begin(), thisMBB,
35909 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35910 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35911
35912 // Make offsetMBB and overflowMBB successors of thisMBB
35913 thisMBB->addSuccessor(offsetMBB);
35914 thisMBB->addSuccessor(overflowMBB);
35915
35916 // endMBB is a successor of both offsetMBB and overflowMBB
35917 offsetMBB->addSuccessor(endMBB);
35918 overflowMBB->addSuccessor(endMBB);
35919
35920 // Load the offset value into a register
35921 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35922 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
35923 .add(Base)
35924 .add(Scale)
35925 .add(Index)
35926 .addDisp(Disp, UseFPOffset ? 4 : 0)
35927 .add(Segment)
35928 .setMemRefs(LoadOnlyMMO);
35929
35930 // Check if there is enough room left to pull this argument.
35931 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
35932 .addReg(OffsetReg)
35933 .addImm(MaxOffset + 8 - ArgSizeA8);
35934
35935 // Branch to "overflowMBB" if offset >= max
35936 // Fall through to "offsetMBB" otherwise
35937 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
35938 .addMBB(overflowMBB).addImm(X86::COND_AE);
35939 }
35940
35941 // In offsetMBB, emit code to use the reg_save_area.
35942 if (offsetMBB) {
35943 assert(OffsetReg != 0);
35944
35945 // Read the reg_save_area address.
35946 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35947 BuildMI(
35948 offsetMBB, MIMD,
35949 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35950 RegSaveReg)
35951 .add(Base)
35952 .add(Scale)
35953 .add(Index)
35954 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35955 .add(Segment)
35956 .setMemRefs(LoadOnlyMMO);
35957
35958 if (Subtarget.isTarget64BitLP64()) {
35959 // Zero-extend the offset
35960 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35961 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35962 .addImm(0)
35963 .addReg(OffsetReg)
35964 .addImm(X86::sub_32bit);
35965
35966 // Add the offset to the reg_save_area to get the final address.
35967 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
35968 .addReg(OffsetReg64)
35969 .addReg(RegSaveReg);
35970 } else {
35971 // Add the offset to the reg_save_area to get the final address.
35972 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
35973 .addReg(OffsetReg)
35974 .addReg(RegSaveReg);
35975 }
35976
35977 // Compute the offset for the next argument
35978 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35979 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
35980 .addReg(OffsetReg)
35981 .addImm(UseFPOffset ? 16 : 8);
35982
35983 // Store it back into the va_list.
35984 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
35985 .add(Base)
35986 .add(Scale)
35987 .add(Index)
35988 .addDisp(Disp, UseFPOffset ? 4 : 0)
35989 .add(Segment)
35990 .addReg(NextOffsetReg)
35991 .setMemRefs(StoreOnlyMMO);
35992
35993 // Jump to endMBB
35994 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
35995 .addMBB(endMBB);
35996 }
35997
35998 //
35999 // Emit code to use overflow area
36000 //
36001
36002 // Load the overflow_area address into a register.
36003 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36004 BuildMI(overflowMBB, MIMD,
36005 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36006 OverflowAddrReg)
36007 .add(Base)
36008 .add(Scale)
36009 .add(Index)
36010 .addDisp(Disp, 8)
36011 .add(Segment)
36012 .setMemRefs(LoadOnlyMMO);
36013
36014 // If we need to align it, do so. Otherwise, just copy the address
36015 // to OverflowDestReg.
36016 if (NeedsAlign) {
36017 // Align the overflow address
36018 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36019
36020 // aligned_addr = (addr + (align-1)) & ~(align-1)
36021 BuildMI(
36022 overflowMBB, MIMD,
36023 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36024 TmpReg)
36025 .addReg(OverflowAddrReg)
36026 .addImm(Alignment.value() - 1);
36027
36028 BuildMI(
36029 overflowMBB, MIMD,
36030 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36031 OverflowDestReg)
36032 .addReg(TmpReg)
36033 .addImm(~(uint64_t)(Alignment.value() - 1));
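// Worked example of the align-up performed by the ADD/AND pair above
// (illustrative only): for an overflow_area address of 0x1003 and
// Alignment == 16, (0x1003 + 15) & ~15 == 0x1010, i.e. the address is
// rounded up to the next 16-byte boundary.
static_assert(((0x1003u + 15u) & ~15u) == 0x1010u,
              "align-up example for aligned_addr = (addr + (align-1)) & ~(align-1)");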
36034 } else {
36035 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
36036 .addReg(OverflowAddrReg);
36037 }
36038
36039 // Compute the next overflow address after this argument.
36040 // (the overflow address should be kept 8-byte aligned)
36041 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36042 BuildMI(
36043 overflowMBB, MIMD,
36044 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36045 NextAddrReg)
36046 .addReg(OverflowDestReg)
36047 .addImm(ArgSizeA8);
36048
36049 // Store the new overflow address.
36050 BuildMI(overflowMBB, MIMD,
36051 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36052 .add(Base)
36053 .add(Scale)
36054 .add(Index)
36055 .addDisp(Disp, 8)
36056 .add(Segment)
36057 .addReg(NextAddrReg)
36058 .setMemRefs(StoreOnlyMMO);
36059
36060 // If we branched, emit the PHI to the front of endMBB.
36061 if (offsetMBB) {
36062 BuildMI(*endMBB, endMBB->begin(), MIMD,
36063 TII->get(X86::PHI), DestReg)
36064 .addReg(OffsetDestReg).addMBB(offsetMBB)
36065 .addReg(OverflowDestReg).addMBB(overflowMBB);
36066 }
36067
36068 // Erase the pseudo instruction
36069 MI.eraseFromParent();
36070
36071 return endMBB;
36072}
36073
36074// The EFLAGS operand of SelectItr might be missing a kill marker
36075// because there were multiple uses of EFLAGS, and ISel didn't know
36076// which to mark. Figure out whether SelectItr should have had a
36077// kill marker, and set it if it should. Returns the correct kill
36078// marker value.
36079 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36080 MachineBasicBlock* BB,
36081 const TargetRegisterInfo* TRI) {
36082 if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
36083 return false;
36084
36085 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36086 // out. SelectMI should have a kill flag on EFLAGS.
36087 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36088 return true;
36089}
36090
36091// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36092// together with other CMOV pseudo-opcodes into a single basic-block with
36093// conditional jump around it.
36094 static bool isCMOVPseudo(MachineInstr &MI) {
36095 switch (MI.getOpcode()) {
36096 case X86::CMOV_FR16:
36097 case X86::CMOV_FR16X:
36098 case X86::CMOV_FR32:
36099 case X86::CMOV_FR32X:
36100 case X86::CMOV_FR64:
36101 case X86::CMOV_FR64X:
36102 case X86::CMOV_GR8:
36103 case X86::CMOV_GR16:
36104 case X86::CMOV_GR32:
36105 case X86::CMOV_RFP32:
36106 case X86::CMOV_RFP64:
36107 case X86::CMOV_RFP80:
36108 case X86::CMOV_VR64:
36109 case X86::CMOV_VR128:
36110 case X86::CMOV_VR128X:
36111 case X86::CMOV_VR256:
36112 case X86::CMOV_VR256X:
36113 case X86::CMOV_VR512:
36114 case X86::CMOV_VK1:
36115 case X86::CMOV_VK2:
36116 case X86::CMOV_VK4:
36117 case X86::CMOV_VK8:
36118 case X86::CMOV_VK16:
36119 case X86::CMOV_VK32:
36120 case X86::CMOV_VK64:
36121 return true;
36122
36123 default:
36124 return false;
36125 }
36126}
36127
36128// Helper function, which inserts PHI functions into SinkMBB:
36129// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36130// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36131// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36132// the last PHI function inserted.
36133 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36134 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36135 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36136 MachineBasicBlock *SinkMBB) {
36137 MachineFunction *MF = TrueMBB->getParent();
36138 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36139 const MIMetadata MIMD(*MIItBegin);
36140
36141 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36142 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36143
36144 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36145
36146 // As we are creating the PHIs, we have to be careful if there is more than
36147 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36148 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36149 // That also means that PHI construction must work forward from earlier to
36150 // later, and that the code must maintain a mapping from earlier PHI's
36151 // destination registers, and the registers that went into the PHI.
36152 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
36153 MachineInstrBuilder MIB;
36154
36155 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36156 Register DestReg = MIIt->getOperand(0).getReg();
36157 Register Op1Reg = MIIt->getOperand(1).getReg();
36158 Register Op2Reg = MIIt->getOperand(2).getReg();
36159
36160 // If this CMOV we are generating is the opposite condition from
36161 // the jump we generated, then we have to swap the operands for the
36162 // PHI that is going to be generated.
36163 if (MIIt->getOperand(3).getImm() == OppCC)
36164 std::swap(Op1Reg, Op2Reg);
36165
36166 if (auto It = RegRewriteTable.find(Op1Reg); It != RegRewriteTable.end())
36167 Op1Reg = It->second.first;
36168
36169 if (auto It = RegRewriteTable.find(Op2Reg); It != RegRewriteTable.end())
36170 Op2Reg = It->second.second;
36171
36172 MIB =
36173 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
36174 .addReg(Op1Reg)
36175 .addMBB(FalseMBB)
36176 .addReg(Op2Reg)
36177 .addMBB(TrueMBB);
36178
36179 // Add this PHI to the rewrite table.
36180 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36181 }
36182
36183 return MIB;
36184}
36185
36186 // Lower cascaded selects in the form of (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
36187 MachineBasicBlock *
36188 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36189 MachineInstr &SecondCascadedCMOV,
36190 MachineBasicBlock *ThisMBB) const {
36191 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36192 const MIMetadata MIMD(FirstCMOV);
36193
36194 // We lower cascaded CMOVs such as
36195 //
36196 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36197 //
36198 // to two successive branches.
36199 //
36200 // Without this, we would add a PHI between the two jumps, which ends up
36201 // creating a few copies all around. For instance, for
36202 //
36203 // (sitofp (zext (fcmp une)))
36204 //
36205 // we would generate:
36206 //
36207 // ucomiss %xmm1, %xmm0
36208 // movss <1.0f>, %xmm0
36209 // movaps %xmm0, %xmm1
36210 // jne .LBB5_2
36211 // xorps %xmm1, %xmm1
36212 // .LBB5_2:
36213 // jp .LBB5_4
36214 // movaps %xmm1, %xmm0
36215 // .LBB5_4:
36216 // retq
36217 //
36218 // because this custom-inserter would have generated:
36219 //
36220 // A
36221 // | \
36222 // | B
36223 // | /
36224 // C
36225 // | \
36226 // | D
36227 // | /
36228 // E
36229 //
36230 // A: X = ...; Y = ...
36231 // B: empty
36232 // C: Z = PHI [X, A], [Y, B]
36233 // D: empty
36234 // E: PHI [X, C], [Z, D]
36235 //
36236 // If we lower both CMOVs in a single step, we can instead generate:
36237 //
36238 // A
36239 // | \
36240 // | C
36241 // | /|
36242 // |/ |
36243 // | |
36244 // | D
36245 // | /
36246 // E
36247 //
36248 // A: X = ...; Y = ...
36249 // D: empty
36250 // E: PHI [X, A], [X, C], [Y, D]
36251 //
36252 // Which, in our sitofp/fcmp example, gives us something like:
36253 //
36254 // ucomiss %xmm1, %xmm0
36255 // movss <1.0f>, %xmm0
36256 // jne .LBB5_4
36257 // jp .LBB5_4
36258 // xorps %xmm0, %xmm0
36259 // .LBB5_4:
36260 // retq
36261 //
36262
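// Minimal source-level sketch of the pattern discussed above (the helper name
// is invented for illustration): (a != b) on floats is an 'fcmp une', and the
// conversion of its zero-extended result back to float is the
// (sitofp (zext (fcmp une))) shape that yields the cascaded CMOVs.
auto CascadedCmovSourceSketch = [](float a, float b) -> float {
  return static_cast<float>(static_cast<int>(a != b));
};
(void)CascadedCmovSourceSketch;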
36263 // We lower cascaded CMOV into two successive branches to the same block.
36264 // EFLAGS is used by both, so mark it as live in the second.
36265 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36266 MachineFunction *F = ThisMBB->getParent();
36267 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36268 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36269 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36270
36271 MachineFunction::iterator It = ++ThisMBB->getIterator();
36272 F->insert(It, FirstInsertedMBB);
36273 F->insert(It, SecondInsertedMBB);
36274 F->insert(It, SinkMBB);
36275
36276 // For a cascaded CMOV, we lower it to two successive branches to
36277 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36278 // the FirstInsertedMBB.
36279 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36280
36281 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36282 // live into the sink and copy blocks.
36283 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36284 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36285 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36286 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36287 SinkMBB->addLiveIn(X86::EFLAGS);
36288 }
36289
36290 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36291 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36292 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36293 ThisMBB->end());
36294 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36295
36296 // Fallthrough block for ThisMBB.
36297 ThisMBB->addSuccessor(FirstInsertedMBB);
36298 // The true block target of the first branch is always SinkMBB.
36299 ThisMBB->addSuccessor(SinkMBB);
36300 // Fallthrough block for FirstInsertedMBB.
36301 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36302 // The true block for the branch of FirstInsertedMBB.
36303 FirstInsertedMBB->addSuccessor(SinkMBB);
36304 // This is fallthrough.
36305 SecondInsertedMBB->addSuccessor(SinkMBB);
36306
36307 // Create the conditional branch instructions.
36308 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36309 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36310
36311 X86::CondCode SecondCC =
36312 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36313 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
36314 .addMBB(SinkMBB)
36315 .addImm(SecondCC);
36316
36317 // SinkMBB:
36318 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36319 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36320 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36321 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36322 MachineInstrBuilder MIB =
36323 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
36324 .addReg(Op1Reg)
36325 .addMBB(SecondInsertedMBB)
36326 .addReg(Op2Reg)
36327 .addMBB(ThisMBB);
36328
36329 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
36330 // (the True operand of the SELECT_CC/CMOV nodes).
36331 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36332
36333 // Now remove the CMOVs.
36334 FirstCMOV.eraseFromParent();
36335 SecondCascadedCMOV.eraseFromParent();
36336
36337 return SinkMBB;
36338}
36339
36340 MachineBasicBlock *
36341 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36342 MachineBasicBlock *ThisMBB) const {
36343 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36344 const MIMetadata MIMD(MI);
36345
36346 // To "insert" a SELECT_CC instruction, we actually have to insert the
36347 // diamond control-flow pattern. The incoming instruction knows the
36348 // destination vreg to set, the condition code register to branch on, the
36349 // true/false values to select between and a branch opcode to use.
36350
36351 // ThisMBB:
36352 // ...
36353 // TrueVal = ...
36354 // cmpTY ccX, r1, r2
36355 // bCC copy1MBB
36356 // fallthrough --> FalseMBB
36357
36358 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36359 // as described above, by inserting a BB, and then making a PHI at the join
36360 // point to select the true and false operands of the CMOV in the PHI.
36361 //
36362 // The code also handles two different cases of multiple CMOV opcodes
36363 // in a row.
36364 //
36365 // Case 1:
36366 // In this case, there are multiple CMOVs in a row, all which are based on
36367 // the same condition setting (or the exact opposite condition setting).
36368 // In this case we can lower all the CMOVs using a single inserted BB, and
36369 // then make a number of PHIs at the join point to model the CMOVs. The only
36370 // trickiness here is that in a case like:
36371 //
36372 // t2 = CMOV cond1 t1, f1
36373 // t3 = CMOV cond1 t2, f2
36374 //
36375 // when rewriting this into PHIs, we have to perform some renaming on the
36376 // temps since you cannot have a PHI operand refer to a PHI result earlier
36377 // in the same block. The "simple" but wrong lowering would be:
36378 //
36379 // t2 = PHI t1(BB1), f1(BB2)
36380 // t3 = PHI t2(BB1), f2(BB2)
36381 //
36382 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36383 // renaming is to note that on the path through BB1, t2 is really just a
36384 // copy of t1, and do that renaming, properly generating:
36385 //
36386 // t2 = PHI t1(BB1), f1(BB2)
36387 // t3 = PHI t1(BB1), f2(BB2)
36388 //
36389 // Case 2:
36390 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36391 // function - EmitLoweredCascadedSelect.
36392
36393 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36394 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36395 MachineInstr *LastCMOV = &MI;
36396 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36397
36398 // Check for case 1, where there are multiple CMOVs with the same condition
36399 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36400 // number of jumps the most.
36401
36402 if (isCMOVPseudo(MI)) {
36403 // See if we have a string of CMOVS with the same condition. Skip over
36404 // intervening debug insts.
36405 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36406 (NextMIIt->getOperand(3).getImm() == CC ||
36407 NextMIIt->getOperand(3).getImm() == OppCC)) {
36408 LastCMOV = &*NextMIIt;
36409 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36410 }
36411 }
36412
36413 // Check for case 2, but only if we didn't already find case 1, as
36414 // indicated by LastCMOV == MI.
36415 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36416 NextMIIt->getOpcode() == MI.getOpcode() &&
36417 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36418 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36419 NextMIIt->getOperand(1).isKill()) {
36420 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36421 }
36422
36423 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36424 MachineFunction *F = ThisMBB->getParent();
36425 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36426 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36427
36428 MachineFunction::iterator It = ++ThisMBB->getIterator();
36429 F->insert(It, FalseMBB);
36430 F->insert(It, SinkMBB);
36431
36432 // Set the call frame size on entry to the new basic blocks.
36433 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
36434 FalseMBB->setCallFrameSize(CallFrameSize);
36435 SinkMBB->setCallFrameSize(CallFrameSize);
36436
36437 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36438 // live into the sink and copy blocks.
36439 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36440 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
36441 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36442 FalseMBB->addLiveIn(X86::EFLAGS);
36443 SinkMBB->addLiveIn(X86::EFLAGS);
36444 }
36445
36446 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36447 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36448 MachineBasicBlock::iterator(LastCMOV));
36449 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36450 if (MI.isDebugInstr())
36451 SinkMBB->push_back(MI.removeFromParent());
36452
36453 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36454 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36455 std::next(MachineBasicBlock::iterator(LastCMOV)),
36456 ThisMBB->end());
36457 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36458
36459 // Fallthrough block for ThisMBB.
36460 ThisMBB->addSuccessor(FalseMBB);
36461 // The true block target of the first (or only) branch is always a SinkMBB.
36462 ThisMBB->addSuccessor(SinkMBB);
36463 // Fallthrough block for FalseMBB.
36464 FalseMBB->addSuccessor(SinkMBB);
36465
36466 // Create the conditional branch instruction.
36467 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36468
36469 // SinkMBB:
36470 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36471 // ...
36472 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36473 MachineBasicBlock::iterator MIItEnd =
36474 std::next(MachineBasicBlock::iterator(LastCMOV));
36475 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36476
36477 // Now remove the CMOV(s).
36478 ThisMBB->erase(MIItBegin, MIItEnd);
36479
36480 return SinkMBB;
36481}
36482
36483static unsigned getSUBriOpcode(bool IsLP64) {
36484 if (IsLP64)
36485 return X86::SUB64ri32;
36486 else
36487 return X86::SUB32ri;
36488}
36489
36490 MachineBasicBlock *
36491 X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36492 MachineBasicBlock *MBB) const {
36493 MachineFunction *MF = MBB->getParent();
36494 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36495 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36496 const MIMetadata MIMD(MI);
36497 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36498
36499 const unsigned ProbeSize = getStackProbeSize(*MF);
36500
36501 MachineRegisterInfo &MRI = MF->getRegInfo();
36502 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36503 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36504 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36505
36506 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36507 MF->insert(MBBIter, testMBB);
36508 MF->insert(MBBIter, blockMBB);
36509 MF->insert(MBBIter, tailMBB);
36510
36511 Register sizeVReg = MI.getOperand(1).getReg();
36512
36513 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36514
36515 Register TmpStackPtr = MRI.createVirtualRegister(
36516 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36517 Register FinalStackPtr = MRI.createVirtualRegister(
36518 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36519
36520 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
36521 .addReg(physSPReg);
36522 {
36523 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36524 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
36525 .addReg(TmpStackPtr)
36526 .addReg(sizeVReg);
36527 }
36528
36529 // test rsp size
36530
36531 BuildMI(testMBB, MIMD,
36532 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36533 .addReg(FinalStackPtr)
36534 .addReg(physSPReg);
36535
36536 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
36537 .addMBB(tailMBB)
36538 .addImm(X86::COND_GE);
36539 testMBB->addSuccessor(blockMBB);
36540 testMBB->addSuccessor(tailMBB);
36541
36542 // Touch the block then extend it. This is done on the opposite side of
36543 // static probe where we allocate then touch, to avoid the need of probing the
36544 // tail of the static alloca. Possible scenarios are:
36545 //
36546 // + ---- <- ------------ <- ------------- <- ------------ +
36547 // | |
36548 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36549 // | |
36550 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36551 //
36552 // The property we want to enforce is to never have more than [page alloc] between two probes.
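// Illustration of that property (assuming the common 4096-byte probe size
// returned by getStackProbeSize): a 10000-byte dynamic allocation runs the
// block below three times, touching SP, SP-4096 and SP-8192, so consecutive
// probes are never more than one page apart.
static_assert(10000u > 2u * 4096u && 10000u <= 3u * 4096u,
              "a 10000-byte allocation needs three 4096-byte probe rounds");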
36553
36554 const unsigned XORMIOpc =
36555 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
36556 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
36557 .addImm(0);
36558
36559 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
36560 physSPReg)
36561 .addReg(physSPReg)
36562 .addImm(ProbeSize);
36563
36564 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
36565 blockMBB->addSuccessor(testMBB);
36566
36567 // Replace original instruction by the expected stack ptr
36568 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
36569 MI.getOperand(0).getReg())
36570 .addReg(FinalStackPtr);
36571
36572 tailMBB->splice(tailMBB->end(), MBB,
36573 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36574 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36575 MBB->addSuccessor(testMBB);
36576
36577 // Delete the original pseudo instruction.
36578 MI.eraseFromParent();
36579
36580 // And we're done.
36581 return tailMBB;
36582}
36583
36584 MachineBasicBlock *
36585 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36586 MachineBasicBlock *BB) const {
36587 MachineFunction *MF = BB->getParent();
36588 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36589 const MIMetadata MIMD(MI);
36590 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36591
36592 assert(MF->shouldSplitStack());
36593
36594 const bool Is64Bit = Subtarget.is64Bit();
36595 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36596
36597 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36598 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36599
36600 // BB:
36601 // ... [Till the alloca]
36602 // If stacklet is not large enough, jump to mallocMBB
36603 //
36604 // bumpMBB:
36605 // Allocate by subtracting from RSP
36606 // Jump to continueMBB
36607 //
36608 // mallocMBB:
36609 // Allocate by call to runtime
36610 //
36611 // continueMBB:
36612 // ...
36613 // [rest of original BB]
36614 //
36615
36616 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36617 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36618 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36619
36620 MachineRegisterInfo &MRI = MF->getRegInfo();
36621 const TargetRegisterClass *AddrRegClass =
36622 getRegClassFor(getPointerTy(MF->getDataLayout()));
36623
36624 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36625 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36626 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36627 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36628 sizeVReg = MI.getOperand(1).getReg(),
36629 physSPReg = IsLP64 ? X86::RSP : X86::ESP;
36630
36631 MachineFunction::iterator MBBIter = ++BB->getIterator();
36632
36633 MF->insert(MBBIter, bumpMBB);
36634 MF->insert(MBBIter, mallocMBB);
36635 MF->insert(MBBIter, continueMBB);
36636
36637 continueMBB->splice(continueMBB->begin(), BB,
36638 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36639 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36640
36641 // Add code to the main basic block to check if the stack limit has been hit,
36642 // and if so, jump to mallocMBB otherwise to bumpMBB.
36643 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36644 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36645 .addReg(tmpSPVReg).addReg(sizeVReg);
36646 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36647 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36648 .addReg(SPLimitVReg);
36649 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36650
36651 // bumpMBB simply decreases the stack pointer, since we know the current
36652 // stacklet has enough space.
36653 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
36654 .addReg(SPLimitVReg);
36655 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36656 .addReg(SPLimitVReg);
36657 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36658
36659 // Calls into a routine in libgcc to allocate more space from the heap.
36660 const uint32_t *RegMask =
36661 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36662 if (IsLP64) {
36663 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
36664 .addReg(sizeVReg);
36665 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36666 .addExternalSymbol("__morestack_allocate_stack_space")
36667 .addRegMask(RegMask)
36668 .addReg(X86::RDI, RegState::Implicit)
36669 .addReg(X86::RAX, RegState::ImplicitDefine);
36670 } else if (Is64Bit) {
36671 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
36672 .addReg(sizeVReg);
36673 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
36674 .addExternalSymbol("__morestack_allocate_stack_space")
36675 .addRegMask(RegMask)
36676 .addReg(X86::EDI, RegState::Implicit)
36677 .addReg(X86::EAX, RegState::ImplicitDefine);
36678 } else {
36679 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36680 .addImm(12);
36681 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36682 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
36683 .addExternalSymbol("__morestack_allocate_stack_space")
36684 .addRegMask(RegMask)
36685 .addReg(X86::EAX, RegState::ImplicitDefine);
36686 }
36687
36688 if (!Is64Bit)
36689 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36690 .addImm(16);
36691
36692 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36693 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36694 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
36695
36696 // Set up the CFG correctly.
36697 BB->addSuccessor(bumpMBB);
36698 BB->addSuccessor(mallocMBB);
36699 mallocMBB->addSuccessor(continueMBB);
36700 bumpMBB->addSuccessor(continueMBB);
36701
36702 // Take care of the PHI nodes.
36703 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
36704 MI.getOperand(0).getReg())
36705 .addReg(mallocPtrVReg)
36706 .addMBB(mallocMBB)
36707 .addReg(bumpSPPtrVReg)
36708 .addMBB(bumpMBB);
36709
36710 // Delete the original pseudo instruction.
36711 MI.eraseFromParent();
36712
36713 // And we're done.
36714 return continueMBB;
36715}
36716
36717 MachineBasicBlock *
36718 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36719 MachineBasicBlock *BB) const {
36720 MachineFunction *MF = BB->getParent();
36721 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36722 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36723 const MIMetadata MIMD(MI);
36724
36727 "SEH does not use catchret!");
36728
36729 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36730 if (!Subtarget.is32Bit())
36731 return BB;
36732
36733 // C++ EH creates a new target block to hold the restore code, and wires up
36734 // the new block to the return destination with a normal JMP_4.
36735 MachineBasicBlock *RestoreMBB =
36736 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36737 assert(BB->succ_size() == 1);
36738 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36739 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36740 BB->addSuccessor(RestoreMBB);
36741 MI.getOperand(0).setMBB(RestoreMBB);
36742
36743 // Marking this as an EH pad but not a funclet entry block causes PEI to
36744 // restore stack pointers in the block.
36745 RestoreMBB->setIsEHPad(true);
36746
36747 auto RestoreMBBI = RestoreMBB->begin();
36748 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36749 return BB;
36750}
36751
36752 MachineBasicBlock *
36753 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36754 MachineBasicBlock *BB) const {
36755 // This is pretty easy. We're taking the value that we received from
36756 // our load from the relocation, sticking it in either RDI (x86-64)
36757 // or EAX and doing an indirect call. The return value will then
36758 // be in the normal return register.
36759 MachineFunction *F = BB->getParent();
36760 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36761 const MIMetadata MIMD(MI);
36762
36763 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36764 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36765
36766 // Get a register mask for the lowered call.
36767 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36768 // proper register mask.
36769 const uint32_t *RegMask =
36770 Subtarget.is64Bit() ?
36771 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36772 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36773 if (Subtarget.is64Bit()) {
36774 MachineInstrBuilder MIB =
36775 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
36776 .addReg(X86::RIP)
36777 .addImm(0)
36778 .addReg(0)
36779 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36780 MI.getOperand(3).getTargetFlags())
36781 .addReg(0);
36782 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
36783 addDirectMem(MIB, X86::RDI);
36784 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36785 } else if (!isPositionIndependent()) {
36786 MachineInstrBuilder MIB =
36787 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36788 .addReg(0)
36789 .addImm(0)
36790 .addReg(0)
36791 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36792 MI.getOperand(3).getTargetFlags())
36793 .addReg(0);
36794 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36795 addDirectMem(MIB, X86::EAX);
36796 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36797 } else {
36798 MachineInstrBuilder MIB =
36799 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
36800 .addReg(TII->getGlobalBaseReg(F))
36801 .addImm(0)
36802 .addReg(0)
36803 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36804 MI.getOperand(3).getTargetFlags())
36805 .addReg(0);
36806 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
36807 addDirectMem(MIB, X86::EAX);
36808 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36809 }
36810
36811 MI.eraseFromParent(); // The pseudo instruction is gone now.
36812 return BB;
36813}
36814
36815static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36816 switch (RPOpc) {
36817 case X86::INDIRECT_THUNK_CALL32:
36818 return X86::CALLpcrel32;
36819 case X86::INDIRECT_THUNK_CALL64:
36820 return X86::CALL64pcrel32;
36821 case X86::INDIRECT_THUNK_TCRETURN32:
36822 return X86::TCRETURNdi;
36823 case X86::INDIRECT_THUNK_TCRETURN64:
36824 return X86::TCRETURNdi64;
36825 }
36826 llvm_unreachable("not indirect thunk opcode");
36827}
36828
36829static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36830 Register Reg) {
36831 if (Subtarget.useRetpolineExternalThunk()) {
36832 // When using an external thunk for retpolines, we pick names that match the
36833 // names GCC happens to use as well. This helps simplify the implementation
36834 // of the thunks for kernels where they have no easy ability to create
36835 // aliases and are doing non-trivial configuration of the thunk's body. For
36836 // example, the Linux kernel will do boot-time hot patching of the thunk
36837 // bodies and cannot easily export aliases of these to loaded modules.
36838 //
36839 // Note that at any point in the future, we may need to change the semantics
36840 // of how we implement retpolines and at that time will likely change the
36841 // name of the called thunk. Essentially, there is no hard guarantee that
36842 // LLVM will generate calls to specific thunks, we merely make a best-effort
36843 // attempt to help out kernels and other systems where duplicating the
36844 // thunks is costly.
36845 switch (Reg.id()) {
36846 case X86::EAX:
36847 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36848 return "__x86_indirect_thunk_eax";
36849 case X86::ECX:
36850 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36851 return "__x86_indirect_thunk_ecx";
36852 case X86::EDX:
36853 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36854 return "__x86_indirect_thunk_edx";
36855 case X86::EDI:
36856 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36857 return "__x86_indirect_thunk_edi";
36858 case X86::R11:
36859 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36860 return "__x86_indirect_thunk_r11";
36861 }
36862 llvm_unreachable("unexpected reg for external indirect thunk");
36863 }
36864
36865 if (Subtarget.useRetpolineIndirectCalls() ||
36866 Subtarget.useRetpolineIndirectBranches()) {
36867 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36868 switch (Reg.id()) {
36869 case X86::EAX:
36870 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36871 return "__llvm_retpoline_eax";
36872 case X86::ECX:
36873 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36874 return "__llvm_retpoline_ecx";
36875 case X86::EDX:
36876 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36877 return "__llvm_retpoline_edx";
36878 case X86::EDI:
36879 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36880 return "__llvm_retpoline_edi";
36881 case X86::R11:
36882 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36883 return "__llvm_retpoline_r11";
36884 }
36885 llvm_unreachable("unexpected reg for retpoline");
36886 }
36887
36888 if (Subtarget.useLVIControlFlowIntegrity()) {
36889 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36890 return "__llvm_lvi_thunk_r11";
36891 }
36892 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
36893}
36894
36895 MachineBasicBlock *
36896 X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36897 MachineBasicBlock *BB) const {
36898 // Copy the virtual register into the R11 physical register and
36899 // call the retpoline thunk.
36900 const MIMetadata MIMD(MI);
36901 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36902 Register CalleeVReg = MI.getOperand(0).getReg();
36903 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36904
36905 // Find an available scratch register to hold the callee. On 64-bit, we can
36906 // just use R11, but we scan for uses anyway to ensure we don't generate
36907 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36908 // already a register use operand to the call to hold the callee. If none
36909 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36910 // register and ESI is the base pointer to realigned stack frames with VLAs.
36911 SmallVector<Register, 3> AvailableRegs;
36912 if (Subtarget.is64Bit())
36913 AvailableRegs.push_back(X86::R11);
36914 else
36915 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36916
36917 // Zero out any registers that are already used.
36918 for (const auto &MO : MI.operands()) {
36919 if (MO.isReg() && MO.isUse())
36920 llvm::replace(AvailableRegs, MO.getReg(), Register());
36921 }
36922
36923 // Choose the first remaining non-zero available register.
36924 Register AvailableReg;
36925 for (Register MaybeReg : AvailableRegs) {
36926 if (MaybeReg) {
36927 AvailableReg = MaybeReg;
36928 break;
36929 }
36930 }
36931 if (!AvailableReg)
36932 report_fatal_error("calling convention incompatible with retpoline, no "
36933 "available registers");
36934
36935 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36936
36937 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
36938 .addReg(CalleeVReg);
36939 MI.getOperand(0).ChangeToES(Symbol);
36940 MI.setDesc(TII->get(Opc));
36941 MachineInstrBuilder(*BB->getParent(), &MI)
36942 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36943 return BB;
36944}
36945
36946/// SetJmp implies future control flow change upon calling the corresponding
36947/// LongJmp.
36948/// Instead of using the 'return' instruction, the long jump fixes the stack and
36949/// performs an indirect branch. To do so it uses the registers that were stored
36950/// in the jump buffer (when calling SetJmp).
36951/// In case the shadow stack is enabled we need to fix it as well, because some
36952/// return addresses will be skipped.
36953/// The function will save the SSP for future fixing in the function
36954/// emitLongJmpShadowStackFix.
36955/// \sa emitLongJmpShadowStackFix
36956/// \param [in] MI The temporary Machine Instruction for the builtin.
36957/// \param [in] MBB The Machine Basic Block that will be modified.
36958void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36959 MachineBasicBlock *MBB) const {
36960 const MIMetadata MIMD(MI);
36961 MachineFunction *MF = MBB->getParent();
36962 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36963 MachineRegisterInfo &MRI = MF->getRegInfo();
36964 MachineInstrBuilder MIB;
36965
36966 // Memory Reference.
36967 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
36968
36969 // Initialize a register with zero.
36970 MVT PVT = getPointerTy(MF->getDataLayout());
36971 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36972 Register ZReg = MRI.createVirtualRegister(PtrRC);
36973 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36974 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
36975 .addDef(ZReg)
36976 .addReg(ZReg, RegState::Undef)
36977 .addReg(ZReg, RegState::Undef);
36978
36979 // Read the current SSP Register value to the zeroed register.
36980 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36981 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36982 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36983
36984 // Write the SSP register value to offset 3 in input memory buffer.
36985 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36986 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
36987 const int64_t SSPOffset = 3 * PVT.getStoreSize();
36988 const unsigned MemOpndSlot = 1;
36989 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36990 if (i == X86::AddrDisp)
36991 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
36992 else
36993 MIB.add(MI.getOperand(MemOpndSlot + i));
36994 }
36995 MIB.addReg(SSPCopyReg);
36996 MIB.setMemRefs(MMOs);
36997}
36998
36999 MachineBasicBlock *
37000 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37001 MachineBasicBlock *MBB) const {
37002 const MIMetadata MIMD(MI);
37003 MachineFunction *MF = MBB->getParent();
37004 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37005 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37006 MachineRegisterInfo &MRI = MF->getRegInfo();
37007
37008 const BasicBlock *BB = MBB->getBasicBlock();
37009 MachineFunction::iterator I = ++MBB->getIterator();
37010
37011 // Memory Reference
37012 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37013
37014 unsigned MemOpndSlot = 0;
37015
37016 unsigned CurOp = 0;
37017
37018 Register DstReg = MI.getOperand(CurOp++).getReg();
37019 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37020 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37021 (void)TRI;
37022 Register mainDstReg = MRI.createVirtualRegister(RC);
37023 Register restoreDstReg = MRI.createVirtualRegister(RC);
37024
37025 MemOpndSlot = CurOp;
37026
37027 MVT PVT = getPointerTy(MF->getDataLayout());
37028 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37029 "Invalid Pointer Size!");
37030
37031 // For v = setjmp(buf), we generate
37032 //
37033 // thisMBB:
37034 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37035 // SjLjSetup restoreMBB
37036 //
37037 // mainMBB:
37038 // v_main = 0
37039 //
37040 // sinkMBB:
37041 // v = phi(main, restore)
37042 //
37043 // restoreMBB:
37044 // if base pointer being used, load it from frame
37045 // v_restore = 1
37046
37047 MachineBasicBlock *thisMBB = MBB;
37048 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37049 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37050 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37051 MF->insert(I, mainMBB);
37052 MF->insert(I, sinkMBB);
37053 MF->push_back(restoreMBB);
37054 restoreMBB->setMachineBlockAddressTaken();
37055
37056 MachineInstrBuilder MIB;
37057
37058 // Transfer the remainder of BB and its successor edges to sinkMBB.
37059 sinkMBB->splice(sinkMBB->begin(), MBB,
37060 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37061 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37062
37063 // thisMBB:
37064 unsigned PtrStoreOpc = 0;
37065 Register LabelReg;
37066 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37067 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37068 !isPositionIndependent();
37069
37070 // Prepare IP either in reg or imm.
37071 if (!UseImmLabel) {
37072 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37073 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37074 LabelReg = MRI.createVirtualRegister(PtrRC);
37075 if (Subtarget.is64Bit()) {
37076 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
37077 .addReg(X86::RIP)
37078 .addImm(0)
37079 .addReg(0)
37080 .addMBB(restoreMBB)
37081 .addReg(0);
37082 } else {
37083 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37084 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
37085 .addReg(XII->getGlobalBaseReg(MF))
37086 .addImm(0)
37087 .addReg(0)
37088 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37089 .addReg(0);
37090 }
37091 } else
37092 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37093 // Store IP
37094 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
37095 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37096 if (i == X86::AddrDisp)
37097 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37098 else
37099 MIB.add(MI.getOperand(MemOpndSlot + i));
37100 }
37101 if (!UseImmLabel)
37102 MIB.addReg(LabelReg);
37103 else
37104 MIB.addMBB(restoreMBB);
37105 MIB.setMemRefs(MMOs);
37106
37107 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37108 emitSetJmpShadowStackFix(MI, thisMBB);
37109 }
37110
37111 // Setup
37112 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
37113 .addMBB(restoreMBB);
37114
37115 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37116 MIB.addRegMask(RegInfo->getNoPreservedMask());
37117 thisMBB->addSuccessor(mainMBB);
37118 thisMBB->addSuccessor(restoreMBB);
37119
37120 // mainMBB:
37121 // EAX = 0
37122 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
37123 mainMBB->addSuccessor(sinkMBB);
37124
37125 // sinkMBB:
37126 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
37127 .addReg(mainDstReg)
37128 .addMBB(mainMBB)
37129 .addReg(restoreDstReg)
37130 .addMBB(restoreMBB);
37131
37132 // restoreMBB:
37133 if (RegInfo->hasBasePointer(*MF)) {
37134 const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64();
37135 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37136 X86FI->setRestoreBasePointer(MF);
37137 Register FramePtr = RegInfo->getFrameRegister(*MF);
37138 Register BasePtr = RegInfo->getBaseRegister();
37139 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37140 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
37141 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37142 .setMIFlag(MachineInstr::FrameSetup);
37143 }
37144 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37145 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37146 restoreMBB->addSuccessor(sinkMBB);
37147
37148 MI.eraseFromParent();
37149 return sinkMBB;
37150}
37151
37152/// Fix the shadow stack using the previously saved SSP pointer.
37153/// \sa emitSetJmpShadowStackFix
37154/// \param [in] MI The temporary Machine Instruction for the builtin.
37155/// \param [in] MBB The Machine Basic Block that will be modified.
37156/// \return The sink MBB that will perform the future indirect branch.
37157 MachineBasicBlock *
37158 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37159 MachineBasicBlock *MBB) const {
37160 const MIMetadata MIMD(MI);
37161 MachineFunction *MF = MBB->getParent();
37162 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37163 MachineRegisterInfo &MRI = MF->getRegInfo();
37164
37165 // Memory Reference
37166 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37167
37168 MVT PVT = getPointerTy(MF->getDataLayout());
37169 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37170
37171 // checkSspMBB:
37172 // xor vreg1, vreg1
37173 // rdssp vreg1
37174 // test vreg1, vreg1
37175 // je sinkMBB # Jump if Shadow Stack is not supported
37176 // fallMBB:
37177 // mov buf+24/12(%rip), vreg2
37178 // sub vreg1, vreg2
37179 // jbe sinkMBB # No need to fix the Shadow Stack
37180 // fixShadowMBB:
37181 // shr 3/2, vreg2
37182 // incssp vreg2 # fix the SSP according to the lower 8 bits
37183 // shr 8, vreg2
37184 // je sinkMBB
37185 // fixShadowLoopPrepareMBB:
37186 // shl vreg2
37187 // mov 128, vreg3
37188 // fixShadowLoopMBB:
37189 // incssp vreg3
37190 // dec vreg2
37191 // jne fixShadowLoopMBB # Iterate until you finish fixing
37192 // # the Shadow Stack
37193 // sinkMBB:
37194
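// Worked example of the arithmetic sketched above, for PVT == i64
// (illustrative values only): a shadow-stack delta of 0x12340 bytes is 0x2468
// eight-byte slots after the 'shr 3'. The first incssp consumes the low 8
// bits (0x68 slots); the remaining 0x24 * 256 slots are covered by
// 0x24 * 2 loop iterations of 'incssp 128'.
static_assert((0x12340u >> 3) == 0x2468u &&
                  (0x2468u & 0xFFu) + ((0x2468u >> 8) << 1) * 128u == 0x2468u,
              "incssp slot accounting for a 0x12340-byte delta");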
37195 MachineFunction::iterator I = ++MBB->getIterator();
37196 const BasicBlock *BB = MBB->getBasicBlock();
37197
37198 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37199 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37200 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37201 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37202 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37203 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37204 MF->insert(I, checkSspMBB);
37205 MF->insert(I, fallMBB);
37206 MF->insert(I, fixShadowMBB);
37207 MF->insert(I, fixShadowLoopPrepareMBB);
37208 MF->insert(I, fixShadowLoopMBB);
37209 MF->insert(I, sinkMBB);
37210
37211 // Transfer the remainder of BB and its successor edges to sinkMBB.
37212 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37213 MBB->end());
37214 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37215
37216 MBB->addSuccessor(checkSspMBB);
37217
37218 // Initialize a register with zero.
37219 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37220 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
37221
37222 if (PVT == MVT::i64) {
37223 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37224 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37225 .addImm(0)
37226 .addReg(ZReg)
37227 .addImm(X86::sub_32bit);
37228 ZReg = TmpZReg;
37229 }
37230
37231 // Read the current SSP Register value to the zeroed register.
37232 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37233 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37234 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37235
37236 // Check whether the result of the SSP register is zero and jump directly
37237 // to the sink.
37238 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37239 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
37240 .addReg(SSPCopyReg)
37241 .addReg(SSPCopyReg);
37242 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
37243 .addMBB(sinkMBB)
37244 .addImm(X86::COND_E);
37245 checkSspMBB->addSuccessor(sinkMBB);
37246 checkSspMBB->addSuccessor(fallMBB);
37247
37248 // Reload the previously saved SSP register value.
37249 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37250 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37251 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37252 MachineInstrBuilder MIB =
37253 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
37254 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37255 const MachineOperand &MO = MI.getOperand(i);
37256 if (i == X86::AddrDisp)
37257 MIB.addDisp(MO, SPPOffset);
37258 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37259 // preserve kill flags.
37260 MIB.addReg(MO.getReg());
37261 else
37262 MIB.add(MO);
37263 }
37264 MIB.setMemRefs(MMOs);
37265
37266 // Subtract the current SSP from the previous SSP.
37267 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37268 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37269 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
37270 .addReg(PrevSSPReg)
37271 .addReg(SSPCopyReg);
37272
37273 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37274 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
37275 .addMBB(sinkMBB)
37276 .addImm(X86::COND_BE);
37277 fallMBB->addSuccessor(sinkMBB);
37278 fallMBB->addSuccessor(fixShadowMBB);
37279
37280 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37281 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37282 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37283 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37284 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
37285 .addReg(SspSubReg)
37286 .addImm(Offset);
37287
37288 // Increase SSP when looking only at the lower 8 bits of the delta.
37289 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37290 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37291
37292 // Reset the lower 8 bits.
37293 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37294 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
37295 .addReg(SspFirstShrReg)
37296 .addImm(8);
37297
37298 // Jump if the result of the shift is zero.
37299 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
37300 .addMBB(sinkMBB)
37301 .addImm(X86::COND_E);
37302 fixShadowMBB->addSuccessor(sinkMBB);
37303 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37304
37305 // Do a single shift left.
37306 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
37307 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37308 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
37309 .addReg(SspSecondShrReg)
37310 .addImm(1);
37311
37312 // Save the value 128 to a register (will be used next with incssp).
37313 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37314 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37315 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
37316 .addImm(128);
37317 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37318
37319 // Since incssp only looks at the lower 8 bits, we might need to do several
37320 // iterations of incssp until we finish fixing the shadow stack.
37321 Register DecReg = MRI.createVirtualRegister(PtrRC);
37322 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37323 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
37324 .addReg(SspAfterShlReg)
37325 .addMBB(fixShadowLoopPrepareMBB)
37326 .addReg(DecReg)
37327 .addMBB(fixShadowLoopMBB);
37328
37329 // Every iteration we increase the SSP by 128.
37330 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
37331
37332 // Every iteration we decrement the counter by 1.
37333 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37334 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
37335
37336 // Jump if the counter is not zero yet.
37337 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
37338 .addMBB(fixShadowLoopMBB)
37339 .addImm(X86::COND_NE);
37340 fixShadowLoopMBB->addSuccessor(sinkMBB);
37341 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37342
37343 return sinkMBB;
37344}
37345
37346 MachineBasicBlock *
37347 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37348 MachineBasicBlock *MBB) const {
37349 const MIMetadata MIMD(MI);
37350 MachineFunction *MF = MBB->getParent();
37351 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37352 MachineRegisterInfo &MRI = MF->getRegInfo();
37353
37354 // Memory Reference
37355 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
37356
37357 MVT PVT = getPointerTy(MF->getDataLayout());
37358 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37359 "Invalid Pointer Size!");
37360
37361 const TargetRegisterClass *RC =
37362 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37363 Register Tmp = MRI.createVirtualRegister(RC);
37364 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37365 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37366 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37367 Register SP = RegInfo->getStackRegister();
37368
37369 MachineInstrBuilder MIB;
37370
37371 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37372 const int64_t SPOffset = 2 * PVT.getStoreSize();
37373
37374 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37375 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37376
37377 MachineBasicBlock *thisMBB = MBB;
37378
37379 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
37380 if (MF->getFunction().getParent()->getModuleFlag("cf-protection-return")) {
37381 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37382 }
37383
37384 // Reload FP
37385 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
37386 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37387 const MachineOperand &MO = MI.getOperand(i);
37388 if (MO.isReg()) // Don't add the whole operand, we don't want to
37389 // preserve kill flags.
37390 MIB.addReg(MO.getReg());
37391 else
37392 MIB.add(MO);
37393 }
37394 MIB.setMemRefs(MMOs);
37396
37397 // Reload IP
37398 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
37399 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37400 const MachineOperand &MO = MI.getOperand(i);
37401 if (i == X86::AddrDisp)
37402 MIB.addDisp(MO, LabelOffset);
37403 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37404 // preserve kill flags.
37405 MIB.addReg(MO.getReg());
37406 else
37407 MIB.add(MO);
37408 }
37409 MIB.setMemRefs(MMOs);
37410
37411 // Reload SP
37412 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
37413 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37414 if (i == X86::AddrDisp)
37415 MIB.addDisp(MI.getOperand(i), SPOffset);
37416 else
37417 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37418 // the last instruction of the expansion.
37419 }
37420 MIB.setMemRefs(MMOs);
37422
37423 // Jump
37424 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
37425
37426 MI.eraseFromParent();
37427 return thisMBB;
37428}
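// Editor's sketch - illustrative only, not part of X86ISelLowering.cpp. The
// offsets used in emitEHSjLjLongJmp imply a three-slot buffer whose slot width
// is PVT.getStoreSize() (4 or 8 bytes); the struct below just names the slots.
struct SjLjBufferLayoutSketch {
  void *FramePointer;   // slot 0: reloaded into RBP/EBP
  void *RestartAddress; // slot 1 (LabelOffset): reloaded into Tmp
  void *StackPointer;   // slot 2 (SPOffset): reloaded into RSP/ESP
};
// The expansion reloads FP, then the restart address, then SP, and finally
// performs an indirect jump through Tmp (JMP64r / JMP32r).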
37429
37430void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37431 MachineBasicBlock *MBB,
37432 MachineBasicBlock *DispatchBB,
37433 int FI) const {
37434 const MIMetadata MIMD(MI);
37435 MachineFunction *MF = MBB->getParent();
37436 MachineRegisterInfo *MRI = &MF->getRegInfo();
37437 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37438
37439 MVT PVT = getPointerTy(MF->getDataLayout());
37440 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37441
37442 unsigned Op = 0;
37443 Register VR;
37444
37445 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37446 !isPositionIndependent();
37447
37448 if (UseImmLabel) {
37449 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37450 } else {
37451 const TargetRegisterClass *TRC =
37452 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37453 VR = MRI->createVirtualRegister(TRC);
37454 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37455
37456 if (Subtarget.is64Bit())
37457 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
37458 .addReg(X86::RIP)
37459 .addImm(1)
37460 .addReg(0)
37461 .addMBB(DispatchBB)
37462 .addReg(0);
37463 else
37464 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
37465 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37466 .addImm(1)
37467 .addReg(0)
37468 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37469 .addReg(0);
37470 }
37471
37472 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
37473 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37474 if (UseImmLabel)
37475 MIB.addMBB(DispatchBB);
37476 else
37477 MIB.addReg(VR);
37478}
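// Editor's sketch - illustrative only, not part of X86ISelLowering.cpp.
// SetupEntryBlockForSjLj stores the address of DispatchBB into the SjLj
// function context at a fixed offset, either as an immediate or through a
// register produced by LEA. The hypothetical helpers below restate those two
// decisions in plain C++; the position-independence half of the second one is
// an assumption made for the sketch.
[[maybe_unused]] static unsigned sjljDispatchSlotOffsetSketch(bool Is64Bit) {
  return Is64Bit ? 56 : 36; // the addFrameReference(MIB, FI, ...) offset above
}
[[maybe_unused]] static bool sjljUseImmediateLabelSketch(bool IsSmallCodeModel,
                                                         bool IsPIC) {
  return IsSmallCodeModel && !IsPIC; // mirrors the UseImmLabel check above
}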
37479
37480MachineBasicBlock *
37481X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37482 MachineBasicBlock *BB) const {
37483 const MIMetadata MIMD(MI);
37484 MachineFunction *MF = BB->getParent();
37485 MachineRegisterInfo *MRI = &MF->getRegInfo();
37486 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37487 int FI = MF->getFrameInfo().getFunctionContextIndex();
37488
37489 // Get a mapping of the call site numbers to all of the landing pads they're
37490 // associated with.
37491 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37492 unsigned MaxCSNum = 0;
37493 for (auto &MBB : *MF) {
37494 if (!MBB.isEHPad())
37495 continue;
37496
37497 MCSymbol *Sym = nullptr;
37498 for (const auto &MI : MBB) {
37499 if (MI.isDebugInstr())
37500 continue;
37501
37502 assert(MI.isEHLabel() && "expected EH_LABEL");
37503 Sym = MI.getOperand(0).getMCSymbol();
37504 break;
37505 }
37506
37507 if (!MF->hasCallSiteLandingPad(Sym))
37508 continue;
37509
37510 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37511 CallSiteNumToLPad[CSI].push_back(&MBB);
37512 MaxCSNum = std::max(MaxCSNum, CSI);
37513 }
37514 }
37515
37516 // Get an ordered list of the machine basic blocks for the jump table.
37517 std::vector<MachineBasicBlock *> LPadList;
37518 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37519 LPadList.reserve(CallSiteNumToLPad.size());
37520
37521 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37522 for (auto &LP : CallSiteNumToLPad[CSI]) {
37523 LPadList.push_back(LP);
37524 InvokeBBs.insert_range(LP->predecessors());
37525 }
37526 }
37527
37528 assert(!LPadList.empty() &&
37529 "No landing pad destinations for the dispatch jump table!");
37530
37531 // Create the MBBs for the dispatch code.
37532
37533 // Shove the dispatch's address into the return slot in the function context.
37534 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37535 DispatchBB->setIsEHPad(true);
37536
37537 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37538 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
37539 DispatchBB->addSuccessor(TrapBB);
37540
37541 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37542 DispatchBB->addSuccessor(DispContBB);
37543
37544 // Insert MBBs.
37545 MF->push_back(DispatchBB);
37546 MF->push_back(DispContBB);
37547 MF->push_back(TrapBB);
37548
37549 // Insert code into the entry block that creates and registers the function
37550 // context.
37551 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37552
37553 // Create the jump table and associated information
37554 unsigned JTE = getJumpTableEncoding();
37555 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37556 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37557
37558 const X86RegisterInfo &RI = TII->getRegisterInfo();
37559 // Add a register mask with no preserved registers. This results in all
37560 // registers being marked as clobbered.
37561 if (RI.hasBasePointer(*MF)) {
37562 const bool FPIs64Bit = Subtarget.isTarget64BitLP64();
37563 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37564 MFI->setRestoreBasePointer(MF);
37565
37566 Register FP = RI.getFrameRegister(*MF);
37567 Register BP = RI.getBaseRegister();
37568 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37569 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
37570 MFI->getRestoreBasePointerOffset())
37571 .addRegMask(RI.getNoPreservedMask());
37572 } else {
37573 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
37574 .addRegMask(RI.getNoPreservedMask());
37575 }
37576
37577 // IReg is used as an index in a memory operand and therefore can't be SP
37578 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37579 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
37580 Subtarget.is64Bit() ? 8 : 4);
37581 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
37582 .addReg(IReg)
37583 .addImm(LPadList.size());
37584 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
37585 .addMBB(TrapBB)
37586 .addImm(X86::COND_AE);
37587
37588 if (Subtarget.is64Bit()) {
37589 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37590 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37591
37592 // leaq .LJTI0_0(%rip), BReg
37593 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
37594 .addReg(X86::RIP)
37595 .addImm(1)
37596 .addReg(0)
37597 .addJumpTableIndex(MJTI)
37598 .addReg(0);
37599 // movzx IReg64, IReg
37600 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37601 .addImm(0)
37602 .addReg(IReg)
37603 .addImm(X86::sub_32bit);
37604
37605 switch (JTE) {
37606 case MachineJumpTableInfo::EK_BlockAddress:
37607 // jmpq *(BReg,IReg64,8)
37608 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
37609 .addReg(BReg)
37610 .addImm(8)
37611 .addReg(IReg64)
37612 .addImm(0)
37613 .addReg(0);
37614 break;
37615 case MachineJumpTableInfo::EK_LabelDifference32: {
37616 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37617 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37618 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37619
37620 // movl (BReg,IReg64,4), OReg
37621 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
37622 .addReg(BReg)
37623 .addImm(4)
37624 .addReg(IReg64)
37625 .addImm(0)
37626 .addReg(0);
37627 // movsx OReg64, OReg
37628 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
37629 .addReg(OReg);
37630 // addq BReg, OReg64, TReg
37631 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
37632 .addReg(OReg64)
37633 .addReg(BReg);
37634 // jmpq *TReg
37635 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
37636 break;
37637 }
37638 default:
37639 llvm_unreachable("Unexpected jump table encoding");
37640 }
37641 } else {
37642 // jmpl *.LJTI0_0(,IReg,4)
37643 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
37644 .addReg(0)
37645 .addImm(4)
37646 .addReg(IReg)
37647 .addJumpTableIndex(MJTI)
37648 .addReg(0);
37649 }
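// Editor's note (illustrative sketch, kept in a comment because it sits inside
// a function body): the dispatch code built above boils down to an unsigned
// bounds check against the number of landing pads followed by an indexed jump
// through the jump table, roughly:
//
//   static void dispatchSketch(unsigned CallSiteIdx, unsigned NumLPads,
//                              void (**JumpTable)(), void (*Trap)()) {
//     if (CallSiteIdx >= NumLPads) // CMP32ri + conditional jump to TrapBB
//       Trap();                    // X86::TRAP
//     else
//       JumpTable[CallSiteIdx]();  // JMP32m / JMP64m, or the LEA path
//   }
//
// The 64-bit variants only differ in how a table entry becomes a target
// address: an absolute block address, or a 32-bit entry that is sign-extended
// and added to the table base (the movl / movsx / addq sequence above).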
37650
37651 // Add the jump table entries as successors to the MBB.
37652 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37653 for (auto &LP : LPadList)
37654 if (SeenMBBs.insert(LP).second)
37655 DispContBB->addSuccessor(LP);
37656
37657 // N.B. the order the invoke BBs are processed in doesn't matter here.
37658 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37659 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37660 for (MachineBasicBlock *MBB : InvokeBBs) {
37661 // Remove the landing pad successor from the invoke block and replace it
37662 // with the new dispatch block.
37663 // Keep a copy of Successors since it's modified inside the loop.
37664 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37665 MBB->succ_rend());
37666 // FIXME: Avoid quadratic complexity.
37667 for (auto *MBBS : Successors) {
37668 if (MBBS->isEHPad()) {
37669 MBB->removeSuccessor(MBBS);
37670 MBBLPads.push_back(MBBS);
37671 }
37672 }
37673
37674 MBB->addSuccessor(DispatchBB);
37675
37676 // Find the invoke call and mark all of the callee-saved registers as
37677 // 'implicitly defined' so that they're spilled. This prevents code from
37678 // moving instructions to before the EH block, where they will never be
37679 // executed.
37680 for (auto &II : reverse(*MBB)) {
37681 if (!II.isCall())
37682 continue;
37683
37684 DenseSet<Register> DefRegs;
37685 for (auto &MOp : II.operands())
37686 if (MOp.isReg())
37687 DefRegs.insert(MOp.getReg());
37688
37689 MachineInstrBuilder MIB(*MF, &II);
37690 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37691 Register Reg = SavedRegs[RegIdx];
37692 if (!DefRegs.contains(Reg))
37693 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37694 }
37695
37696 break;
37697 }
37698 }
37699
37700 // Mark all former landing pads as non-landing pads. The dispatch is the only
37701 // landing pad now.
37702 for (auto &LP : MBBLPads)
37703 LP->setIsEHPad(false);
37704
37705 // The instruction is gone now.
37706 MI.eraseFromParent();
37707 return BB;
37708}
37709
37710MachineBasicBlock *
37711X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
37712 MachineBasicBlock *BB) const {
37713 // Wrap patchable event calls in CALLSEQ_START/CALLSEQ_END, as tracing
37714 // calls may require proper stack alignment.
37715 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
37716 const MIMetadata MIMD(MI);
37717 MachineFunction &MF = *BB->getParent();
37718
37719 // Emit CALLSEQ_START right before the instruction.
37720 MF.getFrameInfo().setAdjustsStack(true);
37721 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
37722 MachineInstrBuilder CallseqStart =
37723 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
37724 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
37725
37726 // Emit CALLSEQ_END right after the instruction.
37727 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
37728 MachineInstrBuilder CallseqEnd =
37729 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
37730 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
37731
37732 return BB;
37733}
37734
37735MachineBasicBlock *
37736X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37737 MachineBasicBlock *BB) const {
37738 MachineFunction *MF = BB->getParent();
37739 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37740 const MIMetadata MIMD(MI);
37741
37742 auto TMMImmToTMMReg = [](unsigned Imm) {
37743 assert (Imm < 8 && "Illegal tmm index");
37744 return X86::TMM0 + Imm;
37745 };
37746 auto TMMImmToTMMPair = [](unsigned Imm) {
37747 assert(Imm < 8 && "Illegal tmm pair index.");
37748 return X86::TMM0_TMM1 + Imm / 2;
37749 };
37750 switch (MI.getOpcode()) {
37751 default:
37752 llvm_unreachable("Unexpected instr type to insert");
37753 case X86::INDIRECT_THUNK_CALL32:
37754 case X86::INDIRECT_THUNK_CALL64:
37755 case X86::INDIRECT_THUNK_TCRETURN32:
37756 case X86::INDIRECT_THUNK_TCRETURN64:
37757 return EmitLoweredIndirectThunk(MI, BB);
37758 case X86::CATCHRET:
37759 return EmitLoweredCatchRet(MI, BB);
37760 case X86::SEG_ALLOCA_32:
37761 case X86::SEG_ALLOCA_64:
37762 return EmitLoweredSegAlloca(MI, BB);
37763 case X86::PROBED_ALLOCA_32:
37764 case X86::PROBED_ALLOCA_64:
37765 return EmitLoweredProbedAlloca(MI, BB);
37766 case X86::TLSCall_32:
37767 case X86::TLSCall_64:
37768 return EmitLoweredTLSCall(MI, BB);
37769 case X86::CMOV_FR16:
37770 case X86::CMOV_FR16X:
37771 case X86::CMOV_FR32:
37772 case X86::CMOV_FR32X:
37773 case X86::CMOV_FR64:
37774 case X86::CMOV_FR64X:
37775 case X86::CMOV_GR8:
37776 case X86::CMOV_GR16:
37777 case X86::CMOV_GR32:
37778 case X86::CMOV_RFP32:
37779 case X86::CMOV_RFP64:
37780 case X86::CMOV_RFP80:
37781 case X86::CMOV_VR64:
37782 case X86::CMOV_VR128:
37783 case X86::CMOV_VR128X:
37784 case X86::CMOV_VR256:
37785 case X86::CMOV_VR256X:
37786 case X86::CMOV_VR512:
37787 case X86::CMOV_VK1:
37788 case X86::CMOV_VK2:
37789 case X86::CMOV_VK4:
37790 case X86::CMOV_VK8:
37791 case X86::CMOV_VK16:
37792 case X86::CMOV_VK32:
37793 case X86::CMOV_VK64:
37794 return EmitLoweredSelect(MI, BB);
37795
37796 case X86::FP80_ADDr:
37797 case X86::FP80_ADDm32: {
37798 // Change the floating point control register to use double extended
37799 // precision when performing the addition.
37800 int OrigCWFrameIdx =
37801 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37802 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37803 OrigCWFrameIdx);
37804
37805 // Load the old value of the control word...
37806 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37807 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37808 OrigCWFrameIdx);
37809
37810 // OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
37811 // precision.
37812 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37813 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37814 .addReg(OldCW, RegState::Kill)
37815 .addImm(0x300);
37816
37817 // Extract to 16 bits.
37818 Register NewCW16 =
37819 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37820 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37821 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37822
37823 // Prepare memory for FLDCW.
37824 int NewCWFrameIdx =
37825 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37826 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37827 NewCWFrameIdx)
37828 .addReg(NewCW16, RegState::Kill);
37829
37830 // Reload the modified control word now...
37831 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37832 NewCWFrameIdx);
37833
37834 // Do the addition.
37835 if (MI.getOpcode() == X86::FP80_ADDr) {
37836 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
37837 .add(MI.getOperand(0))
37838 .add(MI.getOperand(1))
37839 .add(MI.getOperand(2));
37840 } else {
37841 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
37842 .add(MI.getOperand(0))
37843 .add(MI.getOperand(1))
37844 .add(MI.getOperand(2))
37845 .add(MI.getOperand(3))
37846 .add(MI.getOperand(4))
37847 .add(MI.getOperand(5))
37848 .add(MI.getOperand(6));
37849 }
37850
37851 // Reload the original control word now.
37852 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37853 OrigCWFrameIdx);
37854
37855 MI.eraseFromParent(); // The pseudo instruction is gone now.
37856 return BB;
37857 }
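// Editor's note (illustrative sketch, left as a comment): the control-word
// dance above only toggles the x87 precision-control field. In plain C++ the
// constant it builds is simply:
//
//   static unsigned short withExtendedPrecision(unsigned short OldCW) {
//     return (unsigned short)(OldCW | 0x300); // bits 8-9 = 0b11, 80-bit PC
//   }
//
// FNSTCW16m saves the old word, the OR/COPY pair builds the new one, FLDCW16m
// installs it around the ADD_Fp80*, and a final FLDCW16m restores the original.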
37858
37859 case X86::FP32_TO_INT16_IN_MEM:
37860 case X86::FP32_TO_INT32_IN_MEM:
37861 case X86::FP32_TO_INT64_IN_MEM:
37862 case X86::FP64_TO_INT16_IN_MEM:
37863 case X86::FP64_TO_INT32_IN_MEM:
37864 case X86::FP64_TO_INT64_IN_MEM:
37865 case X86::FP80_TO_INT16_IN_MEM:
37866 case X86::FP80_TO_INT32_IN_MEM:
37867 case X86::FP80_TO_INT64_IN_MEM: {
37868 // Change the floating point control register to use "round towards zero"
37869 // mode when truncating to an integer value.
37870 int OrigCWFrameIdx =
37871 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37872 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
37873 OrigCWFrameIdx);
37874
37875 // Load the old value of the control word...
37876 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37877 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
37878 OrigCWFrameIdx);
37879
37880 // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
37881 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37882 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
37883 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37884
37885 // Extract to 16 bits.
37886 Register NewCW16 =
37887 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37888 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
37889 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37890
37891 // Prepare memory for FLDCW.
37892 int NewCWFrameIdx =
37893 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37894 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
37895 NewCWFrameIdx)
37896 .addReg(NewCW16, RegState::Kill);
37897
37898 // Reload the modified control word now...
37899 addFrameReference(BuildMI(*BB, MI, MIMD,
37900 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37901
37902 // Get the X86 opcode to use.
37903 unsigned Opc;
37904 switch (MI.getOpcode()) {
37905 // clang-format off
37906 default: llvm_unreachable("illegal opcode!");
37907 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37908 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37909 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37910 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37911 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37912 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37913 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37914 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37915 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37916 // clang-format on
37917 }
37918
37919 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37920 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
37921 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37922
37923 // Reload the original control word now.
37924 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
37925 OrigCWFrameIdx);
37926
37927 MI.eraseFromParent(); // The pseudo instruction is gone now.
37928 return BB;
37929 }
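// Editor's note (illustrative sketch, left as a comment): the same
// save/modify/restore pattern as the FP80_ADD case, but here the OR targets
// the x87 rounding-control field so the IST_Fp* store truncates toward zero:
//
//   static unsigned short withRoundTowardZero(unsigned short OldCW) {
//     return (unsigned short)(OldCW | 0xC00); // bits 10-11 = 0b11, RC = trunc
//   }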
37930
37931 // xbegin
37932 case X86::XBEGIN:
37933 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37934
37935 case X86::VAARG_64:
37936 case X86::VAARG_X32:
37937 return EmitVAARGWithCustomInserter(MI, BB);
37938
37939 case X86::EH_SjLj_SetJmp32:
37940 case X86::EH_SjLj_SetJmp64:
37941 return emitEHSjLjSetJmp(MI, BB);
37942
37943 case X86::EH_SjLj_LongJmp32:
37944 case X86::EH_SjLj_LongJmp64:
37945 return emitEHSjLjLongJmp(MI, BB);
37946
37947 case X86::Int_eh_sjlj_setup_dispatch:
37948 return EmitSjLjDispatchBlock(MI, BB);
37949
37950 case TargetOpcode::STATEPOINT:
37951 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37952 // this point in the process. We diverge later.
37953 return emitPatchPoint(MI, BB);
37954
37955 case TargetOpcode::STACKMAP:
37956 case TargetOpcode::PATCHPOINT:
37957 return emitPatchPoint(MI, BB);
37958
37959 case TargetOpcode::PATCHABLE_EVENT_CALL:
37960 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37961 return emitPatchableEventCall(MI, BB);
37962
37963 case X86::LCMPXCHG8B: {
37964 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37965 // In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
37966 // requires a memory operand. If the current architecture happens to be
37967 // i686 and the current function needs a base pointer
37968 // - which is ESI for i686 - the register allocator would not be able to
37969 // allocate registers for an address of the form X(%reg, %reg, Y):
37970 // there would never be enough unreserved registers during regalloc
37971 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
37972 // We give the register allocator a hand by precomputing the address in
37973 // a new vreg using LEA.
37974
37975 // If it is not i686 or there is no base pointer - nothing to do here.
37976 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37977 return BB;
37978
37979 // Even though this code does not necessarily need the base pointer to
37980 // be ESI, we check for that. The reason: if this assert fails, some
37981 // changes have happened in the compiler's base pointer handling, which
37982 // most probably have to be addressed somehow here.
37983 assert(TRI->getBaseRegister() == X86::ESI &&
37984 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
37985 "base pointer in mind");
37986
37987 MachineRegisterInfo &MRI = MF->getRegInfo();
37988 MVT SPTy = getPointerTy(MF->getDataLayout());
37989 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
37990 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
37991
37992 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37993 // Regalloc does not need any help when the memory operand of CMPXCHG8B
37994 // does not use index register.
37995 if (AM.IndexReg == X86::NoRegister)
37996 return BB;
37997
37998 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
37999 // four operand definitions that are E[ABCD] registers. We skip them and
38000 // then insert the LEA.
38001 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38002 while (RMBBI != BB->rend() &&
38003 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
38004 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
38005 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
38006 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
38007 ++RMBBI;
38008 }
38009 MachineBasicBlock::iterator MBBI(RMBBI.getReverse());
38010 addFullAddress(
38011 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
38012
38013 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38014
38015 return BB;
38016 }
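// Editor's note (illustrative sketch, left as a comment): what the LEA32r
// inserted above computes, using a hypothetical flattened address:
//
//   struct Addr32Sketch { unsigned Base, Index, Scale, Disp; };
//   static unsigned leaSketch(const Addr32Sketch &AM) {
//     return AM.Base + AM.Index * AM.Scale + AM.Disp;
//   }
//
// Once the full base+scale*index+disp form lives in a single virtual register,
// the CMPXCHG8B memory operand needs only one GPR, which is what keeps the
// i686 + base-pointer register pressure manageable.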
38017 case X86::LCMPXCHG16B_NO_RBX: {
38018 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38019 Register BasePtr = TRI->getBaseRegister();
38020 if (TRI->hasBasePointer(*MF) &&
38021 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38022 if (!BB->isLiveIn(BasePtr))
38023 BB->addLiveIn(BasePtr);
38024 // Save RBX into a virtual register.
38025 Register SaveRBX =
38026 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38027 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38028 .addReg(X86::RBX);
38029 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38030 MachineInstrBuilder MIB =
38031 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38032 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38033 MIB.add(MI.getOperand(Idx));
38034 MIB.add(MI.getOperand(X86::AddrNumOperands));
38035 MIB.addReg(SaveRBX);
38036 } else {
38037 // Simple case, just copy the virtual register to RBX.
38038 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
38039 .add(MI.getOperand(X86::AddrNumOperands));
38040 MachineInstrBuilder MIB =
38041 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
38042 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38043 MIB.add(MI.getOperand(Idx));
38044 }
38045 MI.eraseFromParent();
38046 return BB;
38047 }
38048 case X86::MWAITX: {
38049 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38050 Register BasePtr = TRI->getBaseRegister();
38051 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38052 // If there is no need to save the base pointer, we generate MWAITXrrr;
38053 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38054 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38055 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38056 .addReg(MI.getOperand(0).getReg());
38057 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38058 .addReg(MI.getOperand(1).getReg());
38059 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
38060 .addReg(MI.getOperand(2).getReg());
38061 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
38062 MI.eraseFromParent();
38063 } else {
38064 if (!BB->isLiveIn(BasePtr)) {
38065 BB->addLiveIn(BasePtr);
38066 }
38067 // Parameters can be copied into ECX and EAX but not EBX yet.
38068 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
38069 .addReg(MI.getOperand(0).getReg());
38070 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
38071 .addReg(MI.getOperand(1).getReg());
38072 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38073 // Save RBX into a virtual register.
38074 Register SaveRBX =
38075 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38076 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
38077 .addReg(X86::RBX);
38078 // Generate mwaitx pseudo.
38079 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38080 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
38081 .addDef(Dst) // Destination tied in with SaveRBX.
38082 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38083 .addUse(SaveRBX); // Save of base pointer.
38084 MI.eraseFromParent();
38085 }
38086 return BB;
38087 }
38088 case TargetOpcode::PREALLOCATED_SETUP: {
38089 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38090 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38091 MFI->setHasPreallocatedCall(true);
38092 int64_t PreallocatedId = MI.getOperand(0).getImm();
38093 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38094 assert(StackAdjustment != 0 && "0 stack adjustment");
38095 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38096 << StackAdjustment << "\n");
38097 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
38098 .addReg(X86::ESP)
38099 .addImm(StackAdjustment);
38100 MI.eraseFromParent();
38101 return BB;
38102 }
38103 case TargetOpcode::PREALLOCATED_ARG: {
38104 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38105 int64_t PreallocatedId = MI.getOperand(1).getImm();
38106 int64_t ArgIdx = MI.getOperand(2).getImm();
38107 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38108 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38109 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38110 << ", arg offset " << ArgOffset << "\n");
38111 // stack pointer + offset
38112 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
38113 MI.getOperand(0).getReg()),
38114 X86::ESP, false, ArgOffset);
38115 MI.eraseFromParent();
38116 return BB;
38117 }
38118 case X86::PTDPBSSD:
38119 case X86::PTDPBSUD:
38120 case X86::PTDPBUSD:
38121 case X86::PTDPBUUD:
38122 case X86::PTDPBF16PS:
38123 case X86::PTDPFP16PS:
38124 case X86::PTCMMIMFP16PS:
38125 case X86::PTCMMRLFP16PS:
38126 case X86::PTDPBF8PS:
38127 case X86::PTDPBHF8PS:
38128 case X86::PTDPHBF8PS:
38129 case X86::PTDPHF8PS:
38130 case X86::PTTDPBF16PS:
38131 case X86::PTTDPFP16PS:
38132 case X86::PTTCMMIMFP16PS:
38133 case X86::PTTCMMRLFP16PS:
38134 case X86::PTCONJTCMMIMFP16PS:
38135 case X86::PTMMULTF32PS:
38136 case X86::PTTMMULTF32PS: {
38137 unsigned Opc;
38138 switch (MI.getOpcode()) {
38139 default: llvm_unreachable("illegal opcode!");
38140 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38141 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38142 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38143 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38144 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38145 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38146 case X86::PTCMMIMFP16PS:
38147 Opc = X86::TCMMIMFP16PS;
38148 break;
38149 case X86::PTCMMRLFP16PS:
38150 Opc = X86::TCMMRLFP16PS;
38151 break;
38152 case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
38153 case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
38154 case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
38155 case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
38156 case X86::PTTDPBF16PS:
38157 Opc = X86::TTDPBF16PS;
38158 break;
38159 case X86::PTTDPFP16PS:
38160 Opc = X86::TTDPFP16PS;
38161 break;
38162 case X86::PTTCMMIMFP16PS:
38163 Opc = X86::TTCMMIMFP16PS;
38164 break;
38165 case X86::PTTCMMRLFP16PS:
38166 Opc = X86::TTCMMRLFP16PS;
38167 break;
38168 case X86::PTCONJTCMMIMFP16PS:
38169 Opc = X86::TCONJTCMMIMFP16PS;
38170 break;
38171 case X86::PTMMULTF32PS:
38172 Opc = X86::TMMULTF32PS;
38173 break;
38174 case X86::PTTMMULTF32PS:
38175 Opc = X86::TTMMULTF32PS;
38176 break;
38177 }
38178
38179 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38180 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38181 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38182 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38183 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38184
38185 MI.eraseFromParent(); // The pseudo is gone now.
38186 return BB;
38187 }
38188 case X86::PTILEZERO: {
38189 unsigned Imm = MI.getOperand(0).getImm();
38190 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38191 MI.eraseFromParent(); // The pseudo is gone now.
38192 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38193 MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
38194 return BB;
38195 }
38196 case X86::PTILEZEROV: {
38197 auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
38198 MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
38199 return BB;
38200 }
38201 case X86::PTILELOADDRS:
38202 case X86::PTILELOADDRST1:
38203 case X86::PTILELOADD:
38204 case X86::PTILELOADDT1:
38205 case X86::PTILESTORED: {
38206 unsigned Opc;
38207 switch (MI.getOpcode()) {
38208 default: llvm_unreachable("illegal opcode!");
38209#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38210 case X86::PTILELOADD:
38211 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
38212 break;
38213 case X86::PTILELOADDT1:
38214 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
38215 break;
38216 case X86::PTILESTORED:
38217 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
38218 break;
38219 case X86::PTILELOADDRS:
38220 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRS);
38221 break;
38222 case X86::PTILELOADDRST1:
38223 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDRST1);
38224 break;
38225 }
38226#undef GET_EGPR_IF_ENABLED
38227
38228 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
38229 unsigned CurOp = 0;
38230 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
38231 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38232 RegState::Define);
38233
38234 MIB.add(MI.getOperand(CurOp++)); // base
38235 MIB.add(MI.getOperand(CurOp++)); // scale
38236 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38237 MIB.add(MI.getOperand(CurOp++)); // displacement
38238 MIB.add(MI.getOperand(CurOp++)); // segment
38239
38240 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
38241 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38242 RegState::Undef);
38243
38244 MI.eraseFromParent(); // The pseudo is gone now.
38245 return BB;
38246 }
38247 case X86::PT2RPNTLVWZ0:
38248 case X86::PT2RPNTLVWZ0T1:
38249 case X86::PT2RPNTLVWZ1:
38250 case X86::PT2RPNTLVWZ1T1:
38251 case X86::PT2RPNTLVWZ0RS:
38252 case X86::PT2RPNTLVWZ0RST1:
38253 case X86::PT2RPNTLVWZ1RS:
38254 case X86::PT2RPNTLVWZ1RST1: {
38255 const DebugLoc &DL = MI.getDebugLoc();
38256 unsigned Opc;
38257#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
38258 switch (MI.getOpcode()) {
38259 default:
38260 llvm_unreachable("Unexpected instruction!");
38261 case X86::PT2RPNTLVWZ0:
38262 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0);
38263 break;
38264 case X86::PT2RPNTLVWZ0T1:
38265 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0T1);
38266 break;
38267 case X86::PT2RPNTLVWZ1:
38268 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1);
38269 break;
38270 case X86::PT2RPNTLVWZ1T1:
38271 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1T1);
38272 break;
38273 case X86::PT2RPNTLVWZ0RS:
38274 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RS);
38275 break;
38276 case X86::PT2RPNTLVWZ0RST1:
38277 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ0RST1);
38278 break;
38279 case X86::PT2RPNTLVWZ1RS:
38280 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RS);
38281 break;
38282 case X86::PT2RPNTLVWZ1RST1:
38283 Opc = GET_EGPR_IF_ENABLED(X86::T2RPNTLVWZ1RST1);
38284 break;
38285 }
38286#undef GET_EGPR_IF_ENABLED
38287 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38288 MIB.addReg(TMMImmToTMMPair(MI.getOperand(0).getImm()), RegState::Define);
38289
38290 MIB.add(MI.getOperand(1)); // base
38291 MIB.add(MI.getOperand(2)); // scale
38292 MIB.add(MI.getOperand(3)); // index
38293 MIB.add(MI.getOperand(4)); // displacement
38294 MIB.add(MI.getOperand(5)); // segment
38295 MI.eraseFromParent(); // The pseudo is gone now.
38296 return BB;
38297 }
38298 case X86::PTTRANSPOSED:
38299 case X86::PTCONJTFP16: {
38300 const DebugLoc &DL = MI.getDebugLoc();
38301 unsigned Opc = MI.getOpcode() == X86::PTTRANSPOSED ? X86::TTRANSPOSED
38302 : X86::TCONJTFP16;
38303
38304 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38305 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38306 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38307
38308 MI.eraseFromParent(); // The pseudo is gone now.
38309 return BB;
38310 }
38311 case X86::PTCVTROWPS2BF16Hrri:
38312 case X86::PTCVTROWPS2BF16Lrri:
38313 case X86::PTCVTROWPS2PHHrri:
38314 case X86::PTCVTROWPS2PHLrri:
38315 case X86::PTCVTROWD2PSrri:
38316 case X86::PTILEMOVROWrri: {
38317 const DebugLoc &DL = MI.getDebugLoc();
38318 unsigned Opc;
38319 switch (MI.getOpcode()) {
38320 default:
38321 llvm_unreachable("Unexpected instruction!");
38322 case X86::PTCVTROWD2PSrri:
38323 Opc = X86::TCVTROWD2PSrri;
38324 break;
38325 case X86::PTCVTROWPS2BF16Hrri:
38326 Opc = X86::TCVTROWPS2BF16Hrri;
38327 break;
38328 case X86::PTCVTROWPS2PHHrri:
38329 Opc = X86::TCVTROWPS2PHHrri;
38330 break;
38331 case X86::PTCVTROWPS2BF16Lrri:
38332 Opc = X86::TCVTROWPS2BF16Lrri;
38333 break;
38334 case X86::PTCVTROWPS2PHLrri:
38335 Opc = X86::TCVTROWPS2PHLrri;
38336 break;
38337 case X86::PTILEMOVROWrri:
38338 Opc = X86::TILEMOVROWrri;
38339 break;
38340 }
38341 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38342 MIB.add(MI.getOperand(0));
38343 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38344 MIB.addImm(MI.getOperand(2).getImm());
38345
38346 MI.eraseFromParent(); // The pseudo is gone now.
38347 return BB;
38348 }
38349 case X86::PTCVTROWPS2BF16Hrre:
38350 case X86::PTCVTROWPS2BF16Lrre:
38351 case X86::PTCVTROWPS2PHHrre:
38352 case X86::PTCVTROWPS2PHLrre:
38353 case X86::PTCVTROWD2PSrre:
38354 case X86::PTILEMOVROWrre: {
38355 const DebugLoc &DL = MI.getDebugLoc();
38356 unsigned Opc;
38357 switch (MI.getOpcode()) {
38358 default:
38359 llvm_unreachable("Unexpected instruction!");
38360 case X86::PTCVTROWD2PSrre:
38361 Opc = X86::TCVTROWD2PSrre;
38362 break;
38363 case X86::PTCVTROWPS2BF16Hrre:
38364 Opc = X86::TCVTROWPS2BF16Hrre;
38365 break;
38366 case X86::PTCVTROWPS2BF16Lrre:
38367 Opc = X86::TCVTROWPS2BF16Lrre;
38368 break;
38369 case X86::PTCVTROWPS2PHHrre:
38370 Opc = X86::TCVTROWPS2PHHrre;
38371 break;
38372 case X86::PTCVTROWPS2PHLrre:
38373 Opc = X86::TCVTROWPS2PHLrre;
38374 break;
38375 case X86::PTILEMOVROWrre:
38376 Opc = X86::TILEMOVROWrre;
38377 break;
38378 }
38379 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38380 MIB.add(MI.getOperand(0));
38381 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38382 MIB.add(MI.getOperand(2));
38383
38384 MI.eraseFromParent(); // The pseudo is gone now.
38385 return BB;
38386 }
38387 }
38388}
38389
38390//===----------------------------------------------------------------------===//
38391// X86 Optimization Hooks
38392//===----------------------------------------------------------------------===//
38393
38394bool
38395X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38396 const APInt &DemandedBits,
38397 const APInt &DemandedElts,
38398 TargetLoweringOpt &TLO) const {
38399 EVT VT = Op.getValueType();
38400 unsigned Opcode = Op.getOpcode();
38401 unsigned EltSize = VT.getScalarSizeInBits();
38402
38403 if (VT.isVector()) {
38404 // If the constant is only all sign bits in the active bits, then we should
38405 // extend it to the entire constant to allow it to act as a boolean constant
38406 // vector.
38407 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38408 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38409 return false;
38410 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38411 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38412 continue;
38413 const APInt &Val = V.getConstantOperandAPInt(i);
38414 if (Val.getBitWidth() > Val.getNumSignBits() &&
38415 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38416 return true;
38417 }
38418 return false;
38419 };
38420 // For vectors - if we have a constant, then try to sign extend.
38421 // TODO: Handle AND cases.
38422 unsigned ActiveBits = DemandedBits.getActiveBits();
38423 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38424 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
38425 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38426 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38427 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38428 VT.getVectorNumElements());
38429 SDValue NewC =
38430 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38431 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38432 SDValue NewOp =
38433 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38434 return TLO.CombineTo(Op, NewOp);
38435 }
38436 return false;
38437 }
38438
38439 // Only optimize Ands to prevent shrinking a constant that could be
38440 // matched by movzx.
38441 if (Opcode != ISD::AND)
38442 return false;
38443
38444 // Make sure the RHS really is a constant.
38445 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38446 if (!C)
38447 return false;
38448
38449 const APInt &Mask = C->getAPIntValue();
38450
38451 // Clear all non-demanded bits initially.
38452 APInt ShrunkMask = Mask & DemandedBits;
38453
38454 // Find the width of the shrunk mask.
38455 unsigned Width = ShrunkMask.getActiveBits();
38456
38457 // If the mask is all 0s there's nothing to do here.
38458 if (Width == 0)
38459 return false;
38460
38461 // Find the next power of 2 width, rounding up to a byte.
38462 Width = llvm::bit_ceil(std::max(Width, 8U));
38463 // Truncate the width to the element size to handle illegal types.
38464 Width = std::min(Width, EltSize);
38465
38466 // Calculate a possible zero extend mask for this constant.
38467 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38468
38469 // If we aren't changing the mask, just return true to keep it and prevent
38470 // the caller from optimizing.
38471 if (ZeroExtendMask == Mask)
38472 return true;
38473
38474 // Make sure the new mask can be represented by a combination of mask bits
38475 // and non-demanded bits.
38476 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38477 return false;
38478
38479 // Replace the constant with the zero extend mask.
38480 SDLoc DL(Op);
38481 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38482 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38483 return TLO.CombineTo(Op, NewOp);
38484}
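// Editor's sketch - illustrative only, not part of X86ISelLowering.cpp. This
// restates the scalar AND-mask widening above with a 32-bit element size
// hard-coded (the real code works on APInts of the element width): restrict
// the mask to the demanded bits, round its active width up to a power-of-two
// number of whole bytes, and use the widened low-bits mask only if it changes
// the constant and stays inside Mask | ~DemandedBits.
[[maybe_unused]] static bool widenAndMaskSketch(unsigned Mask,
                                                unsigned DemandedBits,
                                                unsigned &NewMask) {
  unsigned Shrunk = Mask & DemandedBits; // drop the non-demanded bits
  if (Shrunk == 0)
    return false; // nothing to do
  unsigned Width = 0; // active bits of the shrunk mask
  for (unsigned B = Shrunk; B != 0; B >>= 1)
    ++Width;
  if (Width < 8)
    Width = 8; // round up to at least a byte
  unsigned Pow2 = 1; // bit_ceil(Width), never larger than the element size
  while (Pow2 < Width)
    Pow2 <<= 1;
  unsigned ZeroExtendMask = Pow2 >= 32 ? ~0u : (1u << Pow2) - 1;
  if (ZeroExtendMask == Mask)
    return false; // already a zero-extend mask; keep the constant as-is
  if ((ZeroExtendMask & ~(Mask | ~DemandedBits)) != 0)
    return false; // widening would flip bits the caller still demands
  NewMask = ZeroExtendMask; // e.g. Mask=0x1FFF, Demanded=0xFFF -> 0xFFFF
  return true;
}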
38485
38486static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
38487 KnownBits &Known,
38488 const APInt &DemandedElts,
38489 const SelectionDAG &DAG, unsigned Depth) {
38490 KnownBits Known2;
38491 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38492 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38493 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
38494 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
38495 Known = KnownBits::abdu(Known, Known2).zext(16);
38496 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
38497 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38498 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38499 Known = KnownBits::add(Known, Known, /*NSW=*/true, /*NUW=*/true);
38500 Known = Known.zext(64);
38501}
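// Editor's sketch - illustrative only, not part of X86ISelLowering.cpp. Per
// 64-bit lane, PSADBW sums the absolute differences of eight unsigned bytes,
// so the result always fits in 16 bits (at most 8 * 255 = 2040) and the rest
// of the lane is zero - which is what the abdu, zext-to-16, three
// self-additions, zext-to-64 chain above is modelling.
[[maybe_unused]] static unsigned psadbwLaneSketch(const unsigned char *L,
                                                  const unsigned char *R) {
  unsigned Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += L[I] > R[I] ? L[I] - R[I] : R[I] - L[I];
  return Sum; // zero-extended into the i64 lane by the instruction
}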
38502
38503static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS,
38504 KnownBits &Known,
38505 const APInt &DemandedElts,
38506 const SelectionDAG &DAG,
38507 unsigned Depth) {
38508 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38509
38510 // Multiply signed i16 elements to create i32 values and add Lo/Hi pairs.
38511 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38512 APInt DemandedLoElts =
38513 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38514 APInt DemandedHiElts =
38515 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38516 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38517 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38518 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38519 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38520 KnownBits Lo = KnownBits::mul(LHSLo.sext(32), RHSLo.sext(32));
38521 KnownBits Hi = KnownBits::mul(LHSHi.sext(32), RHSHi.sext(32));
38522 Known = KnownBits::add(Lo, Hi, /*NSW=*/false, /*NUW=*/false);
38523}
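// Editor's sketch - illustrative only, not part of X86ISelLowering.cpp. The
// per-lane arithmetic whose known bits are modelled above: VPMADDWD multiplies
// adjacent signed i16 elements into i32 products and adds each lo/hi pair.
[[maybe_unused]] static int pmaddwdLaneSketch(short L0, short R0, short L1,
                                              short R1) {
  return (int)L0 * (int)R0 + (int)L1 * (int)R1;
}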
38524
38525static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS,
38526 KnownBits &Known,
38527 const APInt &DemandedElts,
38528 const SelectionDAG &DAG,
38529 unsigned Depth) {
38530 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
38531
38532 // Multiply unsigned/signed i8 elements to create i16 values and add_sat Lo/Hi
38533 // pairs.
38534 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
38535 APInt DemandedLoElts =
38536 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b01));
38537 APInt DemandedHiElts =
38538 DemandedSrcElts & APInt::getSplat(NumSrcElts, APInt(2, 0b10));
38539 KnownBits LHSLo = DAG.computeKnownBits(LHS, DemandedLoElts, Depth + 1);
38540 KnownBits LHSHi = DAG.computeKnownBits(LHS, DemandedHiElts, Depth + 1);
38541 KnownBits RHSLo = DAG.computeKnownBits(RHS, DemandedLoElts, Depth + 1);
38542 KnownBits RHSHi = DAG.computeKnownBits(RHS, DemandedHiElts, Depth + 1);
38543 KnownBits Lo = KnownBits::mul(LHSLo.zext(16), RHSLo.sext(16));
38544 KnownBits Hi = KnownBits::mul(LHSHi.zext(16), RHSHi.sext(16));
38545 Known = KnownBits::sadd_sat(Lo, Hi);
38546}
38547
38548static KnownBits computeKnownBitsForHorizontalOperation(
38549 const SDValue Op, const APInt &DemandedElts, unsigned Depth,
38550 const SelectionDAG &DAG,
38551 const function_ref<KnownBits(const KnownBits &, const KnownBits &)>
38552 KnownBitsFunc) {
38553 APInt DemandedEltsLHS, DemandedEltsRHS;
38554 getHorizDemandedEltsForFirstOperand(Op.getValueType().getSizeInBits(),
38555 DemandedElts, DemandedEltsLHS,
38556 DemandedEltsRHS);
38557
38558 const auto ComputeForSingleOpFunc =
38559 [&DAG, Depth, KnownBitsFunc](SDValue Op, APInt &DemandedEltsOp) {
38560 return KnownBitsFunc(
38561 DAG.computeKnownBits(Op, DemandedEltsOp, Depth + 1),
38562 DAG.computeKnownBits(Op, DemandedEltsOp << 1, Depth + 1));
38563 };
38564
38565 if (DemandedEltsRHS.isZero())
38566 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS);
38567 if (DemandedEltsLHS.isZero())
38568 return ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS);
38569
38570 return ComputeForSingleOpFunc(Op.getOperand(0), DemandedEltsLHS)
38571 .intersectWith(ComputeForSingleOpFunc(Op.getOperand(1), DemandedEltsRHS));
38572}
38573
38574void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38575 KnownBits &Known,
38576 const APInt &DemandedElts,
38577 const SelectionDAG &DAG,
38578 unsigned Depth) const {
38579 unsigned BitWidth = Known.getBitWidth();
38580 unsigned NumElts = DemandedElts.getBitWidth();
38581 unsigned Opc = Op.getOpcode();
38582 EVT VT = Op.getValueType();
38587 "Should use MaskedValueIsZero if you don't know whether Op"
38588 " is a target node!");
38589
38590 Known.resetAll();
38591 switch (Opc) {
38592 default: break;
38593 case X86ISD::MUL_IMM: {
38594 KnownBits Known2;
38595 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38596 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38597 Known = KnownBits::mul(Known, Known2);
38598 break;
38599 }
38600 case X86ISD::BSF: {
38602
38603 KnownBits Known2;
38604 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38605 if (Known2.isNonZero()) {
38606 // If we have a known 1, its position is our upper bound.
38607 unsigned PossibleTZ = Known2.countMaxTrailingZeros();
38608 unsigned LowBits = llvm::bit_width(PossibleTZ);
38609 Known.Zero.setBitsFrom(LowBits);
38610 } else if (!Op.getOperand(0).isUndef()) {
38611 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38612 Known = Known.intersectWith(Known2);
38613 }
38614 break;
38615 }
38616 case X86ISD::BSR: {
38617 // TODO: Bound with input known bits?
38619
38620 if (!Op.getOperand(0).isUndef() &&
38621 !DAG.isKnownNeverZero(Op.getOperand(1), Depth + 1)) {
38622 KnownBits Known2;
38623 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38624 Known = Known.intersectWith(Known2);
38625 }
38626 break;
38627 }
38628 case X86ISD::SETCC:
38629 Known.Zero.setBitsFrom(1);
38630 break;
38631 case X86ISD::MOVMSK: {
38632 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38633 Known.Zero.setBitsFrom(NumLoBits);
38634 break;
38635 }
38636 case X86ISD::PEXTRB:
38637 case X86ISD::PEXTRW: {
38638 SDValue Src = Op.getOperand(0);
38639 EVT SrcVT = Src.getValueType();
38640 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38641 Op.getConstantOperandVal(1));
38642 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38643 Known = Known.anyextOrTrunc(BitWidth);
38644 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38645 break;
38646 }
38647 case X86ISD::VSRAI:
38648 case X86ISD::VSHLI:
38649 case X86ISD::VSRLI: {
38650 unsigned ShAmt = Op.getConstantOperandVal(1);
38651 if (ShAmt >= VT.getScalarSizeInBits()) {
38652 // Out of range logical bit shifts are guaranteed to be zero.
38653 // Out of range arithmetic bit shifts splat the sign bit.
38654 if (Opc != X86ISD::VSRAI) {
38655 Known.setAllZero();
38656 break;
38657 }
38658
38659 ShAmt = VT.getScalarSizeInBits() - 1;
38660 }
38661
38662 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38663 if (Opc == X86ISD::VSHLI) {
38664 Known <<= ShAmt;
38665 // Low bits are known zero.
38666 Known.Zero.setLowBits(ShAmt);
38667 } else if (Opc == X86ISD::VSRLI) {
38668 Known >>= ShAmt;
38669 // High bits are known zero.
38670 Known.Zero.setHighBits(ShAmt);
38671 } else {
38672 Known.Zero.ashrInPlace(ShAmt);
38673 Known.One.ashrInPlace(ShAmt);
38674 }
38675 break;
38676 }
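// Editor's note (illustrative sketch, left as a comment): the out-of-range
// shift behaviour relied on above, written for a 32-bit element and assuming
// the usual arithmetic right shift on signed int:
//
//   static unsigned vsrliSketch(unsigned Elt, unsigned Amt) {
//     return Amt >= 32 ? 0u : Elt >> Amt;   // logical: all zeros
//   }
//   static int vsraiSketch(int Elt, unsigned Amt) {
//     return Elt >> (Amt >= 32 ? 31 : Amt); // arithmetic: sign splat
//   }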
38677 case X86ISD::PACKUS: {
38678 // PACKUS is just a truncation if the upper half is zero.
38679 APInt DemandedLHS, DemandedRHS;
38680 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38681
38682 Known.One = APInt::getAllOnes(BitWidth * 2);
38683 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38684
38685 KnownBits Known2;
38686 if (!!DemandedLHS) {
38687 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38688 Known = Known.intersectWith(Known2);
38689 }
38690 if (!!DemandedRHS) {
38691 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38692 Known = Known.intersectWith(Known2);
38693 }
38694
38695 if (Known.countMinLeadingZeros() < BitWidth)
38696 Known.resetAll();
38697 Known = Known.trunc(BitWidth);
38698 break;
38699 }
38700 case X86ISD::PSHUFB: {
38701 SDValue Src = Op.getOperand(0);
38702 SDValue Idx = Op.getOperand(1);
38703
38704 // If the index vector is never negative (MSB is zero), then all elements
38705 // come from the source vector. This is useful for cases where
38706 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
38707 // below will handle the more common constant shuffle mask case.
38708 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
38709 if (KnownIdx.isNonNegative())
38710 Known = DAG.computeKnownBits(Src, Depth + 1);
38711 break;
38712 }
38713 case X86ISD::VBROADCAST: {
38714 SDValue Src = Op.getOperand(0);
38715 if (!Src.getSimpleValueType().isVector()) {
38716 Known = DAG.computeKnownBits(Src, Depth + 1);
38717 return;
38718 }
38719 break;
38720 }
38721 case X86ISD::AND: {
38722 if (Op.getResNo() == 0) {
38723 KnownBits Known2;
38724 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38725 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38726 Known &= Known2;
38727 }
38728 break;
38729 }
38730 case X86ISD::ANDNP: {
38731 KnownBits Known2;
38732 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38733 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38734
38735 // ANDNP = (~X & Y);
38736 Known.One &= Known2.Zero;
38737 Known.Zero |= Known2.One;
38738 break;
38739 }
38740 case X86ISD::FOR: {
38741 KnownBits Known2;
38742 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38743 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38744
38745 Known |= Known2;
38746 break;
38747 }
38748 case X86ISD::PSADBW: {
38749 SDValue LHS = Op.getOperand(0);
38750 SDValue RHS = Op.getOperand(1);
38751 assert(VT.getScalarType() == MVT::i64 &&
38752 LHS.getValueType() == RHS.getValueType() &&
38753 LHS.getValueType().getScalarType() == MVT::i8 &&
38754 "Unexpected PSADBW types");
38755 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38756 break;
38757 }
38758 case X86ISD::PCMPGT:
38759 case X86ISD::PCMPEQ: {
38760 KnownBits KnownLhs =
38761 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38762 KnownBits KnownRhs =
38763 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38764 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
38765 ? KnownBits::eq(KnownLhs, KnownRhs)
38766 : KnownBits::sgt(KnownLhs, KnownRhs);
38767 if (Res) {
38768 if (*Res)
38769 Known.setAllOnes();
38770 else
38771 Known.setAllZero();
38772 }
38773 break;
38774 }
38775 case X86ISD::VPMADDWD: {
38776 SDValue LHS = Op.getOperand(0);
38777 SDValue RHS = Op.getOperand(1);
38778 assert(VT.getVectorElementType() == MVT::i32 &&
38779 LHS.getValueType() == RHS.getValueType() &&
38780 LHS.getValueType().getVectorElementType() == MVT::i16 &&
38781 "Unexpected PMADDWD types");
38782 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38783 break;
38784 }
38785 case X86ISD::VPMADDUBSW: {
38786 SDValue LHS = Op.getOperand(0);
38787 SDValue RHS = Op.getOperand(1);
38788 assert(VT.getVectorElementType() == MVT::i16 &&
38789 LHS.getValueType() == RHS.getValueType() &&
38790 LHS.getValueType().getVectorElementType() == MVT::i8 &&
38791 "Unexpected PMADDUBSW types");
38792 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38793 break;
38794 }
38795 case X86ISD::PMULUDQ: {
38796 KnownBits Known2;
38797 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38798 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38799
38800 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38801 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38802 Known = KnownBits::mul(Known, Known2);
38803 break;
38804 }
38805 case X86ISD::CMOV: {
38806 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38807 // If we don't know any bits, early out.
38808 if (Known.isUnknown())
38809 break;
38810 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38811
38812 // Only known if known in both the LHS and RHS.
38813 Known = Known.intersectWith(Known2);
38814 break;
38815 }
38816 case X86ISD::BEXTR:
38817 case X86ISD::BEXTRI: {
38818 SDValue Op0 = Op.getOperand(0);
38819 SDValue Op1 = Op.getOperand(1);
38820
38821 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38822 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38823 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38824
38825 // If the length is 0, the result is 0.
38826 if (Length == 0) {
38827 Known.setAllZero();
38828 break;
38829 }
38830
38831 if ((Shift + Length) <= BitWidth) {
38832 Known = DAG.computeKnownBits(Op0, Depth + 1);
38833 Known = Known.extractBits(Length, Shift);
38834 Known = Known.zextOrTrunc(BitWidth);
38835 }
38836 }
38837 break;
38838 }
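// Editor's note (illustrative sketch, left as a comment): the BEXTR/BEXTRI
// control operand decoded above packs the start position in bits [7:0] and the
// field length in bits [15:8]; the instruction returns that field
// zero-extended:
//
//   static unsigned long long bextrSketch(unsigned long long Src, unsigned Ctl) {
//     unsigned Start = Ctl & 0xFF, Len = (Ctl >> 8) & 0xFF;
//     if (Len == 0 || Start >= 64)
//       return 0; // length 0 always yields 0, matching the early-out above
//     unsigned long long Field = Src >> Start;
//     return Len >= 64 ? Field : Field & ((1ULL << Len) - 1);
//   }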
38839 case X86ISD::PDEP: {
38840 KnownBits Known2;
38841 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38842 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38843 // Zeros are retained from the mask operand. But not ones.
38844 Known.One.clearAllBits();
38845 // The result will have at least as many trailing zeros as the non-mask
38846 // operand since bits can only map to the same or higher bit position.
38847 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38848 break;
38849 }
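// Editor's note (illustrative sketch, left as a comment): PDEP scatters the
// low bits of the source into the positions of the mask's set bits, which is
// why the result keeps the mask's zeros and at least as many trailing zeros
// as the source:
//
//   static unsigned long long pdepSketch(unsigned long long Src,
//                                        unsigned long long Mask) {
//     unsigned long long Result = 0;
//     for (unsigned long long Bit = 1; Mask != 0; Bit <<= 1, Mask >>= 1)
//       if (Mask & 1) {
//         if (Src & 1)
//           Result |= Bit;
//         Src >>= 1;
//       }
//     return Result;
//   }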
38850 case X86ISD::PEXT: {
38851 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38852 // The result has as many leading zeros as the number of zeroes in the mask.
38853 unsigned Count = Known.Zero.popcount();
38854 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38855 Known.One.clearAllBits();
38856 break;
38857 }
38858 case X86ISD::VTRUNC:
38859 case X86ISD::VTRUNCS:
38860 case X86ISD::VTRUNCUS:
38861 case X86ISD::CVTSI2P:
38862 case X86ISD::CVTUI2P:
38863 case X86ISD::CVTP2SI:
38864 case X86ISD::CVTP2UI:
38865 case X86ISD::MCVTP2SI:
38866 case X86ISD::MCVTP2UI:
38867 case X86ISD::CVTTP2SI:
38868 case X86ISD::CVTTP2UI:
38869 case X86ISD::MCVTTP2SI:
38870 case X86ISD::MCVTTP2UI:
38871 case X86ISD::MCVTSI2P:
38872 case X86ISD::MCVTUI2P:
38873 case X86ISD::VFPROUND:
38874 case X86ISD::VMFPROUND:
38875 case X86ISD::CVTPS2PH:
38876 case X86ISD::MCVTPS2PH:
38877 case X86ISD::MCVTTP2SIS:
38878 case X86ISD::MCVTTP2UIS: {
38879 // Truncations/Conversions - upper elements are known zero.
38880 EVT SrcVT = Op.getOperand(0).getValueType();
38881 if (SrcVT.isVector()) {
38882 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38883 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38884 Known.setAllZero();
38885 }
38886 break;
38887 }
38888 case X86ISD::STRICT_CVTTP2SI:
38889 case X86ISD::STRICT_CVTTP2UI:
38890 case X86ISD::STRICT_CVTSI2P:
38891 case X86ISD::STRICT_CVTUI2P:
38892 case X86ISD::STRICT_VFPROUND:
38893 case X86ISD::STRICT_CVTPS2PH: {
38894 // Strict Conversions - upper elements are known zero.
38895 EVT SrcVT = Op.getOperand(1).getValueType();
38896 if (SrcVT.isVector()) {
38897 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38898 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38899 Known.setAllZero();
38900 }
38901 break;
38902 }
38903 case X86ISD::MOVQ2DQ: {
38904 // Move from MMX to XMM. Upper half of XMM should be 0.
38905 if (DemandedElts.countr_zero() >= (NumElts / 2))
38906 Known.setAllZero();
38907 break;
38908 }
38909 case X86ISD::VBROADCAST_LOAD: {
38910 APInt UndefElts;
38911 SmallVector<APInt, 16> EltBits;
38912 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38913 /*AllowWholeUndefs*/ false,
38914 /*AllowPartialUndefs*/ false)) {
38915 Known.Zero.setAllBits();
38916 Known.One.setAllBits();
38917 for (unsigned I = 0; I != NumElts; ++I) {
38918 if (!DemandedElts[I])
38919 continue;
38920 if (UndefElts[I]) {
38921 Known.resetAll();
38922 break;
38923 }
38924 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38925 Known = Known.intersectWith(Known2);
38926 }
38927 return;
38928 }
38929 break;
38930 }
38931 case X86ISD::HADD:
38932 case X86ISD::HSUB: {
38933 Known = computeKnownBitsForHorizontalOperation(
38934 Op, DemandedElts, Depth, DAG,
38935 [Opc](const KnownBits &KnownLHS, const KnownBits &KnownRHS) {
38936 return KnownBits::computeForAddSub(
38937 /*Add=*/Opc == X86ISD::HADD, /*NSW=*/false, /*NUW=*/false,
38938 KnownLHS, KnownRHS);
38939 });
38940 break;
38941 }
38942 case ISD::INTRINSIC_WO_CHAIN: {
38943 switch (Op->getConstantOperandVal(0)) {
38944 case Intrinsic::x86_sse2_pmadd_wd:
38945 case Intrinsic::x86_avx2_pmadd_wd:
38946 case Intrinsic::x86_avx512_pmaddw_d_512: {
38947 SDValue LHS = Op.getOperand(1);
38948 SDValue RHS = Op.getOperand(2);
38949 assert(VT.getScalarType() == MVT::i32 &&
38950 LHS.getValueType() == RHS.getValueType() &&
38951 LHS.getValueType().getScalarType() == MVT::i16 &&
38952 "Unexpected PMADDWD types");
38953 computeKnownBitsForPMADDWD(LHS, RHS, Known, DemandedElts, DAG, Depth);
38954 break;
38955 }
38956 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
38957 case Intrinsic::x86_avx2_pmadd_ub_sw:
38958 case Intrinsic::x86_avx512_pmaddubs_w_512: {
38959 SDValue LHS = Op.getOperand(1);
38960 SDValue RHS = Op.getOperand(2);
38961 assert(VT.getScalarType() == MVT::i16 &&
38962 LHS.getValueType() == RHS.getValueType() &&
38963 LHS.getValueType().getScalarType() == MVT::i8 &&
38964 "Unexpected PMADDUBSW types");
38965 computeKnownBitsForPMADDUBSW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38966 break;
38967 }
38968 case Intrinsic::x86_sse2_psad_bw:
38969 case Intrinsic::x86_avx2_psad_bw:
38970 case Intrinsic::x86_avx512_psad_bw_512: {
38971 SDValue LHS = Op.getOperand(1);
38972 SDValue RHS = Op.getOperand(2);
38973 assert(VT.getScalarType() == MVT::i64 &&
38974 LHS.getValueType() == RHS.getValueType() &&
38975 LHS.getValueType().getScalarType() == MVT::i8 &&
38976 "Unexpected PSADBW types");
38977 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
38978 break;
38979 }
38980 }
38981 break;
38982 }
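  // For illustration of the PMADDWD case above: it computes
  //   R[i] = A[2*i] * B[2*i] + A[2*i+1] * B[2*i+1]
  // as signed i16 multiplies accumulated into i32. If, say, both inputs are
  // known to hold zero-extended bytes (values in [0, 255]), each product is
  // below 2^16 and each pair sum below 2^17, so a known-bits helper can mark
  // bits 17..31 of every result lane as zero. The bounds derived by
  // computeKnownBitsForPMADDWD may be tighter; this is only the intuition
  // behind delegating to that helper.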
38983 case X86ISD::VPMADD52L:
38984 case X86ISD::VPMADD52H: {
38985 assert(Op.getValueType().isVector() &&
38986 Op.getValueType().getScalarType() == MVT::i64 &&
38987 "Unexpected VPMADD52 type");
38988 KnownBits K0 =
38989 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38990 KnownBits K1 =
38991 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38992 KnownBits KAcc =
38993 DAG.computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
38994 K0 = K0.trunc(52);
38995 K1 = K1.trunc(52);
38996 KnownBits KnownMul = (Op.getOpcode() == X86ISD::VPMADD52L)
38997 ? KnownBits::mul(K0, K1)
38998 : KnownBits::mulhu(K0, K1);
38999 KnownMul = KnownMul.zext(64);
39000 Known = KnownBits::add(KAcc, KnownMul);
39001 return;
39002 }
39003 }
39004
39005 // Handle target shuffles.
39006 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39007  if (isTargetShuffle(Opc)) {
39008    SmallVector<int, 64> Mask;
39009    SmallVector<SDValue, 2> Ops;
39010 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39011 unsigned NumOps = Ops.size();
39012 unsigned NumElts = VT.getVectorNumElements();
39013 if (Mask.size() == NumElts) {
39014 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39015 Known.Zero.setAllBits(); Known.One.setAllBits();
39016 for (unsigned i = 0; i != NumElts; ++i) {
39017 if (!DemandedElts[i])
39018 continue;
39019 int M = Mask[i];
39020 if (M == SM_SentinelUndef) {
39021 // For UNDEF elements, we don't know anything about the common state
39022 // of the shuffle result.
39023 Known.resetAll();
39024 break;
39025 }
39026 if (M == SM_SentinelZero) {
39027 Known.One.clearAllBits();
39028 continue;
39029 }
39030 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39031 "Shuffle index out of range");
39032
39033 unsigned OpIdx = (unsigned)M / NumElts;
39034 unsigned EltIdx = (unsigned)M % NumElts;
39035 if (Ops[OpIdx].getValueType() != VT) {
39036 // TODO - handle target shuffle ops with different value types.
39037 Known.resetAll();
39038 break;
39039 }
39040 DemandedOps[OpIdx].setBit(EltIdx);
39041 }
39042 // Known bits are the values that are shared by every demanded element.
39043 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
39044 if (!DemandedOps[i])
39045 continue;
39046 KnownBits Known2 =
39047 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
39048 Known = Known.intersectWith(Known2);
39049 }
39050 }
39051 }
39052 }
39053}
39054
39055 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
39056     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
39057 unsigned Depth) const {
39058 EVT VT = Op.getValueType();
39059 unsigned VTBits = VT.getScalarSizeInBits();
39060 unsigned Opcode = Op.getOpcode();
39061   switch (Opcode) {
39062   case X86ISD::SETCC_CARRY:
39063 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
39064 return VTBits;
39065
39066 case X86ISD::VTRUNC: {
39067 SDValue Src = Op.getOperand(0);
39068 MVT SrcVT = Src.getSimpleValueType();
39069 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
39070 assert(VTBits < NumSrcBits && "Illegal truncation input type");
39071 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
39072 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
39073 if (Tmp > (NumSrcBits - VTBits))
39074 return Tmp - (NumSrcBits - VTBits);
39075 return 1;
39076 }
39077
39078 case X86ISD::PACKSS: {
39079 // PACKSS is just a truncation if the sign bits extend to the packed size.
39080 APInt DemandedLHS, DemandedRHS;
39081 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
39082 DemandedRHS);
39083
39084 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
39085 // patterns often used to compact vXi64 allsignbit patterns.
39086     auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
39087       SDValue BC = peekThroughBitcasts(V);
39088 if (BC.getOpcode() == X86ISD::PACKSS &&
39089 BC.getScalarValueSizeInBits() == 16 &&
39090           V.getScalarValueSizeInBits() == 32) {
39091         SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
39092         SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
39093 if (BC0.getScalarValueSizeInBits() == 64 &&
39094 BC1.getScalarValueSizeInBits() == 64 &&
39095 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
39096 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
39097 return 32;
39098 }
39099 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
39100 };
39101
39102 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
39103 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
39104 if (!!DemandedLHS)
39105 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
39106 if (!!DemandedRHS)
39107 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
39108 unsigned Tmp = std::min(Tmp0, Tmp1);
39109 if (Tmp > (SrcBits - VTBits))
39110 return Tmp - (SrcBits - VTBits);
39111 return 1;
39112 }
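  // For illustration: PACKSSDW packs i32 -> i16 with signed saturation, so if
  // every demanded 32-bit source lane has at least 17 sign bits the pack acts
  // as a plain truncation. E.g. with SrcBits = 32, VTBits = 16 and
  // min(Tmp0, Tmp1) = 20, the result is reported as 20 - (32 - 16) = 4 sign
  // bits per i16 lane.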
39113
39114 case X86ISD::VBROADCAST: {
39115 SDValue Src = Op.getOperand(0);
39116 if (!Src.getSimpleValueType().isVector())
39117 return DAG.ComputeNumSignBits(Src, Depth + 1);
39118 break;
39119 }
39120
39121 case X86ISD::VSHLI: {
39122 SDValue Src = Op.getOperand(0);
39123 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
39124 if (ShiftVal.uge(VTBits))
39125 return VTBits; // Shifted all bits out --> zero.
39126 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39127 if (ShiftVal.uge(Tmp))
39128 return 1; // Shifted all sign bits out --> unknown.
39129 return Tmp - ShiftVal.getZExtValue();
39130 }
39131
39132 case X86ISD::VSRAI: {
39133 SDValue Src = Op.getOperand(0);
39134 APInt ShiftVal = Op.getConstantOperandAPInt(1);
39135 if (ShiftVal.uge(VTBits - 1))
39136 return VTBits; // Sign splat.
39137 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
39138 ShiftVal += Tmp;
39139 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
39140 }
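  // For illustration: an arithmetic right shift only replicates the sign bit,
  // so VSRAI by 5 on a source with 3 known sign bits yields
  // min(3 + 5, VTBits) sign bits, which is the clamped addition above.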
39141
39142 case X86ISD::FSETCC:
39143 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
39144 if (VT == MVT::f32 || VT == MVT::f64 ||
39145 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
39146 return VTBits;
39147 break;
39148
39149 case X86ISD::PCMPGT:
39150 case X86ISD::PCMPEQ:
39151 case X86ISD::CMPP:
39152 case X86ISD::VPCOM:
39153 case X86ISD::VPCOMU:
39154 // Vector compares return zero/all-bits result values.
39155 return VTBits;
39156
39157 case X86ISD::ANDNP: {
39158 unsigned Tmp0 =
39159 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
39160 if (Tmp0 == 1) return 1; // Early out.
39161 unsigned Tmp1 =
39162 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
39163 return std::min(Tmp0, Tmp1);
39164 }
39165
39166 case X86ISD::CMOV: {
39167 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
39168 if (Tmp0 == 1) return 1; // Early out.
39169 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
39170 return std::min(Tmp0, Tmp1);
39171 }
39172 }
39173
39174 // Handle target shuffles.
39175 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
39176   if (isTargetShuffle(Opcode)) {
39177     SmallVector<int, 64> Mask;
39178     SmallVector<SDValue, 2> Ops;
39179 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
39180 unsigned NumOps = Ops.size();
39181 unsigned NumElts = VT.getVectorNumElements();
39182 if (Mask.size() == NumElts) {
39183 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
39184 for (unsigned i = 0; i != NumElts; ++i) {
39185 if (!DemandedElts[i])
39186 continue;
39187 int M = Mask[i];
39188 if (M == SM_SentinelUndef) {
39189 // For UNDEF elements, we don't know anything about the common state
39190 // of the shuffle result.
39191 return 1;
39192 } else if (M == SM_SentinelZero) {
39193 // Zero = all sign bits.
39194 continue;
39195 }
39196 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
39197 "Shuffle index out of range");
39198
39199 unsigned OpIdx = (unsigned)M / NumElts;
39200 unsigned EltIdx = (unsigned)M % NumElts;
39201 if (Ops[OpIdx].getValueType() != VT) {
39202 // TODO - handle target shuffle ops with different value types.
39203 return 1;
39204 }
39205 DemandedOps[OpIdx].setBit(EltIdx);
39206 }
39207 unsigned Tmp0 = VTBits;
39208 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
39209 if (!DemandedOps[i])
39210 continue;
39211 unsigned Tmp1 =
39212 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
39213 Tmp0 = std::min(Tmp0, Tmp1);
39214 }
39215 return Tmp0;
39216 }
39217 }
39218 }
39219
39220 // Fallback case.
39221 return 1;
39222}
39223
39224 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
39225   if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
39226 return N->getOperand(0);
39227 return N;
39228}
39229
39230// Helper to look for a normal load that can be narrowed into a vzload with the
39231// specified VT and memory VT. Returns SDValue() on failure.
39232 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
39233                                   SelectionDAG &DAG) {
39234 // Can't if the load is volatile or atomic.
39235 if (!LN->isSimple())
39236 return SDValue();
39237
39238 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
39239 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
39240 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
39241 LN->getPointerInfo(), LN->getBaseAlign(),
39242 LN->getMemOperand()->getFlags());
39243}
39244
39245// Attempt to match a combined shuffle mask against supported unary shuffle
39246// instructions.
39247// TODO: Investigate sharing more of this with shuffle lowering.
39248static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39249 bool AllowFloatDomain, bool AllowIntDomain,
39250 SDValue V1, const SelectionDAG &DAG,
39251 const X86Subtarget &Subtarget, unsigned &Shuffle,
39252 MVT &SrcVT, MVT &DstVT) {
39253 unsigned NumMaskElts = Mask.size();
39254 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
39255
39256 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
39257 if (Mask[0] == 0 &&
39258 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
39259 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
39260         (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39261          isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
39262 Shuffle = X86ISD::VZEXT_MOVL;
39263 if (MaskEltSize == 16)
39264 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39265 else
39266 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39267 return true;
39268 }
39269 }
39270
39271   // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
39272 if (AllowIntDomain &&
39273 ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
39274 (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
39275 (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
39276 unsigned MaxScale = 64 / MaskEltSize;
39277 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
39278 DAG.ComputeNumSignBits(V1) == MaskEltSize;
39279 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
39280 // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
39281 if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
39282 continue;
39283 bool MatchAny = true;
39284 bool MatchZero = true;
39285 bool MatchSign = UseSign;
39286 unsigned NumDstElts = NumMaskElts / Scale;
39287 for (unsigned i = 0;
39288 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
39289 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
39290 MatchAny = MatchSign = MatchZero = false;
39291 break;
39292 }
39293 unsigned Pos = (i * Scale) + 1;
39294 unsigned Len = Scale - 1;
39295 MatchAny &= isUndefInRange(Mask, Pos, Len);
39296 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
39297 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
39298 }
39299 if (MatchAny || MatchSign || MatchZero) {
39300 assert((MatchSign || MatchZero) &&
39301 "Failed to match sext/zext but matched aext?");
39302 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
39303 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
39304 : MVT::getIntegerVT(MaskEltSize);
39305 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
39306
39307 Shuffle = unsigned(
39308 MatchAny ? ISD::ANY_EXTEND
39309 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
39310 if (SrcVT.getVectorNumElements() != NumDstElts)
39311 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
39312
39313 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
39314 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
39315 return true;
39316 }
39317 }
39318 }
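  // For illustration (Z = SM_SentinelZero): a v8i16 mask {0, Z, 1, Z, 2, Z, 3, Z}
  // hits the MatchZero path with Scale = 2, so the combine selects
  // ISD::ZERO_EXTEND_VECTOR_INREG from a v8i16 source to a v4i32 result,
  // i.e. the PMOVZXWD form of the shuffle.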
39319
39320 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
39321 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
39322 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
39323 isUndefOrEqual(Mask[0], 0) &&
39324 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
39325 Shuffle = X86ISD::VZEXT_MOVL;
39326 if (MaskEltSize == 16)
39327 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
39328 else
39329 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
39330 return true;
39331 }
39332
39333 // Check if we have SSE3 which will let us use MOVDDUP etc. The
39334   // instructions are no slower than UNPCKLPD but have the option to
39335 // fold the input operand into even an unaligned memory load.
39336 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39337 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39338 Shuffle = X86ISD::MOVDDUP;
39339 SrcVT = DstVT = MVT::v2f64;
39340 return true;
39341 }
39342 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39343 Shuffle = X86ISD::MOVSLDUP;
39344 SrcVT = DstVT = MVT::v4f32;
39345 return true;
39346 }
39347 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39348 Shuffle = X86ISD::MOVSHDUP;
39349 SrcVT = DstVT = MVT::v4f32;
39350 return true;
39351 }
39352 }
39353
39354 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39355 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39356 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39357 Shuffle = X86ISD::MOVDDUP;
39358 SrcVT = DstVT = MVT::v4f64;
39359 return true;
39360 }
39361 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39362 V1)) {
39363 Shuffle = X86ISD::MOVSLDUP;
39364 SrcVT = DstVT = MVT::v8f32;
39365 return true;
39366 }
39367 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39368 V1)) {
39369 Shuffle = X86ISD::MOVSHDUP;
39370 SrcVT = DstVT = MVT::v8f32;
39371 return true;
39372 }
39373 }
39374
39375 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39376 assert(Subtarget.hasAVX512() &&
39377 "AVX512 required for 512-bit vector shuffles");
39378 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39379 V1)) {
39380 Shuffle = X86ISD::MOVDDUP;
39381 SrcVT = DstVT = MVT::v8f64;
39382 return true;
39383 }
39384     if (isTargetShuffleEquivalent(
39385             MaskVT, Mask,
39386 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39387 Shuffle = X86ISD::MOVSLDUP;
39388 SrcVT = DstVT = MVT::v16f32;
39389 return true;
39390 }
39391     if (isTargetShuffleEquivalent(
39392             MaskVT, Mask,
39393 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39394 Shuffle = X86ISD::MOVSHDUP;
39395 SrcVT = DstVT = MVT::v16f32;
39396 return true;
39397 }
39398 }
39399
39400 return false;
39401}
39402
39403// Attempt to match a combined shuffle mask against supported unary immediate
39404// permute instructions.
39405// TODO: Investigate sharing more of this with shuffle lowering.
39406 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39407                                      const APInt &Zeroable,
39408 bool AllowFloatDomain, bool AllowIntDomain,
39409 const SelectionDAG &DAG,
39410 const X86Subtarget &Subtarget,
39411 unsigned &Shuffle, MVT &ShuffleVT,
39412 unsigned &PermuteImm) {
39413 unsigned NumMaskElts = Mask.size();
39414 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39415 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39416 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39417 bool ContainsZeros = isAnyZero(Mask);
39418
39419   // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39420 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39421 // Check for lane crossing permutes.
39422 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39423 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39424 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39425 Shuffle = X86ISD::VPERMI;
39426 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39427 PermuteImm = getV4X86ShuffleImm(Mask);
39428 return true;
39429 }
39430 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39431 SmallVector<int, 4> RepeatedMask;
39432 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39433 Shuffle = X86ISD::VPERMI;
39434 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39435 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39436 return true;
39437 }
39438 }
39439 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39440 // VPERMILPD can permute with a non-repeating shuffle.
39441 Shuffle = X86ISD::VPERMILPI;
39442 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39443 PermuteImm = 0;
39444 for (int i = 0, e = Mask.size(); i != e; ++i) {
39445 int M = Mask[i];
39446 if (M == SM_SentinelUndef)
39447 continue;
39448 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39449 PermuteImm |= (M & 1) << i;
39450 }
39451 return true;
39452 }
39453 }
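  // For illustration of the immediate built above: a v4f64 in-lane mask
  // {1, 0, 3, 2} sets bit i of the immediate to (Mask[i] & 1), giving
  // PermuteImm = 0b0101 = 0x5, i.e. VPERMILPD swapping the two doubles within
  // each 128-bit lane.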
39454
39455   // We check for both a shuffle match and a shift match. Loop twice so we can
39456   // control which one we try to match first, depending on target preference.
39457 for (unsigned Order = 0; Order < 2; ++Order) {
39458 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39459 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39460 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39461 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39462 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39463 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39464 SmallVector<int, 4> RepeatedMask;
39465 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39466 // Narrow the repeated mask to create 32-bit element permutes.
39467 SmallVector<int, 4> WordMask = RepeatedMask;
39468 if (MaskScalarSizeInBits == 64)
39469 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39470
39471 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39472 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39473 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39474 PermuteImm = getV4X86ShuffleImm(WordMask);
39475 return true;
39476 }
39477 }
39478
39479 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39480 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39481 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39482 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39483 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39484 SmallVector<int, 4> RepeatedMask;
39485 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39486 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39487 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39488
39489 // PSHUFLW: permute lower 4 elements only.
39490 if (isUndefOrInRange(LoMask, 0, 4) &&
39491 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39492 Shuffle = X86ISD::PSHUFLW;
39493 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39494 PermuteImm = getV4X86ShuffleImm(LoMask);
39495 return true;
39496 }
39497
39498 // PSHUFHW: permute upper 4 elements only.
39499 if (isUndefOrInRange(HiMask, 4, 8) &&
39500 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39501 // Offset the HiMask so that we can create the shuffle immediate.
39502 int OffsetHiMask[4];
39503 for (int i = 0; i != 4; ++i)
39504 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39505
39506 Shuffle = X86ISD::PSHUFHW;
39507 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39508 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39509 return true;
39510 }
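  // For illustration: RepeatedMask = {0, 1, 2, 3, 7, 6, 5, 4} leaves the low
  // half in place and reverses the high half, so the PSHUFHW path applies with
  // OffsetHiMask = {3, 2, 1, 0} and an immediate of
  // 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.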
39511 }
39512 }
39513 } else {
39514 // Attempt to match against bit rotates.
39515 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39516 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39517 Subtarget.hasAVX512())) {
39518 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39519 Subtarget, Mask);
39520 if (0 < RotateAmt) {
39521 Shuffle = X86ISD::VROTLI;
39522 PermuteImm = (unsigned)RotateAmt;
39523 return true;
39524 }
39525 }
39526 }
39527 // Attempt to match against byte/bit shifts.
39528 if (AllowIntDomain &&
39529 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39530 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39531 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39532 int ShiftAmt =
39533 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39534 Zeroable, Subtarget);
39535 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39536 32 <= ShuffleVT.getScalarSizeInBits())) {
39537 // Byte shifts can be slower so only match them on second attempt.
39538 if (Order == 0 &&
39539 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39540 continue;
39541
39542 PermuteImm = (unsigned)ShiftAmt;
39543 return true;
39544 }
39545
39546 }
39547 }
39548
39549 return false;
39550}
39551
39552// Attempt to match a combined unary shuffle mask against supported binary
39553// shuffle instructions.
39554// TODO: Investigate sharing more of this with shuffle lowering.
39555static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39556 bool AllowFloatDomain, bool AllowIntDomain,
39557 SDValue &V1, SDValue &V2, const SDLoc &DL,
39558 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39559 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39560 bool IsUnary) {
39561 unsigned NumMaskElts = Mask.size();
39562 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39563 unsigned SizeInBits = MaskVT.getSizeInBits();
39564
39565 if (MaskVT.is128BitVector()) {
39566 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39567 AllowFloatDomain) {
39568 V2 = V1;
39569 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39570 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39571 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39572 return true;
39573 }
39574 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39575 AllowFloatDomain) {
39576 V2 = V1;
39577 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39578 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39579 return true;
39580 }
39581 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39582 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39583 std::swap(V1, V2);
39584 Shuffle = X86ISD::MOVSD;
39585 SrcVT = DstVT = MVT::v2f64;
39586 return true;
39587 }
39588 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39589 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39590 Shuffle = X86ISD::MOVSS;
39591 SrcVT = DstVT = MVT::v4f32;
39592 return true;
39593 }
39594 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39595 DAG) &&
39596 Subtarget.hasFP16()) {
39597 Shuffle = X86ISD::MOVSH;
39598 SrcVT = DstVT = MVT::v8f16;
39599 return true;
39600 }
39601 }
39602
39603   // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39604 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39605 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39606 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39607 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39608 Subtarget)) {
39609 DstVT = MaskVT;
39610 return true;
39611 }
39612 }
39613 // TODO: Can we handle this inside matchShuffleWithPACK?
39614 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
39615 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
39616 V1.getScalarValueSizeInBits() == 64 &&
39617 V2.getScalarValueSizeInBits() == 64) {
39618     // Use (SSE41) PACKUSDW if the leading zero bits reach down to the lowest 16 bits.
39619 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
39620 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
39621 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
39622 SrcVT = MVT::v4i32;
39623 DstVT = MVT::v8i16;
39624 Shuffle = X86ISD::PACKUS;
39625 return true;
39626 }
39627     // Use PACKUSWB if the leading zero bits reach down to the lowest 8 bits.
39628 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
39629 SrcVT = MVT::v8i16;
39630 DstVT = MVT::v16i8;
39631 Shuffle = X86ISD::PACKUS;
39632 return true;
39633 }
39634     // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
39635 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
39636 SrcVT = MVT::v4i32;
39637 DstVT = MVT::v8i16;
39638 Shuffle = X86ISD::PACKSS;
39639 return true;
39640 }
39641 }
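  // For illustration: with both v2i64 inputs known to have at least 48 leading
  // zero bits, every 64-bit element already fits in 16 bits, so treating the
  // inputs as v4i32 and packing with PACKUSDW (unsigned saturation to i16)
  // cannot clamp any value and simply keeps the wanted low halves.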
39642
39643 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39644 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39645 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39646 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39647 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39648 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
39649 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
39650 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39651 Subtarget)) {
39652 SrcVT = DstVT = MaskVT;
39653 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39654 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39655 return true;
39656 }
39657 }
39658
39659   // Attempt to match against an OR if we're performing a blend shuffle and the
39660   // non-blended source element is zero in each case.
39661   // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
39662 if (SizeInBits == V1.getValueSizeInBits() &&
39663 SizeInBits == V2.getValueSizeInBits() &&
39664 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39665 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39666 bool IsBlend = true;
39667 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39668 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39669 unsigned Scale1 = NumV1Elts / NumMaskElts;
39670 unsigned Scale2 = NumV2Elts / NumMaskElts;
39671 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39672 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39673 for (unsigned i = 0; i != NumMaskElts; ++i) {
39674 int M = Mask[i];
39675 if (M == SM_SentinelUndef)
39676 continue;
39677 if (M == SM_SentinelZero) {
39678 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39679 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39680 continue;
39681 }
39682 if (M == (int)i) {
39683 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39684 continue;
39685 }
39686 if (M == (int)(i + NumMaskElts)) {
39687 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39688 continue;
39689 }
39690 IsBlend = false;
39691 break;
39692 }
39693 if (IsBlend) {
39694 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39695 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39696 Shuffle = ISD::OR;
39697 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39698 return true;
39699 }
39700 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39701 // FIXME: handle mismatched sizes?
39702 // TODO: investigate if `ISD::OR` handling in
39703 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39704 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39705 unsigned NumElts = V.getValueType().getVectorNumElements();
39706 KnownBits Known(NumElts);
39707 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39708 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39709 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39710 if (PeepholeKnown.isZero())
39711 Known.Zero.setBit(EltIdx);
39712 if (PeepholeKnown.isAllOnes())
39713 Known.One.setBit(EltIdx);
39714 }
39715 return Known;
39716 };
39717
39718 KnownBits V1Known = computeKnownBitsElementWise(V1);
39719 KnownBits V2Known = computeKnownBitsElementWise(V2);
39720
39721 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39722 int M = Mask[i];
39723 if (M == SM_SentinelUndef)
39724 continue;
39725 if (M == SM_SentinelZero) {
39726 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39727 continue;
39728 }
39729 if (M == (int)i) {
39730 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39731 continue;
39732 }
39733 if (M == (int)(i + NumMaskElts)) {
39734 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39735 continue;
39736 }
39737 llvm_unreachable("will not get here.");
39738 }
39739 if (IsBlend) {
39740 Shuffle = ISD::OR;
39741 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39742 return true;
39743 }
39744 }
39745 }
39746 }
39747
39748 return false;
39749}
39750
39751 static bool matchBinaryPermuteShuffle(
39752     MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39753 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39754 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39755 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39756 unsigned NumMaskElts = Mask.size();
39757 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39758
39759 // Attempt to match against VALIGND/VALIGNQ rotate.
39760 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39761 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39762 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39763 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39764 MVT AlignVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits),
39765 MaskVT.getSizeInBits() / EltSizeInBits);
39766 if (!isAnyZero(Mask)) {
39767 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39768 if (0 < Rotation) {
39769 Shuffle = X86ISD::VALIGN;
39770 ShuffleVT = AlignVT;
39771 PermuteImm = Rotation;
39772 return true;
39773 }
39774 }
39775 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
39776 unsigned ZeroLo = Zeroable.countr_one();
39777 unsigned ZeroHi = Zeroable.countl_one();
39778 assert((ZeroLo + ZeroHi) < NumMaskElts && "Zeroable shuffle detected");
39779 if (ZeroLo) {
39780 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39781 std::iota(ShiftMask.begin() + ZeroLo, ShiftMask.end(), 0);
39782 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39783 V2 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39784 Shuffle = X86ISD::VALIGN;
39785 ShuffleVT = AlignVT;
39786 PermuteImm = NumMaskElts - ZeroLo;
39787 return true;
39788 }
39789 }
39790 if (ZeroHi) {
39791 SmallVector<int, 16> ShiftMask(NumMaskElts, SM_SentinelZero);
39792 std::iota(ShiftMask.begin(), ShiftMask.begin() + NumMaskElts - ZeroHi,
39793 ZeroHi);
39794 if (isTargetShuffleEquivalent(MaskVT, Mask, ShiftMask, DAG, V1)) {
39795 V2 = V1;
39796 V1 = getZeroVector(AlignVT, Subtarget, DAG, DL);
39797 Shuffle = X86ISD::VALIGN;
39798 ShuffleVT = AlignVT;
39799 PermuteImm = ZeroHi;
39800 return true;
39801 }
39802 }
39803 }
39804
39805 // Attempt to match against PALIGNR byte rotate.
39806 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39807 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39808 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39809 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39810 if (0 < ByteRotation) {
39811 Shuffle = X86ISD::PALIGNR;
39812 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39813 PermuteImm = ByteRotation;
39814 return true;
39815 }
39816 }
39817
39818 // Attempt to combine to X86ISD::BLENDI.
39819 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39820 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39821 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39822 uint64_t BlendMask = 0;
39823 bool ForceV1Zero = false, ForceV2Zero = false;
39824 SmallVector<int, 8> TargetMask(Mask);
39825 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39826 ForceV2Zero, BlendMask)) {
39827 if (MaskVT == MVT::v16i16) {
39828 // We can only use v16i16 PBLENDW if the lanes are repeated.
39829 SmallVector<int, 8> RepeatedMask;
39830 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39831 RepeatedMask)) {
39832 assert(RepeatedMask.size() == 8 &&
39833 "Repeated mask size doesn't match!");
39834 PermuteImm = 0;
39835 for (int i = 0; i < 8; ++i)
39836 if (RepeatedMask[i] >= 8)
39837 PermuteImm |= 1 << i;
39838 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39839 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39840 Shuffle = X86ISD::BLENDI;
39841 ShuffleVT = MaskVT;
39842 return true;
39843 }
39844 } else {
39845 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39846 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39847 PermuteImm = (unsigned)BlendMask;
39848 Shuffle = X86ISD::BLENDI;
39849 ShuffleVT = MaskVT;
39850 return true;
39851 }
39852 }
39853 }
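  // For illustration: a v8i16 blend mask {0, 9, 2, 11, 4, 13, 6, 15} takes the
  // odd elements from V2, so the blend immediate comes out as 0b10101010 = 0xAA
  // (one bit per element, set when that element is sourced from V2).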
39854
39855 // Attempt to combine to INSERTPS, but only if it has elements that need to
39856 // be set to zero.
39857 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39858 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39859 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39860 Shuffle = X86ISD::INSERTPS;
39861 ShuffleVT = MVT::v4f32;
39862 return true;
39863 }
39864
39865 // Attempt to combine to SHUFPD.
39866 if (AllowFloatDomain && EltSizeInBits == 64 &&
39867 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39868 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39869 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39870 bool ForceV1Zero = false, ForceV2Zero = false;
39871 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39872 PermuteImm, Mask, Zeroable)) {
39873 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39874 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39875 Shuffle = X86ISD::SHUFP;
39876 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39877 return true;
39878 }
39879 }
39880
39881 // Attempt to combine to SHUFPS.
39882 if (AllowFloatDomain && EltSizeInBits == 32 &&
39883 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39884 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39885 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39886 SmallVector<int, 4> RepeatedMask;
39887 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39888       // Match each half of the repeated mask to determine whether it's just
39889       // referencing one of the vectors, is zeroable, or is entirely undef.
39890 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39891 int M0 = RepeatedMask[Offset];
39892 int M1 = RepeatedMask[Offset + 1];
39893
39894 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39895 return DAG.getUNDEF(MaskVT);
39896 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39897 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39898 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39899 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39900 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39901 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39902 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39903 return V1;
39904 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39905 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39906 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39907 return V2;
39908 }
39909
39910 return SDValue();
39911 };
39912
39913 int ShufMask[4] = {-1, -1, -1, -1};
39914 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39915 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39916
39917 if (Lo && Hi) {
39918 V1 = Lo;
39919 V2 = Hi;
39920 Shuffle = X86ISD::SHUFP;
39921 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39922 PermuteImm = getV4X86ShuffleImm(ShufMask);
39923 return true;
39924 }
39925 }
39926 }
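  // For illustration: RepeatedMask = {2, 3, 4, 7} resolves to Lo = V1 with
  // selectors {2, 3} and Hi = V2 with selectors {0, 3}, so this emits
  // SHUFPS(V1, V2) with ShufMask = {2, 3, 0, 3} and immediate
  // 2 | (3 << 2) | (0 << 4) | (3 << 6) = 0xCE.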
39927
39928 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39929 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39930 MaskVT.is128BitVector() &&
39931 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39932 Shuffle = X86ISD::INSERTPS;
39933 ShuffleVT = MVT::v4f32;
39934 return true;
39935 }
39936
39937 return false;
39938}
39939
39940 static SDValue combineX86ShuffleChainWithExtract(
39941     ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
39942 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39943 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39944 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39945 const X86Subtarget &Subtarget);
39946
39947/// Combine an arbitrary chain of shuffles into a single instruction if
39948/// possible.
39949///
39950/// This is the leaf of the recursive combine below. When we have found some
39951/// chain of single-use x86 shuffle instructions and accumulated the combined
39952/// shuffle mask represented by them, this will try to pattern match that mask
39953/// into either a single instruction if there is a special purpose instruction
39954/// for this operation, or into a PSHUFB instruction which is a fully general
39955/// instruction but should only be used to replace chains over a certain depth.
39956 static SDValue combineX86ShuffleChain(
39957     ArrayRef<SDValue> Inputs, unsigned RootOpc, MVT RootVT,
39958 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
39959 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
39960 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
39961 const X86Subtarget &Subtarget) {
39962 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39963 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39964 "Unexpected number of shuffle inputs!");
39965 unsigned RootSizeInBits = RootVT.getSizeInBits();
39966 unsigned NumRootElts = RootVT.getVectorNumElements();
39967
39968 // Canonicalize shuffle input op to the requested type.
39969 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39970 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39971 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39972 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39973 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39974 return DAG.getBitcast(VT, Op);
39975 };
39976
39977 // Find the inputs that enter the chain. Note that multiple uses are OK
39978 // here, we're not going to remove the operands we find.
39979 bool UnaryShuffle = (Inputs.size() == 1);
39980 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39981 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39982 : peekThroughBitcasts(Inputs[1]));
39983
39984 MVT VT1 = V1.getSimpleValueType();
39985 MVT VT2 = V2.getSimpleValueType();
39986 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39987 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39988
39989 SDValue Res;
39990
39991 unsigned NumBaseMaskElts = BaseMask.size();
39992 if (NumBaseMaskElts == 1) {
39993 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39994 return CanonicalizeShuffleInput(RootVT, V1);
39995 }
39996
39997 bool OptForSize = DAG.shouldOptForSize();
39998 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39999 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
40000 (RootVT.isFloatingPoint() && Depth >= 1) ||
40001 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
40002
40003 // If we are shuffling a splat (and not introducing zeros) then we can just
40004 // use it directly. This works for smaller elements as well as they already
40005 // repeat across each mask element.
40006 if (UnaryShuffle && !isAnyZero(BaseMask) &&
40007 V1.getValueSizeInBits() >= RootSizeInBits &&
40008 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
40009 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
40010 return CanonicalizeShuffleInput(RootVT, V1);
40011 }
40012
40013 SmallVector<int, 64> Mask(BaseMask);
40014
40015 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
40016 // etc. can be simplified.
40017 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
40018 SmallVector<int> ScaledMask, IdentityMask;
40019 unsigned NumElts = VT1.getVectorNumElements();
40020 if (Mask.size() <= NumElts &&
40021 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
40022 for (unsigned i = 0; i != NumElts; ++i)
40023 IdentityMask.push_back(i);
40024 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
40025 V2))
40026 return CanonicalizeShuffleInput(RootVT, V1);
40027 }
40028 }
40029
40030 // Handle 128/256-bit lane shuffles of 512-bit vectors.
40031 if (RootVT.is512BitVector() &&
40032 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
40033     // If the upper subvectors are zeroable, then an extract+insert is cheaper
40034     // than using X86ISD::SHUF128. The insertion is free, even if it has to
40035     // zero the upper subvectors.
40036 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
40037 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40038 return SDValue(); // Nothing to do!
40039 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
40040 "Unexpected lane shuffle");
40041 Res = CanonicalizeShuffleInput(RootVT, V1);
40042 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
40043 bool UseZero = isAnyZero(Mask);
40044 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
40045 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
40046 }
40047
40048 // Narrow shuffle mask to v4x128.
40049 SmallVector<int, 4> ScaledMask;
40050 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
40051 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
40052
40053 // Try to lower to vshuf64x2/vshuf32x4.
40054 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
40055 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
40056 SelectionDAG &DAG) {
40057 int PermMask[4] = {-1, -1, -1, -1};
40058 // Ensure elements came from the same Op.
40059 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
40060 for (int i = 0; i < 4; ++i) {
40061 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
40062 if (ScaledMask[i] < 0)
40063 continue;
40064
40065 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
40066 unsigned OpIndex = i / 2;
40067 if (Ops[OpIndex].isUndef())
40068 Ops[OpIndex] = Op;
40069 else if (Ops[OpIndex] != Op)
40070 return SDValue();
40071
40072 PermMask[i] = ScaledMask[i] % 4;
40073 }
40074
40075 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
40076 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
40077 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
40078 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
40079 };
40080
40081 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
40082 // doesn't work because our mask is for 128 bits and we don't have an MVT
40083 // to match that.
40084 bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
40085 isUndefOrInRange(ScaledMask[0], 0, 2) &&
40086 isUndefOrInRange(ScaledMask[1], 0, 2) &&
40087 isUndefOrInRange(ScaledMask[2], 2, 4) &&
40088 isUndefOrInRange(ScaledMask[3], 2, 4) &&
40089 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
40090 ScaledMask[0] == (ScaledMask[2] % 2)) &&
40091 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
40092 ScaledMask[1] == (ScaledMask[3] % 2));
40093
40094 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
40095 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40096 return SDValue(); // Nothing to do!
40097 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
40098 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
40099 return DAG.getBitcast(RootVT, V);
40100 }
40101 }
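  // For illustration: a 512-bit root with 128-bit lane mask {0, 1, 6, 7} keeps
  // the low 256 bits of V1 and the high 256 bits of V2, so MatchSHUF128 picks
  // Ops = {V1, V2} with PermMask = {0, 1, 2, 3}, i.e. a VSHUFF64X2/VSHUFI64X2
  // with immediate 0xE4.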
40102
40103 // Handle 128-bit lane shuffles of 256-bit vectors.
40104 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
40105     // If the upper half is zeroable, then an extract+insert is cheaper than
40106     // using X86ISD::VPERM2X128. The insertion is free, even if it has to
40107     // zero the upper half.
40108 if (isUndefOrZero(Mask[1])) {
40109 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40110 return SDValue(); // Nothing to do!
40111 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
40112 Res = CanonicalizeShuffleInput(RootVT, V1);
40113 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
40114 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
40115 256);
40116 }
40117
40118 // If we're inserting the low subvector, an insert-subvector 'concat'
40119 // pattern is quicker than VPERM2X128.
40120 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
40121 !Subtarget.hasAVX2()) {
40122 if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
40123 return SDValue(); // Nothing to do!
40124 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
40125 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
40126 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
40127 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
40128 }
40129
40130 // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
40131 // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
40132 // feature.
40133 // Prefer blends for sequential shuffles unless we are optimizing for size.
40134 if (UnaryShuffle &&
40135 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
40136 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
40137 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40138 return SDValue(); // Nothing to do!
40139 unsigned PermMask = 0;
40140 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
40141 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
40142 return DAG.getNode(
40143 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
40144 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
40145 }
40146
40147 if (Depth == 0 && RootOpc == X86ISD::SHUF128)
40148 return SDValue(); // Nothing to do!
40149
40150 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
40151 if (!UnaryShuffle && !IsMaskedShuffle) {
40152 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
40153 "Unexpected shuffle sentinel value");
40154 // Prefer blends to X86ISD::VPERM2X128.
40155 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
40156 if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
40157 return SDValue(); // Nothing to do!
40158 unsigned PermMask = 0;
40159 PermMask |= ((Mask[0] & 3) << 0);
40160 PermMask |= ((Mask[1] & 3) << 4);
40161 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
40162 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
40163 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
40164 CanonicalizeShuffleInput(RootVT, LHS),
40165 CanonicalizeShuffleInput(RootVT, RHS),
40166 DAG.getTargetConstant(PermMask, DL, MVT::i8));
40167 }
40168 }
40169 }
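  // For illustration: a two-input 128-bit lane mask {1, 2} is not one of the
  // blend-preferred patterns above, so it becomes VPERM2X128 with LHS = V1,
  // RHS = V2 and PermMask = (1 & 3) | ((2 & 3) << 4) = 0x21, selecting the
  // high lane of V1 followed by the low lane of V2.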
40170
40171 // For masks that have been widened to 128-bit elements or more,
40172 // narrow back down to 64-bit elements.
40173 if (BaseMaskEltSizeInBits > 64) {
40174 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
40175 int MaskScale = BaseMaskEltSizeInBits / 64;
40176 SmallVector<int, 64> ScaledMask;
40177 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40178 Mask = std::move(ScaledMask);
40179 }
40180
40181   // For masked shuffles, we're trying to match the root width for better
40182   // writemask folding, so attempt to scale the mask.
40183 // TODO - variable shuffles might need this to be widened again.
40184 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
40185 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
40186 int MaskScale = NumRootElts / Mask.size();
40187 SmallVector<int, 64> ScaledMask;
40188 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
40189 Mask = std::move(ScaledMask);
40190 }
40191
40192 unsigned NumMaskElts = Mask.size();
40193 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
40194 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40195
40196 // Determine the effective mask value type.
40197 FloatDomain &= (32 <= MaskEltSizeInBits);
40198 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
40199 : MVT::getIntegerVT(MaskEltSizeInBits);
40200 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
40201
40202 // Only allow legal mask types.
40203 if (!TLI.isTypeLegal(MaskVT))
40204 return SDValue();
40205
40206 // Attempt to match the mask against known shuffle patterns.
40207 MVT ShuffleSrcVT, ShuffleVT;
40208 unsigned Shuffle, PermuteImm;
40209
40210 // Which shuffle domains are permitted?
40211 // Permit domain crossing at higher combine depths.
40212 // TODO: Should we indicate which domain is preferred if both are allowed?
40213 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
40214 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
40215 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
40216
40217 // Determine zeroable mask elements.
40218 APInt KnownUndef, KnownZero;
40219 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
40220 APInt Zeroable = KnownUndef | KnownZero;
40221
40222 if (UnaryShuffle) {
40223 // Attempt to match against broadcast-from-vector.
40224 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
40225 if ((Subtarget.hasAVX2() ||
40226 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
40227 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
40228 if (isUndefOrEqual(Mask, 0)) {
40229 if (V1.getValueType() == MaskVT &&
40230             V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40231             X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
40232 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40233 return SDValue(); // Nothing to do!
40234 Res = V1.getOperand(0);
40235 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40236 return DAG.getBitcast(RootVT, Res);
40237 }
40238 if (Subtarget.hasAVX2()) {
40239 if (Depth == 0 && RootOpc == X86ISD::VBROADCAST)
40240 return SDValue(); // Nothing to do!
40241 Res = CanonicalizeShuffleInput(MaskVT, V1);
40242 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
40243 return DAG.getBitcast(RootVT, Res);
40244 }
40245 }
40246 }
40247
40248 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
40249 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
40250 (!IsMaskedShuffle ||
40251 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40252 if (Depth == 0 && RootOpc == Shuffle)
40253 return SDValue(); // Nothing to do!
40254 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40255 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
40256 return DAG.getBitcast(RootVT, Res);
40257 }
40258
40259 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40260 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
40261 PermuteImm) &&
40262 (!IsMaskedShuffle ||
40263 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40264 if (Depth == 0 && RootOpc == Shuffle)
40265 return SDValue(); // Nothing to do!
40266 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
40267 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
40268 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40269 return DAG.getBitcast(RootVT, Res);
40270 }
40271 }
40272
40273 // Attempt to combine to INSERTPS, but only if the inserted element has come
40274 // from a scalar.
40275 // TODO: Handle other insertions here as well?
40276 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
40277 Subtarget.hasSSE41() &&
40278 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
40279 if (MaskEltSizeInBits == 32) {
40280 SDValue SrcV1 = V1, SrcV2 = V2;
40281 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
40282 DAG) &&
40283 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40284 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40285 return SDValue(); // Nothing to do!
40286 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40287 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
40288 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
40289 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40290 return DAG.getBitcast(RootVT, Res);
40291 }
40292 }
40293 if (MaskEltSizeInBits == 64 &&
40294 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
40295         V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40296         V2.getScalarValueSizeInBits() <= 32) {
40297 if (Depth == 0 && RootOpc == X86ISD::INSERTPS)
40298 return SDValue(); // Nothing to do!
40299 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
40300 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
40301 CanonicalizeShuffleInput(MVT::v4f32, V1),
40302 CanonicalizeShuffleInput(MVT::v4f32, V2),
40303 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40304 return DAG.getBitcast(RootVT, Res);
40305 }
40306 }
40307
40308 SDValue NewV1 = V1; // Save operands in case early exit happens.
40309 SDValue NewV2 = V2;
40310 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
40311 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
40312 ShuffleVT, UnaryShuffle) &&
40313 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40314 if (Depth == 0 && RootOpc == Shuffle)
40315 return SDValue(); // Nothing to do!
40316 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
40317 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
40318 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
40319 return DAG.getBitcast(RootVT, Res);
40320 }
40321
40322 NewV1 = V1; // Save operands in case early exit happens.
40323 NewV2 = V2;
40324 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
40325 AllowIntDomain, NewV1, NewV2, DL, DAG,
40326 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
40327 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
40328 if (Depth == 0 && RootOpc == Shuffle)
40329 return SDValue(); // Nothing to do!
40330 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
40331 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
40332 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
40333 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
40334 return DAG.getBitcast(RootVT, Res);
40335 }
40336
40337 // Typically from here on, we need an integer version of MaskVT.
40338 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
40339 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
40340
40341 // Annoyingly, SSE4A instructions don't map into the above match helpers.
40342 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
40343 uint64_t BitLen, BitIdx;
40344 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
40345 Zeroable)) {
40346 if (Depth == 0 && RootOpc == X86ISD::EXTRQI)
40347 return SDValue(); // Nothing to do!
40348 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40349 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
40350 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40351 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40352 return DAG.getBitcast(RootVT, Res);
40353 }
40354
40355 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
40356 if (Depth == 0 && RootOpc == X86ISD::INSERTQI)
40357 return SDValue(); // Nothing to do!
40358 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
40359 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
40360 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
40361 DAG.getTargetConstant(BitLen, DL, MVT::i8),
40362 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
40363 return DAG.getBitcast(RootVT, Res);
40364 }
40365 }
40366
40367 // Match shuffle against TRUNCATE patterns.
40368 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
40369 // Match against a VTRUNC instruction, accounting for src/dst sizes.
40370 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
40371 Subtarget)) {
40372 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
40373 ShuffleSrcVT.getVectorNumElements();
40374 unsigned Opc =
40375           IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40376       if (Depth == 0 && RootOpc == Opc)
40377 return SDValue(); // Nothing to do!
40378 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40379 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40380 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40381 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40382 return DAG.getBitcast(RootVT, Res);
40383 }
40384
40385 // Do we need a more general binary truncation pattern?
40386 if (RootSizeInBits < 512 &&
40387 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40388 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40389 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40390 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40391 // Bail if this was already a truncation or PACK node.
40392 // We sometimes fail to match PACK if we demand known undef elements.
40393 if (Depth == 0 &&
40394 (RootOpc == ISD::TRUNCATE || RootOpc == X86ISD::PACKSS ||
40395 RootOpc == X86ISD::PACKUS))
40396 return SDValue(); // Nothing to do!
40397 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40398 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40399 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40400 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40401 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40402 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40403 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40404 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40405 return DAG.getBitcast(RootVT, Res);
40406 }
40407 }
40408
40409 // Don't try to re-form single instruction chains under any circumstances now
40410 // that we've done encoding canonicalization for them.
40411 if (Depth < 1)
40412 return SDValue();
40413
40414 int NumVariableMasks = llvm::count_if(SrcNodes, [](const SDNode *N) {
40415 return isTargetShuffleVariableMask(N->getOpcode());
40416 });
40417 bool HasSlowVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
40418 return (N->getOpcode() == X86ISD::VPERMV3 ||
40419 N->getOpcode() == X86ISD::VPERMV);
40420 });
40421
40422 // Depth threshold above which we can efficiently use variable mask shuffles.
40423 int VariableCrossLaneShuffleDepth =
40424 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40425 int VariablePerLaneShuffleDepth =
40426 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40427 AllowVariableCrossLaneMask &=
40428 (Depth >= VariableCrossLaneShuffleDepth) || NumVariableMasks;
40429 AllowVariablePerLaneMask &=
40430 (Depth >= VariablePerLaneShuffleDepth) || NumVariableMasks;
40431  // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40432 // higher depth before combining them.
40433 int BWIVPERMV3ShuffleDepth =
40434 VariableCrossLaneShuffleDepth + 2 - NumVariableMasks;
40435 bool AllowBWIVPERMV3 =
40436 (Depth >= BWIVPERMV3ShuffleDepth || HasSlowVariableMask);
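  // e.g. with no variable-mask nodes already in the chain this asks for a
  // depth of VariableCrossLaneShuffleDepth + 2 before forming a BWI/VBMI
  // VPERMV3; each variable mask already present lowers that threshold by one.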
40437
40438 // If root was a VPERMV/VPERMV3 node, always allow a variable shuffle.
40439 if ((UnaryShuffle && RootOpc == X86ISD::VPERMV) || RootOpc == X86ISD::VPERMV3)
40440 AllowVariableCrossLaneMask = AllowVariablePerLaneMask = true;
40441
40442 bool MaskContainsZeros = isAnyZero(Mask);
40443
40444 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40445 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40446 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40447 if (Subtarget.hasAVX2() &&
40448 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40449 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40450 Res = CanonicalizeShuffleInput(MaskVT, V1);
40451 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40452 return DAG.getBitcast(RootVT, Res);
40453 }
40454 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40455 if ((Subtarget.hasAVX512() &&
40456 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40457 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40458 (Subtarget.hasBWI() &&
40459 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40460 (Subtarget.hasVBMI() &&
40461 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40462 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40463 V2 = DAG.getUNDEF(MaskVT);
40464 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40465 return DAG.getBitcast(RootVT, Res);
40466 }
40467 }
40468
40469 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40470 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40471 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40472 ((Subtarget.hasAVX512() &&
40473 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40474 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40475 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40476 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40477 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40478 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40479 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40480 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40481 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40482 for (unsigned i = 0; i != NumMaskElts; ++i)
40483 if (Mask[i] == SM_SentinelZero)
40484 Mask[i] = NumMaskElts + i;
40485 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40486 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40487 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40488 return DAG.getBitcast(RootVT, Res);
40489 }
40490
40491 // If that failed and either input is extracted then try to combine as a
40492 // shuffle with the larger type.
40493    if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40494            Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40495 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40496 IsMaskedShuffle, DAG, DL, Subtarget))
40497 return WideShuffle;
40498
40499 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40500 // (non-VLX will pad to 512-bit shuffles).
40501 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40502 ((Subtarget.hasAVX512() &&
40503 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40504 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40505 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40506 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40507 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40508 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40509 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40510 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40511 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40512 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40513 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40514 return DAG.getBitcast(RootVT, Res);
40515 }
40516 return SDValue();
40517 }
40518
40519 // See if we can combine a single input shuffle with zeros to a bit-mask,
40520 // which is much simpler than any shuffle.
40521 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40522 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40523 TLI.isTypeLegal(MaskVT)) {
40524 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40525 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40526 APInt UndefElts(NumMaskElts, 0);
40527 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40528 for (unsigned i = 0; i != NumMaskElts; ++i) {
40529 int M = Mask[i];
40530 if (M == SM_SentinelUndef) {
40531 UndefElts.setBit(i);
40532 continue;
40533 }
40534 if (M == SM_SentinelZero)
40535 continue;
40536 EltBits[i] = AllOnes;
40537 }
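    // Illustrative example: a v4i32 mask <0, zero, 2, undef> becomes an AND
    // with the constant vector <-1, 0, -1, undef>, avoiding a shuffle entirely.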
40538 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40539 Res = CanonicalizeShuffleInput(MaskVT, V1);
40540    unsigned AndOpcode =
40541        MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40542 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40543 return DAG.getBitcast(RootVT, Res);
40544 }
40545
40546  // If we have a single input shuffle with different shuffle patterns in the
40547  // 128-bit lanes, use the variable mask to VPERMILPS.
40548  // TODO: Combine other mask types at higher depths.
40549 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40550 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40551 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40552 SmallVector<SDValue, 16> VPermIdx;
40553 for (int M : Mask) {
40554 SDValue Idx =
40555 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40556 VPermIdx.push_back(Idx);
40557 }
40558 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40559 Res = CanonicalizeShuffleInput(MaskVT, V1);
40560 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40561 return DAG.getBitcast(RootVT, Res);
40562 }
40563
40564 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40565 // to VPERMIL2PD/VPERMIL2PS.
40566 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40567 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40568 MaskVT == MVT::v8f32)) {
40569 // VPERMIL2 Operation.
40570 // Bits[3] - Match Bit.
40571 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40572 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40573 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40574 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40575 SmallVector<int, 8> VPerm2Idx;
40576 unsigned M2ZImm = 0;
40577 for (int M : Mask) {
40578 if (M == SM_SentinelUndef) {
40579 VPerm2Idx.push_back(-1);
40580 continue;
40581 }
40582 if (M == SM_SentinelZero) {
40583 M2ZImm = 2;
40584 VPerm2Idx.push_back(8);
40585 continue;
40586 }
40587 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40588 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40589 VPerm2Idx.push_back(Index);
40590 }
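    // e.g. for v4f32 a mask value of 5 (element 1 of the second source) encodes
    // as selector 1 + 4 = 5; zeroable elements use selector 8 with M2ZImm
    // requesting zeroing, and PD indices are doubled so they land in bits [2:1]
    // of the selector.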
40591 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40592 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40593 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40594 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40595 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40596 return DAG.getBitcast(RootVT, Res);
40597 }
40598
40599 // If we have 3 or more shuffle instructions or a chain involving a variable
40600 // mask, we can replace them with a single PSHUFB instruction profitably.
40601  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40602 // instructions, but in practice PSHUFB tends to be *very* fast so we're
40603 // more aggressive.
40604 if (UnaryShuffle && AllowVariablePerLaneMask &&
40605 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40606 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40607 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40608 SmallVector<SDValue, 16> PSHUFBMask;
40609 int NumBytes = RootVT.getSizeInBits() / 8;
40610 int Ratio = NumBytes / NumMaskElts;
40611 for (int i = 0; i < NumBytes; ++i) {
40612 int M = Mask[i / Ratio];
40613 if (M == SM_SentinelUndef) {
40614 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40615 continue;
40616 }
40617 if (M == SM_SentinelZero) {
40618 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40619 continue;
40620 }
40621 M = Ratio * M + i % Ratio;
40622 assert((M / 16) == (i / 16) && "Lane crossing detected");
40623 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40624 }
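    // e.g. with a v4i32 mask and Ratio = 4, mask element M = 2 expands to the
    // byte indices {8, 9, 10, 11}; a 0x80 byte tells PSHUFB to zero that byte.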
40625 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40626 Res = CanonicalizeShuffleInput(ByteVT, V1);
40627 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40628 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40629 return DAG.getBitcast(RootVT, Res);
40630 }
40631
40632 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40633 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40634 // slower than PSHUFB on targets that support both.
40635 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40636 Subtarget.hasXOP()) {
40637 // VPPERM Mask Operation
40638 // Bits[4:0] - Byte Index (0 - 31)
40639 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40640 SmallVector<SDValue, 16> VPPERMMask;
40641 int NumBytes = 16;
40642 int Ratio = NumBytes / NumMaskElts;
40643 for (int i = 0; i < NumBytes; ++i) {
40644 int M = Mask[i / Ratio];
40645 if (M == SM_SentinelUndef) {
40646 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40647 continue;
40648 }
40649 if (M == SM_SentinelZero) {
40650 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40651 continue;
40652 }
40653 M = Ratio * M + i % Ratio;
40654 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40655 }
40656 MVT ByteVT = MVT::v16i8;
40657 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40658 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40659 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40660 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40661 return DAG.getBitcast(RootVT, Res);
40662 }
40663
40664 // If that failed and either input is extracted then try to combine as a
40665 // shuffle with the larger type.
40666  if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40667          Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
40668 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
40669 DAG, DL, Subtarget))
40670 return WideShuffle;
40671
40672 // If we have a dual input shuffle then lower to VPERMV3,
40673 // (non-VLX will pad to 512-bit shuffles)
40674 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40675 ((Subtarget.hasAVX512() &&
40676 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40677 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40678 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40679 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40680 MaskVT == MVT::v16i32)) ||
40681 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40682 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40683 MaskVT == MVT::v32i16)) ||
40684 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40685 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40686 MaskVT == MVT::v64i8)))) {
40687 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40688 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40689 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40690 return DAG.getBitcast(RootVT, Res);
40691 }
40692
40693 // Failed to find any combines.
40694 return SDValue();
40695}
40696
40697// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40698// instruction if possible.
40699//
40700// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40701// type size to attempt to combine:
40702// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40703// -->
40704// extract_subvector(shuffle(x,y,m2),0)
40705static SDValue combineX86ShuffleChainWithExtract(
40706    ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
40707 ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
40708 bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
40709 bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
40710 const X86Subtarget &Subtarget) {
40711 unsigned NumMaskElts = BaseMask.size();
40712 unsigned NumInputs = Inputs.size();
40713 if (NumInputs == 0)
40714 return SDValue();
40715
40716 unsigned RootSizeInBits = RootVT.getSizeInBits();
40717 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40718 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40719
40720 // Peek through subvectors to find widest legal vector.
40721 // TODO: Handle ISD::TRUNCATE
40722 unsigned WideSizeInBits = RootSizeInBits;
40723 for (SDValue Input : Inputs) {
40724    Input = peekThroughBitcasts(Input);
40725    while (1) {
40726 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
40727 Input = peekThroughBitcasts(Input.getOperand(0));
40728 continue;
40729 }
40730 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40731 Input.getOperand(0).isUndef() &&
40732 isNullConstant(Input.getOperand(2))) {
40733 Input = peekThroughBitcasts(Input.getOperand(1));
40734 continue;
40735 }
40736 break;
40737 }
40738 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40739 WideSizeInBits < Input.getValueSizeInBits())
40740 WideSizeInBits = Input.getValueSizeInBits();
40741 }
40742
40743 // Bail if we fail to find a source larger than the existing root.
40744 if (WideSizeInBits <= RootSizeInBits ||
40745 (WideSizeInBits % RootSizeInBits) != 0)
40746 return SDValue();
40747
40748 // Create new mask for larger type.
40749 SmallVector<int, 64> WideMask;
40750 growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
40751
40752 // Attempt to peek through inputs and adjust mask when we extract from an
40753 // upper subvector.
40754 int AdjustedMasks = 0;
40755 SmallVector<SDValue, 4> WideInputs(Inputs);
40756 for (unsigned I = 0; I != NumInputs; ++I) {
40757 SDValue &Input = WideInputs[I];
40758    Input = peekThroughBitcasts(Input);
40759    while (1) {
40760 if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40761 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40762 uint64_t Idx = Input.getConstantOperandVal(1);
40763 if (Idx != 0) {
40764 ++AdjustedMasks;
40765 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40766 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40767
40768 int lo = I * WideMask.size();
40769 int hi = (I + 1) * WideMask.size();
40770 for (int &M : WideMask)
40771 if (lo <= M && M < hi)
40772 M += Idx;
40773 }
40774 Input = peekThroughBitcasts(Input.getOperand(0));
40775 continue;
40776 }
40777 // TODO: Handle insertions into upper subvectors.
40778 if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
40779 Input.getOperand(0).isUndef() &&
40780 isNullConstant(Input.getOperand(2))) {
40781 Input = peekThroughBitcasts(Input.getOperand(1));
40782 continue;
40783 }
40784 break;
40785 }
40786 }
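  // e.g. if an input was extract_subvector(X, 4), the mask entries referring to
  // that input have now been shifted up by the (rescaled) start index so they
  // address X directly in the widened shuffle.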
40787
40788 // Remove unused/repeated shuffle source ops.
40789 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40790 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40791
40792 // Bail if we're always extracting from the lowest subvectors,
40793 // combineX86ShuffleChain should match this for the current width, or the
40794 // shuffle still references too many inputs.
40795 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40796 return SDValue();
40797
40798 // Minor canonicalization of the accumulated shuffle mask to make it easier
40799 // to match below. All this does is detect masks with sequential pairs of
40800 // elements, and shrink them to the half-width mask. It does this in a loop
40801 // so it will reduce the size of the mask to the minimal width mask which
40802 // performs an equivalent shuffle.
40803 while (WideMask.size() > 1) {
40804 SmallVector<int, 64> WidenedMask;
40805 if (!canWidenShuffleElements(WideMask, WidenedMask))
40806 break;
40807 WideMask = std::move(WidenedMask);
40808 }
40809
40810 // Canonicalization of binary shuffle masks to improve pattern matching by
40811 // commuting the inputs.
40812 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40813    ShuffleVectorSDNode::commuteMask(WideMask);
40814    std::swap(WideInputs[0], WideInputs[1]);
40815 }
40816
40817 // Increase depth for every upper subvector we've peeked through.
40818 Depth += AdjustedMasks;
40819
40820 // Attempt to combine wider chain.
40821 // TODO: Can we use a better Root?
40822 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40823 WideInputs.back().getValueSizeInBits()
40824 ? WideInputs.front()
40825 : WideInputs.back();
40826 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40827 "WideRootSize mismatch");
40828
40829 if (SDValue WideShuffle = combineX86ShuffleChain(
40830 WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
40831 Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
40832 IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
40833 WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
40834 return DAG.getBitcast(RootVT, WideShuffle);
40835 }
40836
40837 return SDValue();
40838}
40839
40840// Canonicalize the combined shuffle mask chain with horizontal ops.
40841// NOTE: This may update the Ops and Mask.
40842static SDValue canonicalizeShuffleMaskWithHorizOp(
40843    MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40844    unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40845 const X86Subtarget &Subtarget) {
40846 if (Mask.empty() || Ops.empty())
40847 return SDValue();
40848
40849  SmallVector<SDValue> BC;
40850  for (SDValue Op : Ops)
40851    BC.push_back(peekThroughBitcasts(Op));
40852
40853 // All ops must be the same horizop + type.
40854 SDValue BC0 = BC[0];
40855 EVT VT0 = BC0.getValueType();
40856 unsigned Opcode0 = BC0.getOpcode();
40857 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40858 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40859 }))
40860 return SDValue();
40861
40862 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40863 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40864 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40865 if (!isHoriz && !isPack)
40866 return SDValue();
40867
40868 // Do all ops have a single use?
40869 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40870 return Op.hasOneUse() &&
40871           peekThroughBitcasts(Op).hasOneUse();
40872  });
40873
40874 int NumElts = VT0.getVectorNumElements();
40875 int NumLanes = VT0.getSizeInBits() / 128;
40876 int NumEltsPerLane = NumElts / NumLanes;
40877 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40878 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40879 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40880
40881 if (NumEltsPerLane >= 4 &&
40882 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40883 SmallVector<int> LaneMask, ScaledMask;
40884 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40885 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40886 // See if we can remove the shuffle by resorting the HOP chain so that
40887 // the HOP args are pre-shuffled.
40888 // TODO: Generalize to any sized/depth chain.
40889 // TODO: Add support for PACKSS/PACKUS.
40890 if (isHoriz) {
40891 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40892 auto GetHOpSrc = [&](int M) {
40893 if (M == SM_SentinelUndef)
40894 return DAG.getUNDEF(VT0);
40895 if (M == SM_SentinelZero)
40896 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40897 SDValue Src0 = BC[M / 4];
40898 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40899 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40900 return Src1.getOperand(M % 2);
40901 return SDValue();
40902 };
40903 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40904 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40905 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40906 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40907 if (M0 && M1 && M2 && M3) {
40908 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40909 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40910 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40911 }
40912 }
40913 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40914 if (Ops.size() >= 2) {
40915 SDValue LHS, RHS;
40916 auto GetHOpSrc = [&](int M, int &OutM) {
40917 // TODO: Support SM_SentinelZero
40918 if (M < 0)
40919 return M == SM_SentinelUndef;
40920 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40921 if (!LHS || LHS == Src) {
40922 LHS = Src;
40923 OutM = (M % 2);
40924 return true;
40925 }
40926 if (!RHS || RHS == Src) {
40927 RHS = Src;
40928 OutM = (M % 2) + 2;
40929 return true;
40930 }
40931 return false;
40932 };
40933 int PostMask[4] = {-1, -1, -1, -1};
40934 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40935 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40936 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40937 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40938 LHS = DAG.getBitcast(SrcVT, LHS);
40939 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40940 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40941 // Use SHUFPS for the permute so this will work on SSE2 targets,
40942 // shuffle combining and domain handling will simplify this later on.
40943 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40944 Res = DAG.getBitcast(ShuffleVT, Res);
40945 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40946 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40947 }
40948 }
40949 }
40950 }
40951
40952 if (2 < Ops.size())
40953 return SDValue();
40954
40955 SDValue BC1 = BC[BC.size() - 1];
40956 if (Mask.size() == VT0.getVectorNumElements()) {
40957 // Canonicalize binary shuffles of horizontal ops that use the
40958 // same sources to an unary shuffle.
40959 // TODO: Try to perform this fold even if the shuffle remains.
40960 if (Ops.size() == 2) {
40961 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40962 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40963 };
40964 // Commute if all BC0's ops are contained in BC1.
40965 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40966 ContainsOps(BC1, BC0.getOperand(1))) {
40967        ShuffleVectorSDNode::commuteMask(Mask);
40968        std::swap(Ops[0], Ops[1]);
40969 std::swap(BC0, BC1);
40970 }
40971
40972 // If BC1 can be represented by BC0, then convert to unary shuffle.
40973 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40974 ContainsOps(BC0, BC1.getOperand(1))) {
40975 for (int &M : Mask) {
40976 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40977 continue;
40978 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40979 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40980 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40981 M += NumHalfEltsPerLane;
40982 }
40983 }
40984 }
40985
40986 // Canonicalize unary horizontal ops to only refer to lower halves.
40987 for (int i = 0; i != NumElts; ++i) {
40988 int &M = Mask[i];
40989 if (isUndefOrZero(M))
40990 continue;
40991 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40992 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40993 M -= NumHalfEltsPerLane;
40994 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40995 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40996 M -= NumHalfEltsPerLane;
40997 }
40998 }
40999
41000 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
41001 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
41002 // represents the LHS/RHS inputs for the lower/upper halves.
41003 SmallVector<int, 16> TargetMask128, WideMask128;
41004 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
41005 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
41006 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
41007 bool SingleOp = (Ops.size() == 1);
41008 if (isPack || OneUseOps ||
41009 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
41010 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
41011 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
41012 Lo = Lo.getOperand(WideMask128[0] & 1);
41013 Hi = Hi.getOperand(WideMask128[1] & 1);
41014 if (SingleOp) {
41015 SDValue Undef = DAG.getUNDEF(SrcVT);
41016 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
41017 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
41018 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
41019 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
41020 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
41021 }
41022 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
41023 }
41024 }
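  // e.g. shuffle(HADD(X,Y),HADD(Z,W)) with a repeating v2X64 pattern of <0,3>
  // selects the X and W halves and so folds to a single HADD(X,W).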
41025
41026 // If we are post-shuffling a 256-bit hop and not requiring the upper
41027 // elements, then try to narrow to a 128-bit hop directly.
41028 SmallVector<int, 16> WideMask64;
41029 if (Ops.size() == 1 && NumLanes == 2 &&
41030 scaleShuffleElements(Mask, 4, WideMask64) &&
41031 isUndefInRange(WideMask64, 2, 2)) {
41032 int M0 = WideMask64[0];
41033 int M1 = WideMask64[1];
41034 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
41035      MVT HalfVT = VT0.getHalfNumVectorElementsVT();
41036      unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41037 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
41038 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
41039 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
41040 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
41041 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
41042 }
41043 }
41044
41045 return SDValue();
41046}
41047
41048// Attempt to constant fold all of the constant source ops.
41049// Returns true if the entire shuffle is folded to a constant.
41050// TODO: Extend this to merge multiple constant Ops and update the mask.
41051static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
41052                                           ArrayRef<int> Mask,
41053 ArrayRef<const SDNode *> SrcNodes,
41054 SelectionDAG &DAG, const SDLoc &DL,
41055 const X86Subtarget &Subtarget) {
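  // Illustrative example: shuffling build_vector(1,2,3,4) with a resolved mask
  // of <3, zero, 1, undef> constant-folds directly to <4, 0, 2, undef>.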
41056 unsigned SizeInBits = VT.getSizeInBits();
41057 unsigned NumMaskElts = Mask.size();
41058 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
41059 unsigned NumOps = Ops.size();
41060
41061 // Extract constant bits from each source op.
41062 SmallVector<APInt, 16> UndefEltsOps(NumOps);
41063  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
41064  for (unsigned I = 0; I != NumOps; ++I)
41065 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
41066 RawBitsOps[I],
41067 /*AllowWholeUndefs*/ true,
41068 /*AllowPartialUndefs*/ true))
41069 return SDValue();
41070
41071 // If we're optimizing for size, only fold if at least one of the constants is
41072 // only used once or the combined shuffle has included a variable mask
41073  // shuffle; this is to avoid constant pool bloat.
41074 bool IsOptimizingSize = DAG.shouldOptForSize();
41075 bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) {
41076 return isTargetShuffleVariableMask(N->getOpcode());
41077 });
41078 if (IsOptimizingSize && !HasVariableMask &&
41079 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
41080 return SDValue();
41081
41082 // Shuffle the constant bits according to the mask.
41083 APInt UndefElts(NumMaskElts, 0);
41084 APInt ZeroElts(NumMaskElts, 0);
41085 APInt ConstantElts(NumMaskElts, 0);
41086 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
41087 APInt::getZero(MaskSizeInBits));
41088 for (unsigned i = 0; i != NumMaskElts; ++i) {
41089 int M = Mask[i];
41090 if (M == SM_SentinelUndef) {
41091 UndefElts.setBit(i);
41092 continue;
41093 } else if (M == SM_SentinelZero) {
41094 ZeroElts.setBit(i);
41095 continue;
41096 }
41097 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
41098
41099 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
41100 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
41101
41102 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
41103 if (SrcUndefElts[SrcMaskIdx]) {
41104 UndefElts.setBit(i);
41105 continue;
41106 }
41107
41108 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
41109 APInt &Bits = SrcEltBits[SrcMaskIdx];
41110 if (!Bits) {
41111 ZeroElts.setBit(i);
41112 continue;
41113 }
41114
41115 ConstantElts.setBit(i);
41116 ConstantBitData[i] = Bits;
41117 }
41118 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
41119
41120 // Attempt to create a zero vector.
41121 if ((UndefElts | ZeroElts).isAllOnes())
41122 return getZeroVector(VT, Subtarget, DAG, DL);
41123
41124 // Create the constant data.
41125 MVT MaskSVT;
41126 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
41127 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
41128 else
41129 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
41130
41131 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
41132 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
41133 return SDValue();
41134
41135 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
41136 return DAG.getBitcast(VT, CstOp);
41137}
41138
41139namespace llvm {
41140 namespace X86 {
41141 enum {
41142    MaxShuffleCombineDepth = 8
41143  };
41144 } // namespace X86
41145} // namespace llvm
41146
41147/// Fully generic combining of x86 shuffle instructions.
41148///
41149/// This should be the last combine run over the x86 shuffle instructions. Once
41150/// they have been fully optimized, this will recursively consider all chains
41151/// of single-use shuffle instructions, build a generic model of the cumulative
41152/// shuffle operation, and check for simpler instructions which implement this
41153/// operation. We use this primarily for two purposes:
41154///
41155/// 1) Collapse generic shuffles to specialized single instructions when
41156/// equivalent. In most cases, this is just an encoding size win, but
41157/// sometimes we will collapse multiple generic shuffles into a single
41158/// special-purpose shuffle.
41159/// 2) Look for sequences of shuffle instructions with 3 or more total
41160/// instructions, and replace them with the slightly more expensive SSSE3
41161/// PSHUFB instruction if available. We do this as the last combining step
41162/// to ensure we avoid using PSHUFB if we can implement the shuffle with
41163/// a suitable short sequence of other instructions. The PSHUFB will either
41164/// use a register or have to read from memory and so is slightly (but only
41165/// slightly) more expensive than the other shuffle instructions.
41166///
41167/// Because this is inherently a quadratic operation (for each shuffle in
41168/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
41169/// This should never be an issue in practice as the shuffle lowering doesn't
41170/// produce sequences of more than 8 instructions.
41171///
41172/// FIXME: We will currently miss some cases where the redundant shuffling
41173/// would simplify under the threshold for PSHUFB formation because of
41174/// combine-ordering. To fix this, we should do the redundant instruction
41175/// combining in this recursive walk.
41176static SDValue combineX86ShufflesRecursively(
41177    ArrayRef<SDValue> SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT,
41178 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
41179 unsigned MaxDepth, bool AllowVariableCrossLaneMask,
41180 bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG,
41181 const SDLoc &DL, const X86Subtarget &Subtarget) {
41182 assert(!RootMask.empty() &&
41183 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
41184 "Illegal shuffle root mask");
41185 assert(RootVT.isVector() && "Shuffles operate on vector types!");
41186 unsigned RootSizeInBits = RootVT.getSizeInBits();
41187 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41188
41189 // Bound the depth of our recursive combine because this is ultimately
41190 // quadratic in nature.
41191 if (Depth >= MaxDepth)
41192 return SDValue();
41193
41194 // Directly rip through bitcasts to find the underlying operand.
41195 SDValue Op = SrcOps[SrcOpIndex];
41196  Op = peekThroughBitcasts(Op);
41197
41198 EVT VT = Op.getValueType();
41199 if (!VT.isVector() || !VT.isSimple())
41200 return SDValue(); // Bail if we hit a non-simple non-vector.
41201
41202 // FIXME: Just bail on f16 for now.
41203 if (VT.getVectorElementType() == MVT::f16)
41204 return SDValue();
41205
41206 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
41207         "Can only combine shuffles up to the size of the root op.");
41208
41209 // Create a demanded elts mask from the referenced elements of Op.
41210 APInt OpDemandedElts = APInt::getZero(RootMask.size());
41211 for (int M : RootMask) {
41212 int BaseIdx = RootMask.size() * SrcOpIndex;
41213 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
41214 OpDemandedElts.setBit(M - BaseIdx);
41215 }
41216 if (RootSizeInBits != VT.getSizeInBits()) {
41217 // Op is smaller than Root - extract the demanded elts for the subvector.
41218 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
41219 unsigned NumOpMaskElts = RootMask.size() / Scale;
41220 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
41221 assert(OpDemandedElts
41222 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
41223 .isZero() &&
41224 "Out of range elements referenced in root mask");
41225 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
41226 }
41227 OpDemandedElts =
41228 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
41229
41230 // Extract target shuffle mask and resolve sentinels and inputs.
41231 SmallVector<int, 64> OpMask;
41232 SmallVector<SDValue, 2> OpInputs;
41233 APInt OpUndef, OpZero;
41234 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
41235 OpZero, DAG, Depth, false)) {
41236 // Shuffle inputs must not be larger than the shuffle result.
41237 // TODO: Relax this for single input faux shuffles (e.g. trunc).
41238 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
41239 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
41240 }))
41241 return SDValue();
41242 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41243 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41244 !isNullConstant(Op.getOperand(1))) {
41245 SDValue SrcVec = Op.getOperand(0);
41246 int ExtractIdx = Op.getConstantOperandVal(1);
41247 unsigned NumElts = VT.getVectorNumElements();
41248 OpInputs.assign({SrcVec});
41249 OpMask.assign(NumElts, SM_SentinelUndef);
41250 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
41251 OpZero = OpUndef = APInt::getZero(NumElts);
41252 } else {
41253 return SDValue();
41254 }
41255
41256 // If the shuffle result was smaller than the root, we need to adjust the
41257 // mask indices and pad the mask with undefs.
41258 if (RootSizeInBits > VT.getSizeInBits()) {
41259 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
41260 unsigned OpMaskSize = OpMask.size();
41261 if (OpInputs.size() > 1) {
41262 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
41263 for (int &M : OpMask) {
41264 if (M < 0)
41265 continue;
41266 int EltIdx = M % OpMaskSize;
41267 int OpIdx = M / OpMaskSize;
41268 M = (PaddedMaskSize * OpIdx) + EltIdx;
41269 }
41270 }
41271 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
41272 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
41273 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
41274 }
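  // e.g. a 128-bit op feeding a 256-bit root has its mask padded with undefs
  // for the (virtual) upper half so it lines up with the root mask width.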
41275
41276  SmallVector<int, 64> Mask;
41277  SmallVector<SDValue, 16> Ops;
41278
41279 // We don't need to merge masks if the root is empty.
41280 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
41281 if (EmptyRoot) {
41282 // Only resolve zeros if it will remove an input, otherwise we might end
41283 // up in an infinite loop.
41284 bool ResolveKnownZeros = true;
41285 if (!OpZero.isZero()) {
41286 APInt UsedInputs = APInt::getZero(OpInputs.size());
41287 for (int i = 0, e = OpMask.size(); i != e; ++i) {
41288 int M = OpMask[i];
41289 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
41290 continue;
41291 UsedInputs.setBit(M / OpMask.size());
41292 if (UsedInputs.isAllOnes()) {
41293 ResolveKnownZeros = false;
41294 break;
41295 }
41296 }
41297 }
41298 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
41299 ResolveKnownZeros);
41300
41301 Mask = OpMask;
41302 Ops.append(OpInputs.begin(), OpInputs.end());
41303 } else {
41304 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
41305
41306 // Add the inputs to the Ops list, avoiding duplicates.
41307 Ops.append(SrcOps.begin(), SrcOps.end());
41308
41309 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
41310 // Attempt to find an existing match.
41312 for (int i = 0, e = Ops.size(); i < e; ++i)
41313 if (InputBC == peekThroughBitcasts(Ops[i]))
41314 return i;
41315 // Match failed - should we replace an existing Op?
41316 if (InsertionPoint >= 0) {
41317        Ops[InsertionPoint] = Input;
41318        return InsertionPoint;
41319 }
41320 // Add to the end of the Ops list.
41321 Ops.push_back(Input);
41322 return Ops.size() - 1;
41323 };
41324
41325 SmallVector<int, 2> OpInputIdx;
41326 for (SDValue OpInput : OpInputs)
41327 OpInputIdx.push_back(
41328 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
41329
41330 assert(((RootMask.size() > OpMask.size() &&
41331 RootMask.size() % OpMask.size() == 0) ||
41332 (OpMask.size() > RootMask.size() &&
41333 OpMask.size() % RootMask.size() == 0) ||
41334 OpMask.size() == RootMask.size()) &&
41335 "The smaller number of elements must divide the larger.");
41336
41337 // This function can be performance-critical, so we rely on the power-of-2
41338 // knowledge that we have about the mask sizes to replace div/rem ops with
41339 // bit-masks and shifts.
41340    assert(llvm::has_single_bit(RootMask.size()) &&
41341           "Non-power-of-2 shuffle mask sizes");
41342    assert(llvm::has_single_bit(OpMask.size()) &&
41343           "Non-power-of-2 shuffle mask sizes");
41344 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
41345 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
41346
41347 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
41348 unsigned RootRatio =
41349 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
41350 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
41351 assert((RootRatio == 1 || OpRatio == 1) &&
41352 "Must not have a ratio for both incoming and op masks!");
41353
41354 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
41355 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
41356 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
41357 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
41358 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
41359
41360 Mask.resize(MaskWidth, SM_SentinelUndef);
41361
41362 // Merge this shuffle operation's mask into our accumulated mask. Note that
41363 // this shuffle's mask will be the first applied to the input, followed by
41364 // the root mask to get us all the way to the root value arrangement. The
41365 // reason for this order is that we are recursing up the operation chain.
41366 for (unsigned i = 0; i < MaskWidth; ++i) {
41367 unsigned RootIdx = i >> RootRatioLog2;
41368 if (RootMask[RootIdx] < 0) {
41369 // This is a zero or undef lane, we're done.
41370 Mask[i] = RootMask[RootIdx];
41371 continue;
41372 }
41373
41374 unsigned RootMaskedIdx =
41375 RootRatio == 1
41376 ? RootMask[RootIdx]
41377 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
41378
41379 // Just insert the scaled root mask value if it references an input other
41380 // than the SrcOp we're currently inserting.
41381 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
41382 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
41383 Mask[i] = RootMaskedIdx;
41384 continue;
41385 }
41386
41387 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
41388 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
41389 if (OpMask[OpIdx] < 0) {
41390 // The incoming lanes are zero or undef, it doesn't matter which ones we
41391 // are using.
41392 Mask[i] = OpMask[OpIdx];
41393 continue;
41394 }
41395
41396 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
41397 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
41398 : (OpMask[OpIdx] << OpRatioLog2) +
41399 (RootMaskedIdx & (OpRatio - 1));
41400
41401 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
41402 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
41403 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
41404 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
41405
41406 Mask[i] = OpMaskedIdx;
41407 }
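    // In effect this composes the two masks: each root index is scaled to the
    // finer granularity and then looked up through this op's mask, e.g. a root
    // v4i32 mask value of 1 over a v8i16 op maps through OpMask[2] and
    // OpMask[3].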
41408 }
41409
41410 // Peek through any free bitcasts to insert_subvector vector widenings or
41411 // extract_subvector nodes back to root size.
41412 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
41413 for (auto [I, Op] : enumerate(Ops)) {
41414 SDValue BC = Op;
41415 while (1) {
41416 if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
41417 BC = BC.getOperand(0);
41418 continue;
41419 }
41420 if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
41421 BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
41422 // Set out of bounds mask indices to undef.
41423 Op = BC = BC.getOperand(1);
41424 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
41425 int Lo = I * Mask.size();
41426 int Hi = (I + 1) * Mask.size();
41427 int NewHi = Lo + (Mask.size() / Scale);
41428 for (int &M : Mask) {
41429 if (Lo <= M && NewHi <= M && M < Hi)
41430 M = SM_SentinelUndef;
41431 }
41432 continue;
41433 }
41434 if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41435 (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
41436 isNullConstant(BC.getOperand(1))) {
41437 Op = BC = BC.getOperand(0);
41438 continue;
41439 }
41440 break;
41441 }
41442 }
41443
41444 // Remove unused/repeated shuffle source ops.
41445  resolveTargetShuffleInputsAndMask(Ops, Mask);
41446
41447 // Handle the all undef/zero/ones cases early.
41448 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41449 return DAG.getUNDEF(RootVT);
41450 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41451 return getZeroVector(RootVT, Subtarget, DAG, DL);
41452 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41453      !llvm::is_contained(Mask, SM_SentinelZero))
41454    return getOnesVector(RootVT, DAG, DL);
41455
41456 assert(!Ops.empty() && "Shuffle with no inputs detected");
41457
41458 // Update the list of shuffle nodes that have been combined so far.
41459 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes);
41460 CombinedNodes.push_back(Op.getNode());
41461
41462 // See if we can recurse into each shuffle source op (if it's a target
41463 // shuffle). The source op should only be generally combined if it either has
41464  // a single use (i.e. current Op) or all its users have already been combined;
41465  // if not, we can still combine but should prevent generation of variable
41466 // shuffles to avoid constant pool bloat.
41467 // Don't recurse if we already have more source ops than we can combine in
41468 // the remaining recursion depth.
41469 if (Ops.size() < (MaxDepth - Depth)) {
41470 for (int i = 0, e = Ops.size(); i < e; ++i) {
41471 // For empty roots, we need to resolve zeroable elements before combining
41472 // them with other shuffles.
41473 SmallVector<int, 64> ResolvedMask = Mask;
41474 if (EmptyRoot)
41475 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41476 bool AllowCrossLaneVar = false;
41477 bool AllowPerLaneVar = false;
41478 if (Ops[i].getNode()->hasOneUse() ||
41479 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41480 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41481 AllowPerLaneVar = AllowVariablePerLaneMask;
41482 }
41483      if (SDValue Res = combineX86ShufflesRecursively(
41484              Ops, i, RootOpc, RootVT, ResolvedMask, CombinedNodes, Depth + 1,
41485 MaxDepth, AllowCrossLaneVar, AllowPerLaneVar, IsMaskedShuffle,
41486 DAG, DL, Subtarget))
41487 return Res;
41488 }
41489 }
41490
41491 // Attempt to constant fold all of the constant source ops.
41492  if (SDValue Cst = combineX86ShufflesConstants(
41493          RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget))
41494 return Cst;
41495
41496 // If constant fold failed and we only have constants - then we have
41497 // multiple uses by a single non-variable shuffle - just bail.
41498 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41499 APInt UndefElts;
41500 SmallVector<APInt> RawBits;
41501 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41502 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41503 RawBits,
41504 /*AllowWholeUndefs*/ true,
41505 /*AllowPartialUndefs*/ true);
41506 })) {
41507 return SDValue();
41508 }
41509
41510 // Canonicalize the combined shuffle mask chain with horizontal ops.
41511 // NOTE: This will update the Ops and Mask.
41512  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41513          Ops, Mask, RootSizeInBits, DL, DAG, Subtarget))
41514 return DAG.getBitcast(RootVT, HOp);
41515
41516 // Try to refine our inputs given our knowledge of target shuffle mask.
41517 for (auto I : enumerate(Ops)) {
41518 int OpIdx = I.index();
41519 SDValue &Op = I.value();
41520
41521 // What range of shuffle mask element values results in picking from Op?
41522 int Lo = OpIdx * Mask.size();
41523 int Hi = Lo + Mask.size();
41524
41525 // Which elements of Op do we demand, given the mask's granularity?
41526 APInt OpDemandedElts(Mask.size(), 0);
41527 for (int MaskElt : Mask) {
41528 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41529 int OpEltIdx = MaskElt - Lo;
41530 OpDemandedElts.setBit(OpEltIdx);
41531 }
41532 }
41533
41534 // Is the shuffle result smaller than the root?
41535 if (Op.getValueSizeInBits() < RootSizeInBits) {
41536 // We padded the mask with undefs. But we now need to undo that.
41537 unsigned NumExpectedVectorElts = Mask.size();
41538 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41539 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41540 assert(!OpDemandedElts.extractBits(
41541 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41542 "Demanding the virtual undef widening padding?");
41543 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41544 }
41545
41546 // The Op itself may be of different VT, so we need to scale the mask.
41547 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41548 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41549
41550    // Can this operand be simplified any further, given its demanded elements?
41551    if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
41552            Op, OpScaledDemandedElts, DAG))
41553 Op = NewOp;
41554 }
41555 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41556
41557 // Widen any subvector shuffle inputs we've collected.
41558 // TODO: Remove this to avoid generating temporary nodes, we should only
41559 // widen once combineX86ShuffleChain has found a match.
41560 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41561 return Op.getValueSizeInBits() < RootSizeInBits;
41562 })) {
41563 for (SDValue &Op : Ops)
41564 if (Op.getValueSizeInBits() < RootSizeInBits)
41565 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41566 RootSizeInBits);
41567 // Reresolve - we might have repeated subvector sources.
41568    resolveTargetShuffleInputsAndMask(Ops, Mask);
41569  }
41570
41571 // Handle the all undef/zero/ones cases.
41572 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41573 return DAG.getUNDEF(RootVT);
41574 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41575 return getZeroVector(RootVT, Subtarget, DAG, DL);
41576 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41577      !llvm::is_contained(Mask, SM_SentinelZero))
41578    return getOnesVector(RootVT, DAG, DL);
41579
41580 assert(!Ops.empty() && "Shuffle with no inputs detected");
41581
41582 // We can only combine unary and binary shuffle mask cases.
41583 if (Ops.size() <= 2) {
41584 // Minor canonicalization of the accumulated shuffle mask to make it easier
41585 // to match below. All this does is detect masks with sequential pairs of
41586 // elements, and shrink them to the half-width mask. It does this in a loop
41587 // so it will reduce the size of the mask to the minimal width mask which
41588 // performs an equivalent shuffle.
41589 while (Mask.size() > 1) {
41590 SmallVector<int, 64> WidenedMask;
41591 if (!canWidenShuffleElements(Mask, WidenedMask))
41592 break;
41593 Mask = std::move(WidenedMask);
41594 }
41595
41596 // Canonicalization of binary shuffle masks to improve pattern matching by
41597 // commuting the inputs.
41598 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41599      ShuffleVectorSDNode::commuteMask(Mask);
41600      std::swap(Ops[0], Ops[1]);
41601 }
41602
41603 // Try to combine into a single shuffle instruction.
41604 if (SDValue Shuffle = combineX86ShuffleChain(
41605 Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41606 AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
41607 IsMaskedShuffle, DAG, DL, Subtarget))
41608 return Shuffle;
41609
41610 // If all the operands come from the same larger vector, fallthrough and try
41611 // to use combineX86ShuffleChainWithExtract.
41612    SDValue LHS = peekThroughBitcasts(Ops.front());
41613    SDValue RHS = peekThroughBitcasts(Ops.back());
41614    if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41615 (RootSizeInBits / Mask.size()) != 64 ||
41616 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41617 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41618 LHS.getOperand(0) != RHS.getOperand(0))
41619 return SDValue();
41620 }
41621
41622 // If that failed and any input is extracted then try to combine as a
41623 // shuffle with the larger type.
41624  return combineX86ShuffleChainWithExtract(
41625      Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
41626 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
41627 DAG, DL, Subtarget);
41628}
41629
41630/// Helper entry wrapper to combineX86ShufflesRecursively.
41631static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41632                                             const X86Subtarget &Subtarget) {
41633  return combineX86ShufflesRecursively(
41634      {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), {0}, {}, /*Depth=*/0,
41635 X86::MaxShuffleCombineDepth, /*AllowVariableCrossLaneMask=*/true,
41636 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget), DAG,
41637 SDLoc(Op), Subtarget);
41638}
41639
41640/// Get the PSHUF-style mask from PSHUF node.
41641///
41642/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41643/// PSHUF-style masks that can be reused with such instructions.
41644static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41645  MVT VT = N.getSimpleValueType();
41646  SmallVector<int, 4> Mask;
41647  SmallVector<SDValue, 2> Ops;
41648  bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
41649 (void)HaveMask;
41650 assert(HaveMask);
41651
41652 // If we have more than 128-bits, only the low 128-bits of shuffle mask
41653 // matter. Check that the upper masks are repeats and remove them.
41654 if (VT.getSizeInBits() > 128) {
41655 int LaneElts = 128 / VT.getScalarSizeInBits();
41656#ifndef NDEBUG
41657 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41658 for (int j = 0; j < LaneElts; ++j)
41659 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41660 "Mask doesn't repeat in high 128-bit lanes!");
41661#endif
41662 Mask.resize(LaneElts);
41663 }
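  // e.g. for PSHUFHW a raw mask of {0,1,2,3,7,6,5,4} reduces below to the
  // 4-element high-word mask {3,2,1,0}.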
41664
41665 switch (N.getOpcode()) {
41666 case X86ISD::PSHUFD:
41667 return Mask;
41668 case X86ISD::PSHUFLW:
41669 Mask.resize(4);
41670 return Mask;
41671 case X86ISD::PSHUFHW:
41672 Mask.erase(Mask.begin(), Mask.begin() + 4);
41673 for (int &M : Mask)
41674 M -= 4;
41675 return Mask;
41676 default:
41677 llvm_unreachable("No valid shuffle instruction found!");
41678 }
41679}
41680
41681/// Get the expanded blend mask from a BLENDI node.
41682/// For v16i16 nodes, this will splat the repeated i8 mask.
41683static APInt getBLENDIBlendMask(SDValue V) {
41684  assert(V.getOpcode() == X86ISD::BLENDI && "Unknown blend shuffle");
41685 unsigned NumElts = V.getSimpleValueType().getVectorNumElements();
41686 APInt Mask = V.getConstantOperandAPInt(2);
41687 if (Mask.getBitWidth() > NumElts)
41688 Mask = Mask.trunc(NumElts);
41689 if (NumElts == 16) {
41690 assert(Mask.getBitWidth() == 8 && "Unexpected v16i16 blend mask width");
41691 Mask = APInt::getSplat(16, Mask);
41692 }
41693 assert(Mask.getBitWidth() == NumElts && "Unexpected blend mask width");
41694 return Mask;
41695}
41696
41697/// Search for a combinable shuffle across a chain ending in pshufd.
41698///
41699/// We walk up the chain and look for a combinable shuffle, skipping over
41700/// shuffles that we could hoist this shuffle's transformation past without
41701/// altering anything.
41702static SDValue combineRedundantDwordShuffle(SDValue N,
41703                                            MutableArrayRef<int> Mask,
41704                                            const SDLoc &DL,
41705 SelectionDAG &DAG) {
41706 assert(N.getOpcode() == X86ISD::PSHUFD &&
41707 "Called with something other than an x86 128-bit half shuffle!");
41708
41709 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41710 // of the shuffles in the chain so that we can form a fresh chain to replace
41711 // this one.
41712  SmallVector<SDValue, 8> Chain;
41713  SDValue V = N.getOperand(0);
41714 for (; V.hasOneUse(); V = V.getOperand(0)) {
41715 switch (V.getOpcode()) {
41716 default:
41717 return SDValue(); // Nothing combined!
41718
41719 case ISD::BITCAST:
41720 // Skip bitcasts as we always know the type for the target specific
41721 // instructions.
41722 continue;
41723
41724 case X86ISD::PSHUFD:
41725 // Found another dword shuffle.
41726 break;
41727
41728 case X86ISD::PSHUFLW:
41729 // Check that the low words (being shuffled) are the identity in the
41730 // dword shuffle, and the high words are self-contained.
41731 if (Mask[0] != 0 || Mask[1] != 1 ||
41732 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41733 return SDValue();
41734
41735 Chain.push_back(V);
41736 continue;
41737
41738 case X86ISD::PSHUFHW:
41739 // Check that the high words (being shuffled) are the identity in the
41740 // dword shuffle, and the low words are self-contained.
41741 if (Mask[2] != 2 || Mask[3] != 3 ||
41742 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41743 return SDValue();
41744
41745 Chain.push_back(V);
41746 continue;
41747
41748 case X86ISD::UNPCKL:
41749 case X86ISD::UNPCKH:
41750 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41751 // shuffle into a preceding word shuffle.
41752 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41753 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41754 return SDValue();
41755
41756 // Search for a half-shuffle which we can combine with.
41757 unsigned CombineOp =
41758 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41759 if (V.getOperand(0) != V.getOperand(1) ||
41760 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41761 return SDValue();
41762 Chain.push_back(V);
41763 V = V.getOperand(0);
41764 do {
41765 switch (V.getOpcode()) {
41766 default:
41767 return SDValue(); // Nothing to combine.
41768
41769 case X86ISD::PSHUFLW:
41770 case X86ISD::PSHUFHW:
41771 if (V.getOpcode() == CombineOp)
41772 break;
41773
41774 Chain.push_back(V);
41775
41776 [[fallthrough]];
41777 case ISD::BITCAST:
41778 V = V.getOperand(0);
41779 continue;
41780 }
41781 break;
41782 } while (V.hasOneUse());
41783 break;
41784 }
41785 // Break out of the loop if we break out of the switch.
41786 break;
41787 }
41788
41789 if (!V.hasOneUse())
41790 // We fell out of the loop without finding a viable combining instruction.
41791 return SDValue();
41792
41794  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41795  for (int &M : Mask)
41795 for (int &M : Mask)
41796 M = VMask[M];
41797 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41798 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41799
41800 // Rebuild the chain around this new shuffle.
41801 while (!Chain.empty()) {
41802 SDValue W = Chain.pop_back_val();
41803
41804 if (V.getValueType() != W.getOperand(0).getValueType())
41805 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41806
41807 switch (W.getOpcode()) {
41808 default:
41809 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
41810
41811 case X86ISD::UNPCKL:
41812 case X86ISD::UNPCKH:
41813 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41814 break;
41815
41816 case X86ISD::PSHUFD:
41817 case X86ISD::PSHUFLW:
41818 case X86ISD::PSHUFHW:
41819 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41820 break;
41821 }
41822 }
41823 if (V.getValueType() != N.getValueType())
41824 V = DAG.getBitcast(N.getValueType(), V);
41825
41826 // Return the new chain to replace N.
41827 return V;
41828}
41829
41830// Attempt to commute shufps LHS loads:
41831// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41832static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41833                                      SelectionDAG &DAG) {
41834 // TODO: Add vXf64 support.
41835 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41836 return SDValue();
41837
41838 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41839 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41840 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41841 return SDValue();
41842 SDValue N0 = V.getOperand(0);
41843 SDValue N1 = V.getOperand(1);
41844 unsigned Imm = V.getConstantOperandVal(2);
41845 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41846 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41847 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41848 return SDValue();
41849 Imm = llvm::rotl<uint8_t>(Imm, 4);
41850 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41851 DAG.getTargetConstant(Imm, DL, MVT::i8));
41852 };
41853
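// Worked example (illustrative only): for v4f32, shufps builds
// {LHS[i0],LHS[i1],RHS[i2],RHS[i3]} from four 2-bit indices. Swapping the
// operands and rotating the 8-bit immediate by 4 keeps the same elements but
// swaps the two 64-bit halves of the result within each 128-bit lane, which
// is why the callers below XOR the outer shuffle immediate with
// 0xAA/0x0A/0xA0 to bump the affected 2-bit indices by 2 and compensate.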
41854 switch (N.getOpcode()) {
41855 case X86ISD::VPERMILPI:
41856 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41857 unsigned Imm = N.getConstantOperandVal(1);
41858 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41859 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41860 }
41861 break;
41862 case X86ISD::SHUFP: {
41863 SDValue N0 = N.getOperand(0);
41864 SDValue N1 = N.getOperand(1);
41865 unsigned Imm = N.getConstantOperandVal(2);
41866 if (N0 == N1) {
41867 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41868 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41869 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41870 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41871 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41872 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41873 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41874 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41875 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41876 }
41877 break;
41878 }
41879 }
41880
41881 return SDValue();
41882}
41883
41884// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
41885// iff we don't demand the same element index for both X and Y.
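// Worked example (masks are illustrative only), with v4i32 X and Y:
//   blend(permute(X,<1,0,3,2>), permute(Y,<2,2,0,0>), blendmask <0,5,2,7>)
// demands X[1],X[3] and Y[0],Y[2]; no element index is demanded from both
// sources, so it can be rewritten as
//   permute(blend(X,Y,<4,1,6,3>), <1,2,3,0>).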
41886static SDValue
41887combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
41888 const APInt &DemandedElts, SelectionDAG &DAG,
41889 const X86Subtarget &Subtarget, const SDLoc &DL) {
41890 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
41891 if (!N0.hasOneUse() || !N1.hasOneUse())
41892 return SDValue();
41893
41894 unsigned NumElts = VT.getVectorNumElements();
41895 SDValue BC0 = peekThroughOneUseBitcasts(N0);
41896 SDValue BC1 = peekThroughOneUseBitcasts(N1);
41897
41898 // See if both operands are shuffles, and that we can scale the shuffle masks
41899 // to the same width as the blend mask.
41900 // TODO: Support SM_SentinelZero?
41901 SmallVector<SDValue, 2> Ops0, Ops1;
41902 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
41903 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
41904 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
41905 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
41906 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
41907 return SDValue();
41908
41909 // Determine the demanded elts from both permutes.
41910 APInt Demanded0, DemandedLHS0, DemandedRHS0;
41911 APInt Demanded1, DemandedLHS1, DemandedRHS1;
41912 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
41913 Demanded1,
41914 /*AllowUndefElts=*/true) ||
41915 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
41916 DemandedRHS0, /*AllowUndefElts=*/true) ||
41917 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
41918 DemandedRHS1, /*AllowUndefElts=*/true))
41919 return SDValue();
41920
41921 // Confirm that we only use a single operand from both permutes and that we
41922 // don't demand the same index from both.
41923 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
41924 DemandedLHS0.intersects(DemandedLHS1))
41925 return SDValue();
41926
41927 // Use the permute demanded elts masks as the new blend mask.
41928 // Create the new permute mask as a blend of the 2 original permute masks.
41929 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
41930 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
41931 for (unsigned I = 0; I != NumElts; ++I) {
41932 if (Demanded0[I]) {
41933 int M = ScaledMask0[I];
41934 if (0 <= M) {
41935 assert(isUndefOrEqual(NewBlendMask[M], M) &&
41936 "BlendMask demands LHS AND RHS");
41937 NewBlendMask[M] = M;
41938 NewPermuteMask[I] = M;
41939 }
41940 } else if (Demanded1[I]) {
41941 int M = ScaledMask1[I];
41942 if (0 <= M) {
41943 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
41944 "BlendMask demands LHS AND RHS");
41945 NewBlendMask[M] = M + NumElts;
41946 NewPermuteMask[I] = M;
41947 }
41948 }
41949 }
41950 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
41951 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
41952
41953 // v16i16 shuffles can explode in complexity very easily; only accept them if
41954 // the blend mask is the same in the 128-bit subvectors (or can widen to
41955 // v8i32) and the permute can be widened as well.
41956 if (VT == MVT::v16i16) {
41957 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
41958 !canWidenShuffleElements(NewBlendMask))
41959 return SDValue();
41960 if (!canWidenShuffleElements(NewPermuteMask))
41961 return SDValue();
41962 }
41963
41964 // Don't introduce lane-crossing permutes without AVX2, unless it can be
41965 // widened to a lane permute (vperm2f128).
41966 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
41967 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
41968 NewPermuteMask) &&
41969 !canScaleShuffleElements(NewPermuteMask, 2))
41970 return SDValue();
41971
41972 SDValue NewBlend =
41973 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
41974 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
41975 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
41976 NewPermuteMask);
41977}
41978
41979// TODO - move this to TLI like isBinOp?
41980static bool isUnaryOp(unsigned Opcode) {
41981 switch (Opcode) {
41982 case ISD::CTLZ:
41983 case ISD::CTTZ:
41984 case ISD::CTPOP:
41985 return true;
41986 }
41987 return false;
41988}
41989
41990// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41991// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
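// Worked example (illustrative only): pshufd(and(x, splat_c)) can become
// and(pshufd(x), pshufd(splat_c)); the shuffle of the splat constant folds
// away, leaving the shuffle directly on x where it can keep combining with
// whatever produced x.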
41992static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
41993 const SDLoc &DL) {
41994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41995 EVT ShuffleVT = N.getValueType();
41996 unsigned Opc = N.getOpcode();
41997
41998 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true) {
41999 // AllZeros/AllOnes constants are freely shuffled and will peek through
42000 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
42001 // merge with target shuffles if it has one use so shuffle combining is
42002 // likely to kick in. Shuffles of splats are expected to be removed.
42003 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
42004 ISD::isBuildVectorAllZeros(Op.getNode()) ||
42005 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
42006 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
42007 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
42008 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
42009 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
42010 (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
42011 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
42012 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
42013 };
42014 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
42015 // Ensure we only shuffle whole vector src elements, unless its a logical
42016 // binops where we can more aggressively move shuffles from dst to src.
42017 return isLogicOp(BinOp) ||
42018 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
42019 };
42020
42021 switch (Opc) {
42022 // Unary and Unary+Permute Shuffles.
42023 case X86ISD::PSHUFB: {
42024 // Don't merge PSHUFB if it contains zero'd elements.
42025 SmallVector<int> Mask;
42026 SmallVector<SDValue> Ops;
42027 if (!getTargetShuffleMask(N, false, Ops, Mask))
42028 break;
42029 [[fallthrough]];
42030 }
42031 case X86ISD::VBROADCAST:
42032 case X86ISD::MOVDDUP:
42033 case X86ISD::PSHUFD:
42034 case X86ISD::PSHUFHW:
42035 case X86ISD::PSHUFLW:
42036 case X86ISD::VPERMV:
42037 case X86ISD::VPERMI:
42038 case X86ISD::VPERMILPI: {
42039 unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0;
42040 if (N.getOperand(SrcIdx).getValueType() == ShuffleVT &&
42041 N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) {
42042 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx));
42043 unsigned SrcOpcode = N0.getOpcode();
42044 EVT OpVT = N0.getValueType();
42045 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
42046 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42047 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42048 bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV;
42049 if (IsMergeableWithShuffle(Op00, FoldShuf) ||
42050 IsMergeableWithShuffle(Op01, FoldShuf)) {
42051 SDValue LHS, RHS;
42052 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42053 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42054 if (Opc == X86ISD::VPERMV) {
42055 LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00);
42056 RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01);
42057 } else if (N.getNumOperands() == 2) {
42058 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
42059 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
42060 } else {
42061 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
42062 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
42063 }
42064 return DAG.getBitcast(ShuffleVT,
42065 DAG.getNode(SrcOpcode, DL, OpVT,
42066 DAG.getBitcast(OpVT, LHS),
42067 DAG.getBitcast(OpVT, RHS)));
42068 }
42069 }
42070 if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) &&
42071 OpVT.getScalarSizeInBits() ==
42072 N0.getOperand(0).getScalarValueSizeInBits()) {
42073 SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0));
42074 if (Opc == X86ISD::VPERMV)
42075 Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res);
42076 else if (N.getNumOperands() == 2)
42077 Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1));
42078 else
42079 Res = DAG.getNode(Opc, DL, ShuffleVT, Res);
42080 Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res);
42081 return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res));
42082 }
42083 }
42084 break;
42085 }
42086 // Binary and Binary+Permute Shuffles.
42087 case X86ISD::INSERTPS: {
42088 // Don't merge INSERTPS if it contains zero'd elements.
42089 unsigned InsertPSMask = N.getConstantOperandVal(2);
42090 unsigned ZeroMask = InsertPSMask & 0xF;
42091 if (ZeroMask != 0)
42092 break;
42093 [[fallthrough]];
42094 }
42095 case X86ISD::MOVSD:
42096 case X86ISD::MOVSS:
42097 case X86ISD::BLENDI:
42098 case X86ISD::SHUFP:
42099 case X86ISD::UNPCKH:
42100 case X86ISD::UNPCKL: {
42101 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
42102 N->isOnlyUserOf(N.getOperand(1).getNode())) {
42103 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
42104 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
42105 unsigned SrcOpcode = N0.getOpcode();
42106 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42107 N0.getValueType() == N1.getValueType() &&
42108 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42109 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42110 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42111 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42112 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
42113 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
42114 // Ensure the total number of shuffles doesn't increase by folding this
42115 // shuffle through to the source ops.
42116 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
42117 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
42118 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
42119 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
42120 SDValue LHS, RHS;
42121 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42122 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42123 Op01 = DAG.getBitcast(ShuffleVT, Op01);
42124 Op11 = DAG.getBitcast(ShuffleVT, Op11);
42125 if (N.getNumOperands() == 3) {
42126 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42127 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
42128 } else {
42129 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42130 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
42131 }
42132 EVT OpVT = N0.getValueType();
42133 return DAG.getBitcast(ShuffleVT,
42134 DAG.getNode(SrcOpcode, DL, OpVT,
42135 DAG.getBitcast(OpVT, LHS),
42136 DAG.getBitcast(OpVT, RHS)));
42137 }
42138 }
42139 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
42140 N0.getValueType() == N1.getValueType() &&
42141 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42142 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42143 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
42144 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
42145 SDValue Res;
42146 Op00 = DAG.getBitcast(ShuffleVT, Op00);
42147 Op10 = DAG.getBitcast(ShuffleVT, Op10);
42148 if (N.getNumOperands() == 3) {
42149 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
42150 } else {
42151 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
42152 }
42153 EVT OpVT = N0.getValueType();
42154 return DAG.getBitcast(
42155 ShuffleVT,
42156 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
42157 }
42158 // TODO: We can generalize this for other shuffles/conversions.
42159 if (Opc == X86ISD::UNPCKL && SrcOpcode == X86ISD::CVTPH2PS &&
42160 N1.getOpcode() == SrcOpcode &&
42161 N0.getValueType() == N1.getValueType() &&
42162 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
42163 ShuffleVT.getScalarSizeInBits() == N0.getScalarValueSizeInBits() &&
42164 IsSafeToMoveShuffle(N0, SrcOpcode) &&
42165 IsSafeToMoveShuffle(N1, SrcOpcode)) {
42166 EVT OpSrcVT = N0.getOperand(0).getValueType();
42167 EVT OpDstVT = N0.getValueType();
42168 SDValue Res =
42169 DAG.getNode(Opc, DL, OpSrcVT, N0.getOperand(0), N1.getOperand(0));
42170 return DAG.getBitcast(ShuffleVT,
42171 DAG.getNode(SrcOpcode, DL, OpDstVT, Res));
42172 }
42173 }
42174 break;
42175 }
42176 }
42177 return SDValue();
42178}
42179
42180/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
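// Worked example (illustrative only):
//   vperm2x128(vsrli(x,c), vsrli(y,c), imm) -> vsrli(vperm2x128(x,y,imm), c)
// so the lane shuffle lands directly on the original sources and can keep
// combining with them.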
42181static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
42182 SelectionDAG &DAG,
42183 const SDLoc &DL) {
42184 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
42185
42186 MVT VT = V.getSimpleValueType();
42187 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
42188 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
42189 unsigned SrcOpc0 = Src0.getOpcode();
42190 unsigned SrcOpc1 = Src1.getOpcode();
42191 EVT SrcVT0 = Src0.getValueType();
42192 EVT SrcVT1 = Src1.getValueType();
42193
42194 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
42195 return SDValue();
42196
42197 switch (SrcOpc0) {
42198 case X86ISD::MOVDDUP: {
42199 SDValue LHS = Src0.getOperand(0);
42200 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42201 SDValue Res =
42202 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
42203 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
42204 return DAG.getBitcast(VT, Res);
42205 }
42206 case X86ISD::VPERMILPI:
42207 // TODO: Handle v4f64 permutes with different low/high lane masks.
42208 if (SrcVT0 == MVT::v4f64) {
42209 uint64_t Mask = Src0.getConstantOperandVal(1);
42210 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
42211 break;
42212 }
42213 [[fallthrough]];
42214 case X86ISD::VSHLI:
42215 case X86ISD::VSRLI:
42216 case X86ISD::VSRAI:
42217 case X86ISD::PSHUFD:
42218 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
42219 SDValue LHS = Src0.getOperand(0);
42220 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
42221 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
42222 V.getOperand(2));
42223 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
42224 return DAG.getBitcast(VT, Res);
42225 }
42226 break;
42227 }
42228
42229 return SDValue();
42230}
42231
42232/// Try to combine x86 target specific shuffles.
42233static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
42234 SelectionDAG &DAG,
42235 TargetLowering::DAGCombinerInfo &DCI,
42236 const X86Subtarget &Subtarget) {
42237 using namespace SDPatternMatch;
42238
42239 MVT VT = N.getSimpleValueType();
42240 unsigned NumElts = VT.getVectorNumElements();
42241 SmallVector<int, 4> Mask;
42242 unsigned Opcode = N.getOpcode();
42243 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42244
42245 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
42246 return R;
42247
42248 // Handle specific target shuffles.
42249 switch (Opcode) {
42250 case X86ISD::MOVDDUP: {
42251 SDValue Src = N.getOperand(0);
42252 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
42253 if (VT == MVT::v2f64 && Src.hasOneUse() &&
42254 ISD::isNormalLoad(Src.getNode())) {
42255 LoadSDNode *LN = cast<LoadSDNode>(Src);
42256 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
42257 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
42258 DCI.CombineTo(N.getNode(), Movddup);
42259 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42260 DCI.recursivelyDeleteUnusedNodes(LN);
42261 return N; // Return N so it doesn't get rechecked!
42262 }
42263 }
42264
42265 return SDValue();
42266 }
42267 case X86ISD::VBROADCAST: {
42268 SDValue Src = N.getOperand(0);
42269 SDValue BC = peekThroughBitcasts(Src);
42270 EVT SrcVT = Src.getValueType();
42271 EVT BCVT = BC.getValueType();
42272
42273 // If broadcasting from another shuffle, attempt to simplify it.
42274 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
42275 if (isTargetShuffle(BC.getOpcode()) &&
42276 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
42277 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
42278 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
42279 SM_SentinelUndef);
42280 for (unsigned i = 0; i != Scale; ++i)
42281 DemandedMask[i] = i;
42282 if (SDValue Res = combineX86ShufflesRecursively(
42283 {BC}, 0, BC.getOpcode(), BC.getSimpleValueType(), DemandedMask,
42284 {}, /*Depth=*/0, X86::MaxShuffleCombineDepth,
42285 /*AllowVariableCrossLaneMask=*/true,
42286 /*AllowVariablePerLaneMask=*/true,
42287 /*IsMaskedShuffle=*/false, DAG, DL, Subtarget))
42288 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42289 DAG.getBitcast(SrcVT, Res));
42290 }
42291
42292 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
42293 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
42294 if (Src.getOpcode() == ISD::BITCAST &&
42295 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
42296 TLI.isTypeLegal(BCVT) &&
42297 FixedVectorType::isValidElementType(
42298 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
42299 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
42300 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42301 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42302 }
42303
42304 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
42305 // If we're re-broadcasting a smaller type then broadcast with that type and
42306 // bitcast.
42307 // TODO: Do this for any splat?
42308 if (Src.getOpcode() == ISD::BITCAST &&
42309 (BC.getOpcode() == X86ISD::VBROADCAST ||
42310 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
42311 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
42312 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
42313 MVT NewVT =
42314 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
42315 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
42316 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
42317 }
42318
42319 // Reduce broadcast source vector to lowest 128-bits.
42320 if (SrcVT.getSizeInBits() > 128)
42321 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
42322 extract128BitVector(Src, 0, DAG, DL));
42323
42324 // broadcast(scalar_to_vector(x)) -> broadcast(x).
42325 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42326 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
42327 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42328
42329 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
42330 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
42331 isNullConstant(Src.getOperand(1)) &&
42332 Src.getValueType() ==
42333 Src.getOperand(0).getValueType().getScalarType() &&
42334 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
42335 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
42336
42337 // Share broadcast with the longest vector and extract low subvector (free).
42338 // Ensure the same SDValue from the SDNode use is being used.
42339 for (SDNode *User : Src->users())
42340 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
42341 Src == User->getOperand(0) &&
42342 User->getValueSizeInBits(0).getFixedValue() >
42343 VT.getFixedSizeInBits()) {
42344 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
42345 VT.getSizeInBits());
42346 }
42347
42348 // vbroadcast(scalarload X) -> vbroadcast_load X
42349 // For float loads, extract other uses of the scalar from the broadcast.
42350 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
42351 ISD::isNormalLoad(Src.getNode())) {
42352 LoadSDNode *LN = cast<LoadSDNode>(Src);
42353 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42354 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42355 SDValue BcastLd =
42356 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42357 LN->getMemoryVT(), LN->getMemOperand());
42358 // If the load value is used only by N, replace it via CombineTo N.
42359 bool NoReplaceExtract = Src.hasOneUse();
42360 DCI.CombineTo(N.getNode(), BcastLd);
42361 if (NoReplaceExtract) {
42362 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42363 DCI.recursivelyDeleteUnusedNodes(LN);
42364 } else {
42365 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
42366 DAG.getVectorIdxConstant(0, DL));
42367 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
42368 }
42369 return N; // Return N so it doesn't get rechecked!
42370 }
42371
42372 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
42373 // i16. So shrink it ourselves if we can make a broadcast_load.
42374 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
42375 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
42376 assert(Subtarget.hasAVX2() && "Expected AVX2");
42377 SDValue TruncIn = Src.getOperand(0);
42378
42379 // If this is a truncate of a non extending load we can just narrow it to
42380 // use a broadcast_load.
42381 if (ISD::isNormalLoad(TruncIn.getNode())) {
42382 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
42384 // Unless it's volatile or atomic.
42384 if (LN->isSimple()) {
42385 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42386 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42387 SDValue BcastLd = DAG.getMemIntrinsicNode(
42388 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42389 LN->getPointerInfo(), LN->getBaseAlign(),
42390 LN->getMemOperand()->getFlags());
42391 DCI.CombineTo(N.getNode(), BcastLd);
42392 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42393 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42394 return N; // Return N so it doesn't get rechecked!
42395 }
42396 }
42397
42398 // If this is a truncate of an i16 extload, we can directly replace it.
42399 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
42400 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
42401 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
42402 if (LN->getMemoryVT().getSizeInBits() == 16) {
42403 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42404 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42405 SDValue BcastLd =
42406 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42407 LN->getMemoryVT(), LN->getMemOperand());
42408 DCI.CombineTo(N.getNode(), BcastLd);
42409 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42410 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42411 return N; // Return N so it doesn't get rechecked!
42412 }
42413 }
42414
42415 // If this is a truncate of a load that has been shifted right, we can
42416 // offset the pointer and use a narrower load.
42417 if (TruncIn.getOpcode() == ISD::SRL &&
42418 TruncIn.getOperand(0).hasOneUse() &&
42419 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
42420 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
42421 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
42422 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
42423 // Make sure the shift amount and the load size are divisible by 16.
42424 // Don't do this if the load is volatile or atomic.
42425 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
42426 LN->isSimple()) {
42427 unsigned Offset = ShiftAmt / 8;
42428 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42431 SDValue Ops[] = { LN->getChain(), Ptr };
42432 SDValue BcastLd = DAG.getMemIntrinsicNode(
42433 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
42435 LN->getMemOperand()->getFlags());
42436 DCI.CombineTo(N.getNode(), BcastLd);
42437 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42438 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
42439 return N; // Return N so it doesn't get rechecked!
42440 }
42441 }
42442 }
42443
42444 // vbroadcast(vzload X) -> vbroadcast_load X
42445 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
42446 auto *LN = cast<MemSDNode>(Src);
42447 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
42448 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42449 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
42450 SDValue BcastLd =
42451 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
42452 LN->getMemoryVT(), LN->getMemOperand());
42453 DCI.CombineTo(N.getNode(), BcastLd);
42454 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42455 DCI.recursivelyDeleteUnusedNodes(LN);
42456 return N; // Return N so it doesn't get rechecked!
42457 }
42458 }
42459
42460 // vbroadcast(vector load X) -> vbroadcast_load
42461 if (Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
42462 LoadSDNode *LN = cast<LoadSDNode>(Src);
42463 // Unless the load is volatile or atomic.
42464 if (LN->isSimple()) {
42465 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42466 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42467 SDValue BcastLd = DAG.getMemIntrinsicNode(
42469 LN->getPointerInfo(), LN->getBaseAlign(),
42470 LN->getMemOperand()->getFlags());
42471 DCI.CombineTo(N.getNode(), BcastLd);
42472 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
42473 DCI.recursivelyDeleteUnusedNodes(LN);
42474 return N; // Return N so it doesn't get rechecked!
42475 }
42476 }
42477
42478 return SDValue();
42479 }
42480 case X86ISD::VZEXT_MOVL: {
42481 SDValue N0 = N.getOperand(0);
42482
42483 // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
42484 // Zeroing out the upper elements means we're just shifting a zero value.
42485 // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
42486 // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
42487 if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
42488 N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
42489 N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
42490 if (N0.hasOneUse())
42491 return DAG.getNode(
42492 N0.getOpcode(), DL, VT,
42493 DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
42494 N0.getOperand(1));
42495 }
42496
42497 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
42498 // the load is volatile.
42499 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
42500 auto *LN = cast<LoadSDNode>(N0);
42501 if (SDValue VZLoad =
42502 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
42503 DCI.CombineTo(N.getNode(), VZLoad);
42504 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42505 DCI.recursivelyDeleteUnusedNodes(LN);
42506 return N;
42507 }
42508 }
42509
42510 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
42511 // and can just use a VZEXT_LOAD.
42512 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
42513 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
42514 auto *LN = cast<MemSDNode>(N0);
42515 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
42516 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
42517 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
42518 SDValue VZLoad =
42519 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
42520 LN->getMemoryVT(), LN->getMemOperand());
42521 DCI.CombineTo(N.getNode(), VZLoad);
42522 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
42523 DCI.recursivelyDeleteUnusedNodes(LN);
42524 return N;
42525 }
42526 }
42527
42528 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
42529 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
42530 // if the upper bits of the i64 are zero.
42531 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
42532 N0.getOperand(0).hasOneUse() &&
42533 N0.getOperand(0).getValueType() == MVT::i64) {
42534 SDValue In = N0.getOperand(0);
42535 APInt Mask = APInt::getHighBitsSet(64, 32);
42536 if (DAG.MaskedValueIsZero(In, Mask)) {
42537 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
42538 MVT VecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
42539 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
42540 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
42541 return DAG.getBitcast(VT, Movl);
42542 }
42543 }
42544
42545 // Load a scalar integer constant directly to XMM instead of transferring an
42546 // immediate value from GPR.
42547 // vzext_movl (scalar_to_vector C) --> load [C,0...]
42548 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
42549 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
42550 // Create a vector constant - scalar constant followed by zeros.
42551 EVT ScalarVT = N0.getOperand(0).getValueType();
42552 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
42553 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
42554 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
42555 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
42556
42557 // Load the vector constant from constant pool.
42558 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
42559 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
42560 MachinePointerInfo MPI =
42561 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
42562 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
42563 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
42564 MachineMemOperand::MOLoad);
42565 }
42566 }
42567
42568 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
42569 // insert into a zero vector. This helps get VZEXT_MOVL closer to
42570 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
42571 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
42572 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
42573 SDValue V = peekThroughOneUseBitcasts(N0);
42574
42575 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
42576 isNullConstant(V.getOperand(2))) {
42577 SDValue In = V.getOperand(1);
42578 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
42579 In.getValueSizeInBits() /
42580 VT.getScalarSizeInBits());
42581 In = DAG.getBitcast(SubVT, In);
42582 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
42583 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
42584 getZeroVector(VT, Subtarget, DAG, DL), Movl,
42585 V.getOperand(2));
42586 }
42587 }
42588
42589 return SDValue();
42590 }
42591 case X86ISD::BLENDI: {
42592 SDValue N0 = N.getOperand(0);
42593 SDValue N1 = N.getOperand(1);
42594 unsigned EltBits = VT.getScalarSizeInBits();
42595
42596 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
42597 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
42598 // TODO: Handle MVT::v16i16 repeated blend mask.
42599 if (N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
42600 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
42601 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42602 if ((EltBits % SrcBits) == 0 && SrcBits >= 32) {
42603 unsigned NewSize = SrcVT.getVectorNumElements();
42604 APInt BlendMask = getBLENDIBlendMask(N);
42605 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
42606 return DAG.getBitcast(
42607 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
42608 N1.getOperand(0),
42609 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
42610 DL, MVT::i8)));
42611 }
42612 }
42613 // Share PSHUFB masks:
42614 // blend(pshufb(x,m1),pshufb(y,m2))
42615 // --> m3 = blend(m1,m2)
42616 // blend(pshufb(x,m3),pshufb(y,m3))
42617 if (N0.hasOneUse() && N1.hasOneUse()) {
42618 SmallVector<int> Mask, ByteMask;
42619 SmallVector<SDValue> Ops;
42620 SDValue LHS = peekThroughOneUseBitcasts(N0);
42621 SDValue RHS = peekThroughOneUseBitcasts(N1);
42622 if (LHS.getOpcode() == X86ISD::PSHUFB &&
42623 RHS.getOpcode() == X86ISD::PSHUFB &&
42624 LHS.getOperand(1) != RHS.getOperand(1) &&
42625 LHS.getOperand(1).hasOneUse() && RHS.getOperand(1).hasOneUse() &&
42626 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, Ops, Mask)) {
42627 assert(Ops.size() == 2 && LHS == peekThroughOneUseBitcasts(Ops[0]) &&
42628 RHS == peekThroughOneUseBitcasts(Ops[1]) &&
42629 "BLENDI decode mismatch");
42630 MVT ShufVT = LHS.getSimpleValueType();
42631 SDValue MaskLHS = LHS.getOperand(1);
42632 SDValue MaskRHS = RHS.getOperand(1);
42633 llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask);
42634 if (SDValue NewMask = combineX86ShufflesConstants(
42635 ShufVT, {MaskLHS, MaskRHS}, ByteMask,
42636 {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) {
42637 SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42638 LHS.getOperand(0), NewMask);
42639 SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT,
42640 RHS.getOperand(0), NewMask);
42641 return DAG.getNode(X86ISD::BLENDI, DL, VT,
42642 DAG.getBitcast(VT, NewLHS),
42643 DAG.getBitcast(VT, NewRHS), N.getOperand(2));
42644 }
42645 }
42646 }
42647 }
42648 return SDValue();
42649 }
42650 case X86ISD::SHUFP: {
42651 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
42652 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
42653 // TODO: Support types other than v4f32.
42654 if (VT == MVT::v4f32) {
42655 bool Updated = false;
42656 SmallVector<int> Mask;
42657 SmallVector<SDValue> Ops;
42658 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
42659 for (int i = 0; i != 2; ++i) {
42660 SmallVector<SDValue> SubOps;
42661 SmallVector<int> SubMask, SubScaledMask;
42662 SDValue Sub = peekThroughBitcasts(Ops[i]);
42663 // TODO: Scaling might be easier if we specify the demanded elts.
42664 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
42665 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
42666 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
42667 int Ofs = i * 2;
42668 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
42669 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
42670 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
42671 Updated = true;
42672 }
42673 }
42674 }
42675 if (Updated) {
42676 for (int &M : Mask)
42677 M %= 4;
42678 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42679 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42680 }
42681 }
42682 return SDValue();
42683 }
42684 case X86ISD::VPERMI: {
42685 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42686 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42687 SDValue N0 = N.getOperand(0);
42688 SDValue N1 = N.getOperand(1);
42689 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42690 if (N0.getOpcode() == ISD::BITCAST &&
42691 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42692 SDValue Src = N0.getOperand(0);
42693 EVT SrcVT = Src.getValueType();
42694 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42695 return DAG.getBitcast(VT, Res);
42696 }
42697 return SDValue();
42698 }
42699 case X86ISD::SHUF128: {
42700 // If we're permuting the upper 256-bit subvectors of a concatenation, then
42701 // see if we can peek through and access the subvector directly.
42702 if (VT.is512BitVector()) {
42703 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
42704 // the upper subvector is used.
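// Reading of the immediate checks below (illustrative only): each 2-bit index
// selects a 128-bit lane, so (Mask & 0x0A) == 0x0A means both LHS indices
// have their high bit set (values 2 or 3), i.e. only the upper 256 bits of
// LHS are referenced; that half can be widened to 512 bits and the high index
// bits cleared. The 0xA0 test does the same for RHS.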
42705 SDValue LHS = peekThroughBitcasts(N->getOperand(0));
42706 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
42707 uint64_t Mask = N->getConstantOperandVal(2);
42708 SmallVector<SDValue> LHSOps, RHSOps;
42709 SDValue NewLHS, NewRHS;
42710 if ((Mask & 0x0A) == 0x0A &&
42711 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
42712 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
42713 Mask &= ~0x0A;
42714 }
42715 if ((Mask & 0xA0) == 0xA0 &&
42716 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
42717 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
42718 Mask &= ~0xA0;
42719 }
42720 if (NewLHS || NewRHS)
42721 return DAG.getNode(X86ISD::SHUF128, DL, VT,
42722 DAG.getBitcast(VT, NewLHS ? NewLHS : LHS),
42723 DAG.getBitcast(VT, NewRHS ? NewRHS : RHS),
42724 DAG.getTargetConstant(Mask, DL, MVT::i8));
42725 }
42726 return SDValue();
42727 }
42728 case X86ISD::VPERM2X128: {
42729 SDValue LHS = N->getOperand(0);
42730 SDValue RHS = N->getOperand(1);
42731 unsigned Imm = N.getConstantOperandVal(2) & 255;
42732
42733 // Canonicalize unary/repeated operands to LHS.
42734 if (LHS.isUndef() && !RHS.isUndef())
42735 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, RHS, LHS,
42736 DAG.getTargetConstant(Imm ^ 0x22, DL, MVT::i8));
42737 if (LHS == RHS)
42738 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, LHS, DAG.getUNDEF(VT),
42739 DAG.getTargetConstant(Imm & ~0x22, DL, MVT::i8));
42740
42741 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42742 if (LHS.getOpcode() == ISD::BITCAST &&
42743 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42744 EVT SrcVT = LHS.getOperand(0).getValueType();
42745 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42746 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42747 DAG.getBitcast(SrcVT, LHS),
42748 DAG.getBitcast(SrcVT, RHS),
42749 N->getOperand(2)));
42750 }
42751 }
42752
42753 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42754 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42755 return Res;
42756
42757 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42758 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
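// Worked example (immediate chosen for illustration): with Imm = 0x31 the low
// nibble selects 128-bit subvector 1 (Y) and the high nibble selects
// subvector 3 (W), so vperm2x128(concat(X,Y),concat(Z,W),0x31) -> concat(Y,W).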
42759 auto FindSubVector128 = [&](unsigned Idx) {
42760 if (Idx > 3)
42761 return SDValue();
42762 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42763 SmallVector<SDValue> SubOps;
42764 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42765 return SubOps[Idx & 1];
42766 unsigned NumElts = Src.getValueType().getVectorNumElements();
42767 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42768 Src.getOperand(1).getValueSizeInBits() == 128 &&
42769 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42770 return Src.getOperand(1);
42771 }
42772 return SDValue();
42773 };
42774 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42775 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42776 MVT SubVT = VT.getHalfNumVectorElementsVT();
42777 SubLo = DAG.getBitcast(SubVT, SubLo);
42778 SubHi = DAG.getBitcast(SubVT, SubHi);
42779 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42780 }
42781 }
42782
42783 // Attempt to match VBROADCAST*128 subvector broadcast load.
42784 if (RHS.isUndef()) {
42785 SmallVector<int, 4> Mask;
42786 DecodeVPERM2X128Mask(4, Imm, Mask);
42787 if (isUndefOrInRange(Mask, 0, 4)) {
42788 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
42789 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
42790 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
42791 X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
42792 MVT MemVT = VT.getHalfNumVectorElementsVT();
42793 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
42794 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
42795 cast<LoadSDNode>(LHS), Ofs, DAG);
42796 }
42797 }
42798 }
42799
42800 return SDValue();
42801 }
42802 case X86ISD::PSHUFD:
42803 case X86ISD::PSHUFLW:
42804 case X86ISD::PSHUFHW: {
42805 SDValue N0 = N.getOperand(0);
42806 SDValue N1 = N.getOperand(1);
42807 if (N0->hasOneUse()) {
42808 SDValue V = peekThroughOneUseBitcasts(N0);
42809 switch (V.getOpcode()) {
42810 case X86ISD::VSHL:
42811 case X86ISD::VSRL:
42812 case X86ISD::VSRA:
42813 case X86ISD::VSHLI:
42814 case X86ISD::VSRLI:
42815 case X86ISD::VSRAI:
42816 case X86ISD::VROTLI:
42817 case X86ISD::VROTRI: {
42818 MVT InnerVT = V.getSimpleValueType();
42819 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
42820 SDValue Res = DAG.getNode(Opcode, DL, VT,
42821 DAG.getBitcast(VT, V.getOperand(0)), N1);
42822 Res = DAG.getBitcast(InnerVT, Res);
42823 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
42824 return DAG.getBitcast(VT, Res);
42825 }
42826 break;
42827 }
42828 }
42829 }
42830
42831 Mask = getPSHUFShuffleMask(N);
42832 assert(Mask.size() == 4);
42833 break;
42834 }
42835 case X86ISD::MOVSD:
42836 case X86ISD::MOVSH:
42837 case X86ISD::MOVSS: {
42838 SDValue N0 = N.getOperand(0);
42839 SDValue N1 = N.getOperand(1);
42840
42841 // Canonicalize scalar FPOps:
42842 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42843 // If commutable, allow OP(N1[0], N0[0]).
42844 unsigned Opcode1 = N1.getOpcode();
42845 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42846 Opcode1 == ISD::FDIV) {
42847 SDValue N10 = N1.getOperand(0);
42848 SDValue N11 = N1.getOperand(1);
42849 if (N10 == N0 ||
42850 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42851 if (N10 != N0)
42852 std::swap(N10, N11);
42853 MVT SVT = VT.getVectorElementType();
42854 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
42855 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42856 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42857 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42858 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42859 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42860 }
42861 }
42862
42863 return SDValue();
42864 }
42865 case X86ISD::INSERTPS: {
42866 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42867 SDValue Op0 = N.getOperand(0);
42868 SDValue Op1 = N.getOperand(1);
42869 unsigned InsertPSMask = N.getConstantOperandVal(2);
42870 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42871 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42872 unsigned ZeroMask = InsertPSMask & 0xF;
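// Immediate layout reminder (example value for illustration): the insertps
// control is [src:2][dst:2][zeromask:4], e.g. 0x58 copies Op1[1] into lane 1
// of Op0 and zeroes lane 3 of the result.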
42873
42874 // If we zero out all elements from Op0 then we don't need to reference it.
42875 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42876 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42877 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42878
42879 // If we zero out the element from Op1 then we don't need to reference it.
42880 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42881 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42882 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42883
42884 // Attempt to merge insertps Op1 with an inner target shuffle node.
42885 SmallVector<int, 8> TargetMask1;
42886 SmallVector<SDValue, 2> Ops1;
42887 APInt KnownUndef1, KnownZero1;
42888 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42889 KnownZero1)) {
42890 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42891 // Zero/UNDEF insertion - zero out element and remove dependency.
42892 InsertPSMask |= (1u << DstIdx);
42893 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42894 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42895 }
42896 // Update insertps mask srcidx and reference the source input directly.
42897 int M = TargetMask1[SrcIdx];
42898 assert(0 <= M && M < 8 && "Shuffle index out of range");
42899 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42900 Op1 = Ops1[M < 4 ? 0 : 1];
42901 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42902 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42903 }
42904
42905 // Attempt to merge insertps Op0 with an inner target shuffle node.
42906 SmallVector<int, 8> TargetMask0;
42907 SmallVector<SDValue, 2> Ops0;
42908 APInt KnownUndef0, KnownZero0;
42909 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42910 KnownZero0)) {
42911 bool Updated = false;
42912 bool UseInput00 = false;
42913 bool UseInput01 = false;
42914 for (int i = 0; i != 4; ++i) {
42915 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42916 // No change if element is already zero or the inserted element.
42917 continue;
42918 }
42919
42920 if (KnownUndef0[i] || KnownZero0[i]) {
42921 // If the target mask is undef/zero then we must zero the element.
42922 InsertPSMask |= (1u << i);
42923 Updated = true;
42924 continue;
42925 }
42926
42927 // The input vector element must be inline.
42928 int M = TargetMask0[i];
42929 if (M != i && M != (i + 4))
42930 return SDValue();
42931
42932 // Determine which inputs of the target shuffle we're using.
42933 UseInput00 |= (0 <= M && M < 4);
42934 UseInput01 |= (4 <= M);
42935 }
42936
42937 // If we're not using both inputs of the target shuffle then use the
42938 // referenced input directly.
42939 if (UseInput00 && !UseInput01) {
42940 Updated = true;
42941 Op0 = Ops0[0];
42942 } else if (!UseInput00 && UseInput01) {
42943 Updated = true;
42944 Op0 = Ops0[1];
42945 }
42946
42947 if (Updated)
42948 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42949 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42950 }
42951
42952 // If we're inserting an element from a vbroadcast load, fold the
42953 // load into the X86insertps instruction. We need to convert the scalar
42954 // load to a vector and clear the source lane of the INSERTPS control.
42955 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42956 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42957 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42958 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42959 MemIntr->getBasePtr(),
42960 MemIntr->getMemOperand());
42961 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42962 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42963 Load),
42964 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42965 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42966 return Insert;
42967 }
42968 }
42969
42970 return SDValue();
42971 }
42972 case X86ISD::VPERMV: {
42973 // Combine VPERMV to VPERMV3 if the source operand can be freely split.
42974 SmallVector<int, 32> Mask;
42975 SmallVector<SDValue, 2> SrcOps, SubOps;
42976 SDValue Src = peekThroughBitcasts(N.getOperand(1));
42977 if ((Subtarget.hasVLX() || VT.is512BitVector()) &&
42978 getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask) &&
42979 collectConcatOps(Src.getNode(), SubOps, DAG)) {
42980 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
42981 assert(SrcOps.size() == 1 && "Unexpected shuffle ops");
42982 assert((SubOps.size() == 2 || SubOps.size() == 4) &&
42983 "Unexpected split ops");
42984 // Bail if we were permuting a widened vector.
42985 if (SubOps[1].isUndef() &&
42986 (SubOps.size() == 2 || (SubOps[2].isUndef() && SubOps[3].isUndef())))
42987 return SDValue();
42988 // Bail if any subops would have folded into the concat.
42989 if (any_of(SubOps, isShuffleFoldableLoad))
42990 return SDValue();
42991 // Concat 4x128 back to 2x256.
42992 if (SubOps.size() == 4) {
42993 SubOps[0] = concatSubVectors(SubOps[0], SubOps[1], DAG, DL);
42994 SubOps[1] = concatSubVectors(SubOps[2], SubOps[3], DAG, DL);
42995 }
42996 // Convert mask to 2 operand shuffle.
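// Worked example (8 elements, illustrative only): source index 5 is element 1
// of the upper half, which after the split lives at position NumElts + 1 =
// 5 + HalfElts in the {Lo, Hi} operand pair, hence the adjustment below.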
42997 int HalfElts = NumElts / 2;
42998 for (int &M : Mask)
42999 M += M >= HalfElts ? HalfElts : 0;
43000 SDValue Lo = widenSubVector(SubOps[0], false, Subtarget, DAG, DL,
43001 VT.getSizeInBits());
43002 SDValue Hi = widenSubVector(SubOps[1], false, Subtarget, DAG, DL,
43003 VT.getSizeInBits());
43004 return lowerShuffleWithPERMV(DL, VT, Mask, DAG.getBitcast(VT, Lo),
43005 DAG.getBitcast(VT, Hi), Subtarget, DAG);
43006 }
43007 return SDValue();
43008 }
43009 case X86ISD::VPERMV3: {
43010 MVT WideVT = VT.getDoubleNumVectorElementsVT();
43011 bool CanConcat = VT.is128BitVector() ||
43012 (VT.is256BitVector() && Subtarget.useAVX512Regs());
43013 SmallVector<SDValue, 2> SrcOps;
43014 SmallVector<int, 64> Mask;
43015 if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
43016 assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
43017 SDValue V1 = peekThroughBitcasts(N.getOperand(0));
43018 SDValue V2 = peekThroughBitcasts(N.getOperand(2));
43019 // Canonicalize to VPERMV if both sources are the same.
43020 if (V1 == V2) {
43021 for (int &M : Mask)
43022 M = (M < 0 ? M : (M & (NumElts - 1)));
43023 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(0),
43024 DAG.getUNDEF(VT), Subtarget, DAG);
43025 }
43026 // If sources are half width, then concat and use VPERMV with adjusted
43027 // mask.
43028 SDValue Ops[2];
43029 MVT HalfVT = VT.getHalfNumVectorElementsVT();
43030 if (sd_match(V1,
43032 sd_match(V2,
43034 Ops[0].getValueType() == HalfVT && Ops[1].getValueType() == HalfVT) {
43035 if (SDValue ConcatSrc =
43036 combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget)) {
43037 for (int &M : Mask)
43038 M = (M < (int)NumElts ? M : (M - (NumElts / 2)));
43039 return lowerShuffleWithPERMV(DL, VT, Mask, ConcatSrc,
43040 DAG.getUNDEF(VT), Subtarget, DAG);
43041 }
43042 }
43043 // Commute foldable source to the RHS.
43044 if (isShuffleFoldableLoad(N.getOperand(0)) &&
43045 !isShuffleFoldableLoad(N.getOperand(2))) {
43046 ShuffleVectorSDNode::commuteMask(Mask);
43047 return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2),
43048 N.getOperand(0), Subtarget, DAG);
43049 }
43050 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43051 // freely concatenated, with a commuted shuffle mask.
43052 if (CanConcat) {
43053 if (SDValue ConcatSrc = combineConcatVectorOps(
43054 DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
43055 Subtarget)) {
43056 ShuffleVectorSDNode::commuteMask(Mask);
43057 Mask.append(NumElts, SM_SentinelUndef);
43058 SDValue Perm =
43059 lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
43060 DAG.getUNDEF(WideVT), Subtarget, DAG);
43061 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43062 DAG.getVectorIdxConstant(0, DL));
43063 }
43064 }
43065 }
43066 // Combine VPERMV3 to widened VPERMV if the two source operands can be
43067 // freely concatenated.
43068 if (CanConcat) {
43069 if (SDValue ConcatSrc = combineConcatVectorOps(
43070 DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) {
43071 SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
43072 DL, WideVT.getSizeInBits());
43073 SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
43074 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
43075 DAG.getVectorIdxConstant(0, DL));
43076 }
43077 }
43078 return SDValue();
43079 }
43080 default:
43081 return SDValue();
43082 }
43083
43084 // Nuke no-op shuffles that show up after combining.
43085 if (isNoopShuffleMask(Mask))
43086 return N.getOperand(0);
43087
43088 // Look for simplifications involving one or two shuffle instructions.
43089 SDValue V = N.getOperand(0);
43090 switch (N.getOpcode()) {
43091 default:
43092 break;
43093 case X86ISD::PSHUFLW:
43094 case X86ISD::PSHUFHW:
43095 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
43096
43097 // See if this reduces to a PSHUFD which is no more expensive and can
43098 // combine with more operations. Note that it has to at least flip the
43099 // dwords as otherwise it would have been removed as a no-op.
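// Worked example (illustrative only): pshuflw with word mask <2,3,0,1> swaps
// the two low dwords, which is exactly pshufd with dword mask <1,0,2,3>.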
43100 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
43101 int DMask[] = {0, 1, 2, 3};
43102 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
43103 DMask[DOffset + 0] = DOffset + 1;
43104 DMask[DOffset + 1] = DOffset + 0;
43105 MVT DVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
43106 V = DAG.getBitcast(DVT, V);
43107 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
43108 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
43109 return DAG.getBitcast(VT, V);
43110 }
43111
43112 // Look for shuffle patterns which can be implemented as a single unpack.
43113 // FIXME: This doesn't handle the location of the PSHUFD generically, and
43114 // only works when we have a PSHUFD followed by two half-shuffles.
43115 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
43116 (V.getOpcode() == X86ISD::PSHUFLW ||
43117 V.getOpcode() == X86ISD::PSHUFHW) &&
43118 V.getOpcode() != N.getOpcode() &&
43119 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
43120 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
43121 if (D.getOpcode() == X86ISD::PSHUFD) {
43122 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
43123 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
43124 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43125 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
43126 int WordMask[8];
43127 for (int i = 0; i < 4; ++i) {
43128 WordMask[i + NOffset] = Mask[i] + NOffset;
43129 WordMask[i + VOffset] = VMask[i] + VOffset;
43130 }
43131 // Map the word mask through the DWord mask.
43132 int MappedMask[8];
43133 for (int i = 0; i < 8; ++i)
43134 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
43135 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
43136 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
43137 // We can replace all three shuffles with an unpack.
43138 V = DAG.getBitcast(VT, D.getOperand(0));
43139 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
43140 : X86ISD::UNPCKH,
43141 DL, VT, V, V);
43142 }
43143 }
43144 }
43145
43146 break;
43147
43148 case X86ISD::PSHUFD:
43149 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
43150 return NewN;
43151
43152 break;
43153 }
43154
43155 return SDValue();
43156}
43157
43158/// Checks if the shuffle mask takes subsequent elements
43159/// alternately from two vectors.
43160/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
43161static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
43162
43163 int ParitySrc[2] = {-1, -1};
43164 unsigned Size = Mask.size();
43165 for (unsigned i = 0; i != Size; ++i) {
43166 int M = Mask[i];
43167 if (M < 0)
43168 continue;
43169
43170 // Make sure we are using the matching element from the input.
43171 if ((M % Size) != i)
43172 return false;
43173
43174 // Make sure we use the same input for all elements of the same parity.
43175 int Src = M / Size;
43176 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
43177 return false;
43178 ParitySrc[i % 2] = Src;
43179 }
43180
43181 // Make sure each input is used.
43182 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
43183 return false;
43184
43185 Op0Even = ParitySrc[0] == 0;
43186 return true;
43187}
43188
43189/// Returns true iff the shuffle node \p N can be replaced with an
43190/// ADDSUB(SUBADD) operation. If true is returned then the operands of the
43191/// ADDSUB(SUBADD) operation are written to the parameters \p Opnd0 and \p Opnd1.
43192///
43193/// We combine shuffles into ADDSUB(SUBADD) directly on the abstract vector
43194/// shuffle nodes so they are easier to match generically. We also insert dummy
43195/// vector shuffle nodes for the operands which explicitly discard the unused
43196/// lanes, so that the fact that they are unused flows through the rest of the
43197/// combiner.
43198static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
43199 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
43200 bool &IsSubAdd, bool &HasAllowContract) {
43201
43202 EVT VT = N->getValueType(0);
43203 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43204 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
43205 !VT.getSimpleVT().isFloatingPoint())
43206 return false;
43207
43208 // We only handle target-independent shuffles.
43209 // FIXME: It would be easy and harmless to use the target shuffle mask
43210 // extraction tool to support more.
43211 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43212 return false;
43213
43214 SDValue V1 = N->getOperand(0);
43215 SDValue V2 = N->getOperand(1);
43216
43217 // Make sure we have an FADD and an FSUB.
43218 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
43219 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
43220 V1.getOpcode() == V2.getOpcode())
43221 return false;
43222
43223 // If there are other uses of these operations we can't fold them.
43224 if (!V1->hasOneUse() || !V2->hasOneUse())
43225 return false;
43226
43227 // Ensure that both operations have the same operands. Note that we can
43228 // commute the FADD operands.
43229 SDValue LHS, RHS;
43230 if (V1.getOpcode() == ISD::FSUB) {
43231 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
43232 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
43233 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
43234 return false;
43235 } else {
43236 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
43237 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
43238 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
43239 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
43240 return false;
43241 }
43242
43243 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43244 bool Op0Even;
43245 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43246 return false;
43247
43248 // It's a subadd if the vector in the even parity is an FADD.
43249 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
43250 : V2->getOpcode() == ISD::FADD;
43251 HasAllowContract =
43252 V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
43253
43254 Opnd0 = LHS;
43255 Opnd1 = RHS;
43256 return true;
43257}
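// Worked example (v4f32, illustrative only): for
//   shuffle(fsub(a,b), fadd(a,b), <0,5,2,7>)
// the even result lanes take a-b and the odd lanes take a+b, i.e.
// ADDSUB(a,b); if the FADD instead feeds the even lanes, the pattern is the
// SUBADD form.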
43258
43259/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
43260static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
43261 const X86Subtarget &Subtarget,
43262 SelectionDAG &DAG) {
43263 // We only handle target-independent shuffles.
43264 // FIXME: It would be easy and harmless to use the target shuffle mask
43265 // extraction tool to support more.
43266 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
43267 return SDValue();
43268
43269 MVT VT = N->getSimpleValueType(0);
43270 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43271 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
43272 return SDValue();
43273
43274 // We're trying to match shuffle(fma(a, b, c), X86Fmsub(a, b, c)).
43275 SDValue Op0 = N->getOperand(0);
43276 SDValue Op1 = N->getOperand(1);
43277 SDValue FMAdd = Op0, FMSub = Op1;
43278 if (FMSub.getOpcode() != X86ISD::FMSUB)
43279 std::swap(FMAdd, FMSub);
43280
43281 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
43282 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
43283 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
43284 FMAdd.getOperand(2) != FMSub.getOperand(2))
43285 return SDValue();
43286
43287 // Check for correct shuffle mask.
43288 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
43289 bool Op0Even;
43290 if (!isAddSubOrSubAddMask(Mask, Op0Even))
43291 return SDValue();
43292
43293 // FMAddSub takes zeroth operand from FMSub node.
43294 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
43295 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43296 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
43297 FMAdd.getOperand(2));
43298}
43299
43300/// Try to combine a shuffle into a target-specific add-sub or
43301/// mul-add-sub node.
43302static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
43303 const X86Subtarget &Subtarget,
43304 SelectionDAG &DAG) {
43305 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
43306 return V;
43307
43308 SDValue Opnd0, Opnd1;
43309 bool IsSubAdd;
43310 bool HasAllowContract;
43311 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd,
43312 HasAllowContract))
43313 return SDValue();
43314
43315 MVT VT = N->getSimpleValueType(0);
43316
43317 // Try to generate X86ISD::FMADDSUB node here.
43318 SDValue Opnd2;
43319 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2,
43320 HasAllowContract)) {
43321 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
43322 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
43323 }
43324
43325 if (IsSubAdd)
43326 return SDValue();
43327
43328 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
43329 // the ADDSUB idiom has been successfully recognized. There are no known
43330 // X86 targets with 512-bit ADDSUB instructions!
43331 if (VT.is512BitVector())
43332 return SDValue();
43333
43334 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
43335 // the ADDSUB idiom has been successfully recognized. There are no known
43336 // X86 targets with FP16 ADDSUB instructions!
43337 if (VT.getVectorElementType() == MVT::f16)
43338 return SDValue();
43339
43340 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
43341}
43342
43343/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
43344/// low half of each source vector and does not set any high half elements in
43345/// the destination vector, narrow the shuffle to half its original size.
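///
/// Hypothetical v8f32 example (mask invented for illustration): a shuffle with
/// mask <0, 9, 1, 8, -1, -1, -1, -1> reads only the low v4f32 half of each
/// input and leaves the upper half undef, so it can be rewritten as a v4f32
/// shuffle of the two low halves with mask <0, 5, 1, 4>, concatenated with
/// undef to rebuild the v8f32 result.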
43346static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
43347 EVT VT = Shuf->getValueType(0);
43348 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
43349 return SDValue();
43350 if (!VT.is256BitVector() && !VT.is512BitVector())
43351 return SDValue();
43352
43353 // See if we can ignore all of the high elements of the shuffle.
43354 ArrayRef<int> Mask = Shuf->getMask();
43355 if (!isUndefUpperHalf(Mask))
43356 return SDValue();
43357
43358 // Check if the shuffle mask accesses only the low half of each input vector
43359 // (half-index output is 0 or 2).
43360 int HalfIdx1, HalfIdx2;
43361 SmallVector<int, 8> HalfMask(Mask.size() / 2);
43362 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
43363 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
43364 return SDValue();
43365
43366 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
43367 // The trick is knowing that all of the insert/extract are actually free
43368 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
43369 // of narrow inputs into a narrow output, and that is always cheaper than
43370 // the wide shuffle that we started with.
43371 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
43372 Shuf->getOperand(1), HalfMask, HalfIdx1,
43373 HalfIdx2, false, DAG, /*UseConcat*/ true);
43374}
43375
43376static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
43377 TargetLowering::DAGCombinerInfo &DCI,
43378 const X86Subtarget &Subtarget) {
43379 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
43380 if (SDValue V = narrowShuffle(Shuf, DAG))
43381 return V;
43382
43383 // If we have legalized the vector types, look for blends of FADD and FSUB
43384 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
43385 SDLoc dl(N);
43386 EVT VT = N->getValueType(0);
43387 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43388 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
43389 if (SDValue AddSub =
43390 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
43391 return AddSub;
43392
43393 // Attempt to combine into a vector load/broadcast.
43394 if (SDValue LD = combineToConsecutiveLoads(
43395 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
43396 return LD;
43397
43398 if (isTargetShuffle(N->getOpcode())) {
43399 SDValue Op(N, 0);
43400 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
43401 return Shuffle;
43402
43403 // Try recursively combining arbitrary sequences of x86 shuffle
43404 // instructions into higher-order shuffles. We do this after combining
43405 // specific PSHUF instruction sequences into their minimal form so that we
43406 // can evaluate how many specialized shuffle instructions are involved in
43407 // a particular chain.
43408 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
43409 return Res;
43410
43411 // Simplify source operands based on shuffle mask.
43412 // TODO - merge this into combineX86ShufflesRecursively.
43413 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
43414 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
43415 return SDValue(N, 0);
43416
43417 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
43418 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
43419 // Perform this after other shuffle combines to allow inner shuffles to be
43420 // combined away first.
43421 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
43422 return BinOp;
43423 }
43424
43425 return SDValue();
43426}
43427
43428// Simplify variable target shuffle masks based on the demanded elements.
43429// TODO: Handle DemandedBits in mask indices as well?
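// Sketch of the idea (constants invented for illustration): if a PSHUFB
// control mask is loaded from the constant pool but only the first few
// result elements are demanded, the undemanded mask elements can be
// rewritten to undef, producing a simpler constant that later combines can
// exploit.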
43430bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
43431 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
43432 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
43433 // If we're demanding all elements don't bother trying to simplify the mask.
43434 unsigned NumElts = DemandedElts.getBitWidth();
43435 if (DemandedElts.isAllOnes())
43436 return false;
43437
43438 SDValue Mask = Op.getOperand(MaskIndex);
43439 if (!Mask.hasOneUse())
43440 return false;
43441
43442 // Attempt to generically simplify the variable shuffle mask.
43443 APInt MaskUndef, MaskZero;
43444 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
43445 Depth + 1))
43446 return true;
43447
43448 // Attempt to extract+simplify a (constant pool load) shuffle mask.
43449 // TODO: Support other types from getTargetShuffleMaskIndices?
43450 SDValue BC = peekThroughOneUseBitcasts(Mask);
43451 EVT BCVT = BC.getValueType();
43452 auto *Load = dyn_cast<LoadSDNode>(BC);
43453 if (!Load || !Load->getBasePtr().hasOneUse())
43454 return false;
43455
43456 const Constant *C = getTargetConstantFromNode(Load);
43457 if (!C)
43458 return false;
43459
43460 Type *CTy = C->getType();
43461 if (!CTy->isVectorTy() ||
43462 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
43463 return false;
43464
43465 // Handle scaling for i64 elements on 32-bit targets.
43466 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
43467 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
43468 return false;
43469 unsigned Scale = NumCstElts / NumElts;
43470
43471 // Simplify mask if we have an undemanded element that is not undef.
43472 bool Simplified = false;
43473 SmallVector<Constant *, 32> ConstVecOps;
43474 for (unsigned i = 0; i != NumCstElts; ++i) {
43475 Constant *Elt = C->getAggregateElement(i);
43476 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
43477 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
43478 Simplified = true;
43479 continue;
43480 }
43481 ConstVecOps.push_back(Elt);
43482 }
43483 if (!Simplified)
43484 return false;
43485
43486 // Generate new constant pool entry + legalize immediately for the load.
43487 SDLoc DL(Op);
43488 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
43489 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
43490 SDValue NewMask = TLO.DAG.getLoad(
43491 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
43492 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
43493 Load->getAlign());
43494 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
43495}
43496
43497bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
43498 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
43499 TargetLoweringOpt &TLO, unsigned Depth) const {
43500 int NumElts = DemandedElts.getBitWidth();
43501 unsigned Opc = Op.getOpcode();
43502 EVT VT = Op.getValueType();
43503
43504 // Handle special case opcodes.
43505 switch (Opc) {
43506 case X86ISD::PMULDQ:
43507 case X86ISD::PMULUDQ: {
43508 APInt LHSUndef, LHSZero;
43509 APInt RHSUndef, RHSZero;
43510 SDValue LHS = Op.getOperand(0);
43511 SDValue RHS = Op.getOperand(1);
43512 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43513 Depth + 1))
43514 return true;
43515 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43516 Depth + 1))
43517 return true;
43518 // Multiply by zero.
43519 KnownZero = LHSZero | RHSZero;
43520 break;
43521 }
43522 case X86ISD::VPMADDUBSW:
43523 case X86ISD::VPMADDWD: {
43524 APInt LHSUndef, LHSZero;
43525 APInt RHSUndef, RHSZero;
43526 SDValue LHS = Op.getOperand(0);
43527 SDValue RHS = Op.getOperand(1);
43528 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
43529
43530 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
43531 Depth + 1))
43532 return true;
43533 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
43534 Depth + 1))
43535 return true;
43536
43537 // TODO: Multiply by zero.
43538
43539 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
43540 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
43541 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
43542 Depth + 1))
43543 return true;
43544 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
43545 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
43546 Depth + 1))
43547 return true;
43548 break;
43549 }
43550 case X86ISD::PSADBW: {
43551 SDValue LHS = Op.getOperand(0);
43552 SDValue RHS = Op.getOperand(1);
43553 assert(VT.getScalarType() == MVT::i64 &&
43554 LHS.getValueType() == RHS.getValueType() &&
43555 LHS.getValueType().getScalarType() == MVT::i8 &&
43556 "Unexpected PSADBW types");
43557
43558 // Aggressively peek through ops to get at the demanded elts.
43559 if (!DemandedElts.isAllOnes()) {
43560 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
43561 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
43562 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
43563 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43564 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
43565 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
43566 if (NewLHS || NewRHS) {
43567 NewLHS = NewLHS ? NewLHS : LHS;
43568 NewRHS = NewRHS ? NewRHS : RHS;
43569 return TLO.CombineTo(
43570 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43571 }
43572 }
43573 break;
43574 }
43575 case X86ISD::VSHL:
43576 case X86ISD::VSRL:
43577 case X86ISD::VSRA: {
43578 // We only need the bottom 64-bits of the (128-bit) shift amount.
43579 SDValue Amt = Op.getOperand(1);
43580 MVT AmtVT = Amt.getSimpleValueType();
43581 assert(AmtVT.is128BitVector() && "Unexpected value type");
43582
43583 // If every use of the shift amount is as an SSE shift amount then we know
43584 // that only the bottom 64-bits are ever used.
43585 bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) {
43586 unsigned UseOpc = Use->getOpcode();
43587 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
43588 UseOpc == X86ISD::VSRA) &&
43589 Use->getOperand(0) != Amt;
43590 });
43591
43592 APInt AmtUndef, AmtZero;
43593 unsigned NumAmtElts = AmtVT.getVectorNumElements();
43594 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
43595 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
43596 Depth + 1, AssumeSingleUse))
43597 return true;
43598 [[fallthrough]];
43599 }
43600 case X86ISD::VSHLI:
43601 case X86ISD::VSRLI:
43602 case X86ISD::VSRAI: {
43603 SDValue Src = Op.getOperand(0);
43604 APInt SrcUndef;
43605 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
43606 Depth + 1))
43607 return true;
43608
43609 // Fold shift(0,x) -> 0
43610 if (DemandedElts.isSubsetOf(KnownZero))
43611 return TLO.CombineTo(
43612 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43613
43614 // Aggressively peek through ops to get at the demanded elts.
43615 if (!DemandedElts.isAllOnes())
43616 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43617 Src, DemandedElts, TLO.DAG, Depth + 1))
43618 return TLO.CombineTo(
43619 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
43620 break;
43621 }
43622 case X86ISD::VPSHA:
43623 case X86ISD::VPSHL:
43624 case X86ISD::VSHLV:
43625 case X86ISD::VSRLV:
43626 case X86ISD::VSRAV: {
43627 APInt LHSUndef, LHSZero;
43628 APInt RHSUndef, RHSZero;
43629 SDValue LHS = Op.getOperand(0);
43630 SDValue RHS = Op.getOperand(1);
43631 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43632 Depth + 1))
43633 return true;
43634
43635 // Fold shift(0,x) -> 0
43636 if (DemandedElts.isSubsetOf(LHSZero))
43637 return TLO.CombineTo(
43638 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43639
43640 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43641 Depth + 1))
43642 return true;
43643
43644 KnownZero = LHSZero;
43645 break;
43646 }
43647 case X86ISD::CMPM:
43648 case X86ISD::CMPP: {
43649 // Scalarize packed fp comparison if we only require element 0.
43650 if (DemandedElts == 1) {
43651 SDLoc dl(Op);
43652 MVT VT = Op.getSimpleValueType();
43653 MVT OpSVT = Op.getOperand(0).getSimpleValueType().getScalarType();
43654 SDValue LHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(0), 0);
43655 SDValue RHS = TLO.DAG.getExtractVectorElt(dl, OpSVT, Op.getOperand(1), 0);
43656 SDValue CC = Op.getOperand(2);
43657 if (Opc == X86ISD::CMPM) {
43658 SDValue Cmp =
43659 TLO.DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, CC);
43660 return TLO.CombineTo(
43661 Op, TLO.DAG.getInsertSubvector(dl, TLO.DAG.getUNDEF(VT), Cmp, 0));
43662 }
43663 SDValue Cmp = TLO.DAG.getNode(X86ISD::FSETCC, dl, OpSVT, LHS, RHS, CC);
43664 return TLO.CombineTo(Op,
43665 TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Cmp));
43666 }
43667 break;
43668 }
43669 case X86ISD::PCMPEQ:
43670 case X86ISD::PCMPGT: {
43671 APInt LHSUndef, LHSZero;
43672 APInt RHSUndef, RHSZero;
43673 SDValue LHS = Op.getOperand(0);
43674 SDValue RHS = Op.getOperand(1);
43675 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
43676 Depth + 1))
43677 return true;
43678 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
43679 Depth + 1))
43680 return true;
43681 break;
43682 }
43683 case X86ISD::KSHIFTL: {
43684 SDValue Src = Op.getOperand(0);
43685 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43686 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43687 unsigned ShiftAmt = Amt->getZExtValue();
43688
43689 if (ShiftAmt == 0)
43690 return TLO.CombineTo(Op, Src);
43691
43692 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43693 // single shift. We can do this if the bottom bits (which are shifted
43694 // out) are never demanded.
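// For instance (amounts chosen for illustration): KSHIFTL(KSHIFTR(X, 3), 2)
// can be rewritten as KSHIFTR(X, 1), provided the two lowest result elements
// (which the single shift would fill differently) are not demanded.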
43695 if (Src.getOpcode() == X86ISD::KSHIFTR) {
43696 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
43697 unsigned C1 = Src.getConstantOperandVal(1);
43698 unsigned NewOpc = X86ISD::KSHIFTL;
43699 int Diff = ShiftAmt - C1;
43700 if (Diff < 0) {
43701 Diff = -Diff;
43702 NewOpc = X86ISD::KSHIFTR;
43703 }
43704
43705 SDLoc dl(Op);
43706 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43707 return TLO.CombineTo(
43708 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43709 }
43710 }
43711
43712 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
43713 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43714 Depth + 1))
43715 return true;
43716
43717 KnownUndef <<= ShiftAmt;
43718 KnownZero <<= ShiftAmt;
43719 KnownZero.setLowBits(ShiftAmt);
43720 break;
43721 }
43722 case X86ISD::KSHIFTR: {
43723 SDValue Src = Op.getOperand(0);
43724 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
43725 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
43726 unsigned ShiftAmt = Amt->getZExtValue();
43727
43728 if (ShiftAmt == 0)
43729 return TLO.CombineTo(Op, Src);
43730
43731 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
43732 // single shift. We can do this if the top bits (which are shifted
43733 // out) are never demanded.
43734 if (Src.getOpcode() == X86ISD::KSHIFTL) {
43735 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
43736 unsigned C1 = Src.getConstantOperandVal(1);
43737 unsigned NewOpc = X86ISD::KSHIFTR;
43738 int Diff = ShiftAmt - C1;
43739 if (Diff < 0) {
43740 Diff = -Diff;
43741 NewOpc = X86ISD::KSHIFTL;
43742 }
43743
43744 SDLoc dl(Op);
43745 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
43746 return TLO.CombineTo(
43747 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
43748 }
43749 }
43750
43751 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
43752 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
43753 Depth + 1))
43754 return true;
43755
43756 KnownUndef.lshrInPlace(ShiftAmt);
43757 KnownZero.lshrInPlace(ShiftAmt);
43758 KnownZero.setHighBits(ShiftAmt);
43759 break;
43760 }
43761 case X86ISD::ANDNP: {
43762 // ANDNP = (~LHS & RHS);
43763 SDValue LHS = Op.getOperand(0);
43764 SDValue RHS = Op.getOperand(1);
43765
43766 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
43767 APInt UndefElts;
43768 SmallVector<APInt> EltBits;
43769 int NumElts = VT.getVectorNumElements();
43770 int EltSizeInBits = VT.getScalarSizeInBits();
43771 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
43772 APInt OpElts = DemandedElts;
43773 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
43774 EltBits)) {
43775 OpBits.clearAllBits();
43776 OpElts.clearAllBits();
43777 for (int I = 0; I != NumElts; ++I) {
43778 if (!DemandedElts[I])
43779 continue;
43780 if (UndefElts[I]) {
43781 // We can't assume an undef src element gives an undef dst - the
43782 // other src might be zero.
43783 OpBits.setAllBits();
43784 OpElts.setBit(I);
43785 } else if ((Invert && !EltBits[I].isAllOnes()) ||
43786 (!Invert && !EltBits[I].isZero())) {
43787 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
43788 OpElts.setBit(I);
43789 }
43790 }
43791 }
43792 return std::make_pair(OpBits, OpElts);
43793 };
43794 APInt BitsLHS, EltsLHS;
43795 APInt BitsRHS, EltsRHS;
43796 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
43797 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
43798
43799 APInt LHSUndef, LHSZero;
43800 APInt RHSUndef, RHSZero;
43801 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
43802 Depth + 1))
43803 return true;
43804 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
43805 Depth + 1))
43806 return true;
43807
43808 if (!DemandedElts.isAllOnes()) {
43809 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
43810 TLO.DAG, Depth + 1);
43811 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
43812 TLO.DAG, Depth + 1);
43813 if (NewLHS || NewRHS) {
43814 NewLHS = NewLHS ? NewLHS : LHS;
43815 NewRHS = NewRHS ? NewRHS : RHS;
43816 return TLO.CombineTo(
43817 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
43818 }
43819 }
43820 break;
43821 }
43822 case X86ISD::CVTSI2P:
43823 case X86ISD::CVTUI2P:
43824 case X86ISD::CVTPH2PS:
43825 case X86ISD::CVTPS2PH: {
43826 SDValue Src = Op.getOperand(0);
43827 EVT SrcVT = Src.getValueType();
43828 APInt SrcUndef, SrcZero;
43829 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43830 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43831 Depth + 1))
43832 return true;
43833 break;
43834 }
43835 case X86ISD::PACKSS:
43836 case X86ISD::PACKUS: {
43837 SDValue N0 = Op.getOperand(0);
43838 SDValue N1 = Op.getOperand(1);
43839
43840 APInt DemandedLHS, DemandedRHS;
43841 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43842
43843 APInt LHSUndef, LHSZero;
43844 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43845 Depth + 1))
43846 return true;
43847 APInt RHSUndef, RHSZero;
43848 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43849 Depth + 1))
43850 return true;
43851
43852 // TODO - pass on known zero/undef.
43853
43854 // Aggressively peek through ops to get at the demanded elts.
43855 // TODO - we should do this for all target/faux shuffle ops.
43856 if (!DemandedElts.isAllOnes()) {
43857 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43858 TLO.DAG, Depth + 1);
43859 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43860 TLO.DAG, Depth + 1);
43861 if (NewN0 || NewN1) {
43862 NewN0 = NewN0 ? NewN0 : N0;
43863 NewN1 = NewN1 ? NewN1 : N1;
43864 return TLO.CombineTo(Op,
43865 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43866 }
43867 }
43868 break;
43869 }
43870 case X86ISD::HADD:
43871 case X86ISD::HSUB:
43872 case X86ISD::FHADD:
43873 case X86ISD::FHSUB: {
43874 SDValue N0 = Op.getOperand(0);
43875 SDValue N1 = Op.getOperand(1);
43876
43877 APInt DemandedLHS, DemandedRHS;
43878 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43879
43880 APInt LHSUndef, LHSZero;
43881 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43882 Depth + 1))
43883 return true;
43884 APInt RHSUndef, RHSZero;
43885 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43886 Depth + 1))
43887 return true;
43888
43889 // TODO - pass on known zero/undef.
43890
43891 // Aggressively peek through ops to get at the demanded elts.
43892 // TODO: Handle repeated operands.
43893 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43894 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43895 TLO.DAG, Depth + 1);
43896 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43897 TLO.DAG, Depth + 1);
43898 if (NewN0 || NewN1) {
43899 NewN0 = NewN0 ? NewN0 : N0;
43900 NewN1 = NewN1 ? NewN1 : N1;
43901 return TLO.CombineTo(Op,
43902 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43903 }
43904 }
43905 break;
43906 }
43907 case X86ISD::VTRUNC:
43908 case X86ISD::VTRUNCS:
43909 case X86ISD::VTRUNCUS: {
43910 SDValue Src = Op.getOperand(0);
43911 MVT SrcVT = Src.getSimpleValueType();
43912 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43913 APInt SrcUndef, SrcZero;
43914 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43915 Depth + 1))
43916 return true;
43917 KnownZero = SrcZero.zextOrTrunc(NumElts);
43918 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43919 break;
43920 }
43921 case X86ISD::BLENDI: {
43922 SmallVector<int, 16> BlendMask;
43923 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
43924 if (SDValue R = combineBlendOfPermutes(
43925 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
43926 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
43927 return TLO.CombineTo(Op, R);
43928 break;
43929 }
43930 case X86ISD::BLENDV: {
43931 APInt SelUndef, SelZero;
43932 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43933 SelZero, TLO, Depth + 1))
43934 return true;
43935
43936 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43937 APInt LHSUndef, LHSZero;
43938 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43939 LHSZero, TLO, Depth + 1))
43940 return true;
43941
43942 APInt RHSUndef, RHSZero;
43943 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43944 RHSZero, TLO, Depth + 1))
43945 return true;
43946
43947 KnownZero = LHSZero & RHSZero;
43948 KnownUndef = LHSUndef & RHSUndef;
43949 break;
43950 }
43951 case X86ISD::VZEXT_MOVL: {
43952 // If upper demanded elements are already zero then we have nothing to do.
43953 SDValue Src = Op.getOperand(0);
43954 APInt DemandedUpperElts = DemandedElts;
43955 DemandedUpperElts.clearLowBits(1);
43956 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43957 return TLO.CombineTo(Op, Src);
43958 break;
43959 }
43960 case X86ISD::VZEXT_LOAD: {
43961 // If the upper elements are not demanded then simplify to a
43962 // scalar_to_vector(load()).
43963 MVT SVT = VT.getSimpleVT().getVectorElementType();
43964 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
43965 SDLoc DL(Op);
43966 auto *Mem = cast<MemSDNode>(Op);
43967 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
43968 Mem->getMemOperand());
43969 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
43970 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
43971 }
43972 break;
43973 }
43974 case X86ISD::VBROADCAST: {
43975 SDValue Src = Op.getOperand(0);
43976 MVT SrcVT = Src.getSimpleValueType();
43977 // Don't bother broadcasting if we just need the 0'th element.
43978 if (DemandedElts == 1) {
43979 if (!SrcVT.isVector())
43980 Src = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op), VT, Src);
43981 else if (Src.getValueType() != VT)
43982 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43983 SDLoc(Op));
43984 return TLO.CombineTo(Op, Src);
43985 }
43986 if (!SrcVT.isVector())
43987 break;
43988 APInt SrcUndef, SrcZero;
43989 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43990 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43991 Depth + 1))
43992 return true;
43993 // Aggressively peek through src to get at the demanded elt.
43994 // TODO - we should do this for all target/faux shuffle ops.
43995 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43996 Src, SrcElts, TLO.DAG, Depth + 1))
43997 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43998 break;
43999 }
44000 case X86ISD::VPERMV:
44001 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
44002 Depth))
44003 return true;
44004 break;
44005 case X86ISD::PSHUFB:
44006 case X86ISD::VPERMV3:
44007 case X86ISD::VPERMILPV:
44008 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
44009 Depth))
44010 return true;
44011 break;
44012 case X86ISD::VPPERM:
44013 case X86ISD::VPERMIL2:
44014 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
44015 Depth))
44016 return true;
44017 break;
44018 }
44019
44020 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
44021 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
44022 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
44023 if ((VT.is256BitVector() || VT.is512BitVector()) &&
44024 DemandedElts.lshr(NumElts / 2) == 0) {
44025 unsigned SizeInBits = VT.getSizeInBits();
44026 unsigned ExtSizeInBits = SizeInBits / 2;
44027
44028 // See if 512-bit ops only use the bottom 128-bits.
44029 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
44030 ExtSizeInBits = SizeInBits / 4;
44031
44032 switch (Opc) {
44033 // Scalar broadcast.
44034 case X86ISD::VBROADCAST: {
44035 SDLoc DL(Op);
44036 SDValue Src = Op.getOperand(0);
44037 if (Src.getValueSizeInBits() > ExtSizeInBits)
44038 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
44039 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44040 ExtSizeInBits / VT.getScalarSizeInBits());
44041 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
44042 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44043 TLO.DAG, DL, ExtSizeInBits));
44044 }
44045 case X86ISD::VBROADCAST_LOAD: {
44046 SDLoc DL(Op);
44047 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44048 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44049 ExtSizeInBits / VT.getScalarSizeInBits());
44050 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
44051 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
44052 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
44053 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
44054 MemIntr->getMemOperand());
44055 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44056 Bcst.getValue(1));
44057 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
44058 TLO.DAG, DL, ExtSizeInBits));
44059 }
44060 // Subvector broadcast.
44061 case X86ISD::SUBV_BROADCAST_LOAD: {
44062 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
44063 EVT MemVT = MemIntr->getMemoryVT();
44064 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
44065 SDLoc DL(Op);
44066 SDValue Ld =
44067 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
44068 MemIntr->getBasePtr(), MemIntr->getMemOperand());
44069 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
44070 Ld.getValue(1));
44071 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
44072 TLO.DAG, DL, ExtSizeInBits));
44073 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
44074 SDLoc DL(Op);
44075 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
44076 ExtSizeInBits / VT.getScalarSizeInBits());
44077 if (SDValue BcstLd =
44078 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
44079 return TLO.CombineTo(Op,
44080 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
44081 TLO.DAG, DL, ExtSizeInBits));
44082 }
44083 break;
44084 }
44085 // Byte shifts by immediate.
44086 case X86ISD::VSHLDQ:
44087 case X86ISD::VSRLDQ:
44088 // Shift by uniform.
44089 case X86ISD::VSHL:
44090 case X86ISD::VSRL:
44091 case X86ISD::VSRA:
44092 // Shift by immediate.
44093 case X86ISD::VSHLI:
44094 case X86ISD::VSRLI:
44095 case X86ISD::VSRAI: {
44096 SDLoc DL(Op);
44097 SDValue Ext0 =
44098 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
44099 SDValue ExtOp =
44100 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
44101 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44102 SDValue Insert =
44103 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44104 return TLO.CombineTo(Op, Insert);
44105 }
44106 case X86ISD::VPERMI: {
44107 // Simplify 256-bit PERMPD/PERMQ to extract_subvector.
44108 // TODO: This should be done in shuffle combining.
44109 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
44111 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
44112 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
44113 SDLoc DL(Op);
44114 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
44115 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44116 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
44117 return TLO.CombineTo(Op, Insert);
44118 }
44119 }
44120 // Simplify 512-bit PERMPD/PERMQ to 256-bit variant on lower half.
44121 if (VT == MVT::v8f64 || VT == MVT::v8i64) {
44122 SDLoc DL(Op);
44123 SDValue Ext0 = extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, 256);
44124 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0,
44125 Op.getOperand(1));
44126 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44127 SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, 256);
44128 return TLO.CombineTo(Op, Insert);
44129 }
44130 break;
44131 }
44132 case X86ISD::VPERMV: {
44135 // We can always split v16i32/v16f32 AVX512 to v8i32/v8f32 AVX2 variants.
44136 if ((VT.is256BitVector() || Subtarget.hasVLX() || VT == MVT::v16i32 ||
44137 VT == MVT::v16f32) &&
44138 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44139 // For lane-crossing shuffles, only split in half if the used half of the
44140 // mask no longer references the upper half of the source.
44141 unsigned HalfElts = NumElts / 2;
44142 unsigned HalfSize = SizeInBits / 2;
44143 Mask.resize(HalfElts);
44144 if (all_of(Mask,
44145 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
44147 SDLoc DL(Op);
44148 SDValue Ext;
44149 SDValue M =
44150 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
44151 SDValue V =
44152 extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
44153 // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
44154 if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
44155 Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44156 else {
44158 MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44159 Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44160 TLO.DAG.getBitcast(ShufVT, V), M);
44161 Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44162 }
44163 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44164 Subtarget, TLO.DAG, DL, SizeInBits);
44165 return TLO.CombineTo(Op, Insert);
44166 }
44167 }
44168 break;
44169 }
44170 case X86ISD::VPERMV3: {
44173 if (Subtarget.hasVLX() &&
44174 getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
44175 // For lane-crossing shuffles, only split in half if the used half of the
44176 // mask no longer references the upper halves of the sources.
44177 unsigned HalfElts = NumElts / 2;
44178 unsigned HalfSize = SizeInBits / 2;
44179 Mask.resize(HalfElts);
44180 if (all_of(Mask, [&](int M) {
44181 return isUndefOrInRange(M, 0, HalfElts) ||
44182 isUndefOrInRange(M, NumElts, NumElts + HalfElts);
44183 })) {
44184 // Adjust mask elements for 2nd operand to point to half width.
44185 for (int &M : Mask)
44186 M = (M < NumElts) ? M : (M - HalfElts);
44188 MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
44189 SDLoc DL(Op);
44190 SDValue Ext = TLO.DAG.getNode(
44191 Opc, DL, HalfVT,
44192 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
44193 getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
44194 extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
44195 SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
44196 Subtarget, TLO.DAG, DL, SizeInBits);
44197 return TLO.CombineTo(Op, Insert);
44198 }
44199 }
44200 break;
44201 }
44202 case X86ISD::VPERM2X128: {
44203 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
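// Worked example (immediate chosen for illustration): when only the low
// 128 bits are demanded and the low nibble of the immediate is 0x3, the low
// half of the result is the high half of the second source (operand 1), so
// the node reduces to extracting that 128-bit subvector and inserting it
// into undef.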
44204 SDLoc DL(Op);
44205 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
44206 if (LoMask & 0x8)
44207 return TLO.CombineTo(
44208 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
44209 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
44210 unsigned SrcIdx = (LoMask & 0x2) >> 1;
44211 SDValue ExtOp =
44212 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
44213 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44214 SDValue Insert =
44215 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44216 return TLO.CombineTo(Op, Insert);
44217 }
44218 // Conversions.
44219 // TODO: Add more CVT opcodes when we have test coverage.
44220 case X86ISD::CVTTP2UI: {
44221 if (!Subtarget.hasVLX())
44222 break;
44223 [[fallthrough]];
44224 }
44225 case X86ISD::CVTTP2SI: {
44226 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
44227 !Subtarget.hasVLX())
44228 break;
44229 [[fallthrough]];
44230 }
44231 case X86ISD::CVTPH2PS: {
44232 SDLoc DL(Op);
44233 unsigned Scale = SizeInBits / ExtSizeInBits;
44234 SDValue SrcOp = Op.getOperand(0);
44235 MVT SrcVT = SrcOp.getSimpleValueType();
44236 unsigned SrcExtSize =
44237 std::max<unsigned>(SrcVT.getSizeInBits() / Scale, 128);
44239 ExtSizeInBits / VT.getScalarSizeInBits());
44240 SDValue ExtOp = TLO.DAG.getNode(
44241 Opc, DL, ExtVT, extractSubVector(SrcOp, 0, TLO.DAG, DL, SrcExtSize));
44242 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44243 SDValue Insert =
44244 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44245 return TLO.CombineTo(Op, Insert);
44246 }
44247 // Zero upper elements.
44248 case X86ISD::VZEXT_MOVL:
44249 // Variable blend.
44250 case X86ISD::BLENDV:
44251 // Target unary shuffles:
44252 case X86ISD::MOVDDUP:
44253 // Target unary shuffles by immediate:
44254 case X86ISD::PSHUFD:
44255 case X86ISD::PSHUFLW:
44256 case X86ISD::PSHUFHW:
44257 case X86ISD::VPERMILPI:
44258 // (Non-Lane Crossing) Target Shuffles.
44259 case X86ISD::VPERMILPV:
44260 case X86ISD::VPERMIL2:
44261 case X86ISD::PSHUFB:
44262 case X86ISD::UNPCKL:
44263 case X86ISD::UNPCKH:
44264 case X86ISD::BLENDI:
44265 // Integer ops.
44266 case X86ISD::PACKSS:
44267 case X86ISD::PACKUS:
44268 case X86ISD::PCMPEQ:
44269 case X86ISD::PCMPGT:
44270 case X86ISD::PMULUDQ:
44271 case X86ISD::PMULDQ:
44272 case X86ISD::VSHLV:
44273 case X86ISD::VSRLV:
44274 case X86ISD::VSRAV:
44275 // Float ops.
44276 case X86ISD::FMAX:
44277 case X86ISD::FMIN:
44278 case X86ISD::FMAXC:
44279 case X86ISD::FMINC:
44280 case X86ISD::FRSQRT:
44281 case X86ISD::FRCP:
44282 // Horizontal Ops.
44283 case X86ISD::HADD:
44284 case X86ISD::HSUB:
44285 case X86ISD::FHADD:
44286 case X86ISD::FHSUB: {
44287 SDLoc DL(Op);
44289 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
44290 SDValue SrcOp = Op.getOperand(i);
44291 EVT SrcVT = SrcOp.getValueType();
44292 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
44293 "Unsupported vector size");
44294 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
44295 ExtSizeInBits)
44296 : SrcOp);
44297 }
44298 MVT ExtVT = VT.getSimpleVT();
44299 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
44300 ExtSizeInBits / ExtVT.getScalarSizeInBits());
44301 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
44302 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
44303 SDValue Insert =
44304 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
44305 return TLO.CombineTo(Op, Insert);
44306 }
44307 }
44308 }
44309
44310 // For splats, unless we *only* demand the 0'th element, stop attempts at
44311 // simplification here; we aren't going to improve things, and this is
44312 // better than any potential shuffle.
44313 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
44314 return false;
44315
44316 // Get target/faux shuffle mask.
44317 APInt OpUndef, OpZero;
44318 SmallVector<int, 64> OpMask;
44319 SmallVector<SDValue, 2> OpInputs;
44320 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
44321 OpZero, TLO.DAG, Depth, false))
44322 return false;
44323
44324 // Shuffle inputs must be the same size as the result.
44325 if (OpMask.size() != (unsigned)NumElts ||
44326 llvm::any_of(OpInputs, [VT](SDValue V) {
44327 return VT.getSizeInBits() != V.getValueSizeInBits() ||
44328 !V.getValueType().isVector();
44329 }))
44330 return false;
44331
44332 KnownZero = OpZero;
44333 KnownUndef = OpUndef;
44334
44335 // Check if shuffle mask can be simplified to undef/zero/identity.
44336 int NumSrcs = OpInputs.size();
44337 for (int i = 0; i != NumElts; ++i)
44338 if (!DemandedElts[i])
44339 OpMask[i] = SM_SentinelUndef;
44340
44341 if (isUndefInRange(OpMask, 0, NumElts)) {
44342 KnownUndef.setAllBits();
44343 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
44344 }
44345 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
44346 KnownZero.setAllBits();
44347 return TLO.CombineTo(
44348 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
44349 }
44350 for (int Src = 0; Src != NumSrcs; ++Src)
44351 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
44352 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
44353
44354 // Attempt to simplify inputs.
44355 for (int Src = 0; Src != NumSrcs; ++Src) {
44356 // TODO: Support inputs of different types.
44357 if (OpInputs[Src].getValueType() != VT)
44358 continue;
44359
44360 int Lo = Src * NumElts;
44361 APInt SrcElts = APInt::getZero(NumElts);
44362 for (int i = 0; i != NumElts; ++i)
44363 if (DemandedElts[i]) {
44364 int M = OpMask[i] - Lo;
44365 if (0 <= M && M < NumElts)
44366 SrcElts.setBit(M);
44367 }
44368
44369 // TODO - Propagate input undef/zero elts.
44370 APInt SrcUndef, SrcZero;
44371 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
44372 TLO, Depth + 1))
44373 return true;
44374 }
44375
44376 // If we don't demand all elements, then attempt to combine to a simpler
44377 // shuffle.
44378 // We need to convert the depth to something combineX86ShufflesRecursively
44379 // can handle - so pretend its Depth == 0 again, and reduce the max depth
44380 // to match. This prevents combineX86ShuffleChain from returning a
44381 // combined shuffle that's the same as the original root, causing an
44382 // infinite loop.
44383 if (!DemandedElts.isAllOnes()) {
44384 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
44385
44386 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
44387 for (int i = 0; i != NumElts; ++i)
44388 if (DemandedElts[i])
44389 DemandedMask[i] = i;
44390
44391 SDValue NewShuffle = combineX86ShufflesRecursively(
44392 {Op}, 0, Op.getOpcode(), Op.getSimpleValueType(), DemandedMask, {}, 0,
44394 /*AllowVariableCrossLaneMask=*/true,
44395 /*AllowVariablePerLaneMask=*/true, isMaskableNode(Op, Subtarget),
44396 TLO.DAG, SDLoc(Op), Subtarget);
44397 if (NewShuffle)
44398 return TLO.CombineTo(Op, NewShuffle);
44399 }
44400
44401 return false;
44402}
44403
44404bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
44405 SDValue Op, const APInt &OriginalDemandedBits,
44406 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
44407 unsigned Depth) const {
44408 EVT VT = Op.getValueType();
44409 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
44410 unsigned Opc = Op.getOpcode();
44411 switch(Opc) {
44412 case X86ISD::VTRUNC: {
44413 KnownBits KnownOp;
44414 SDValue Src = Op.getOperand(0);
44415 MVT SrcVT = Src.getSimpleValueType();
44416
44417 // Simplify the input, using demanded bit information.
44418 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
44419 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
44420 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
44421 return true;
44422 break;
44423 }
44424 case X86ISD::PMULDQ:
44425 case X86ISD::PMULUDQ: {
44426 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
44427 KnownBits KnownLHS, KnownRHS;
44428 SDValue LHS = Op.getOperand(0);
44429 SDValue RHS = Op.getOperand(1);
44430
44431 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
44432 // FIXME: Can we bound this better?
44433 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
44434 APInt DemandedMaskLHS = APInt::getAllOnes(64);
44435 APInt DemandedMaskRHS = APInt::getAllOnes(64);
44436
44437 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
44438 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
44439 DemandedMaskLHS = DemandedMask;
44440 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
44441 DemandedMaskRHS = DemandedMask;
44442
44443 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
44444 KnownLHS, TLO, Depth + 1))
44445 return true;
44446 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
44447 KnownRHS, TLO, Depth + 1))
44448 return true;
44449
44450 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
44451 KnownRHS = KnownRHS.trunc(32);
44452 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
44453 KnownRHS.getConstant().isOne()) {
44454 SDLoc DL(Op);
44455 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
44456 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
44457 }
44458
44459 // Aggressively peek through ops to get at the demanded low bits.
44460 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
44461 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44462 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
44463 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
44464 if (DemandedLHS || DemandedRHS) {
44465 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
44466 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
44467 return TLO.CombineTo(
44468 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
44469 }
44470 break;
44471 }
44472 case X86ISD::ANDNP: {
44473 KnownBits Known2;
44474 SDValue Op0 = Op.getOperand(0);
44475 SDValue Op1 = Op.getOperand(1);
44476
44477 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
44478 Known, TLO, Depth + 1))
44479 return true;
44480
44481 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
44482 OriginalDemandedElts, Known2, TLO, Depth + 1))
44483 return true;
44484
44485 // If the RHS is a constant, see if we can simplify it.
44486 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
44487 OriginalDemandedElts, TLO))
44488 return true;
44489
44490 // ANDNP = (~Op0 & Op1);
44491 Known.One &= Known2.Zero;
44492 Known.Zero |= Known2.One;
44493 break;
44494 }
44495 case X86ISD::VSHLI: {
44496 SDValue Op0 = Op.getOperand(0);
44497 SDValue Op1 = Op.getOperand(1);
44498
44499 unsigned ShAmt = Op1->getAsZExtVal();
44500 if (ShAmt >= BitWidth)
44501 break;
44502
44503 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
44504
44505 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
44506 // single shift. We can do this if the bottom bits (which are shifted
44507 // out) are never demanded.
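// Illustrative: VSHLI(VSRLI(X, 5), 3) with no demand on the three low result
// bits can be rewritten as VSRLI(X, 2); if the two amounts were equal the
// shifts would cancel and X would be used directly.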
44508 if (Op0.getOpcode() == X86ISD::VSRLI &&
44509 OriginalDemandedBits.countr_zero() >= ShAmt) {
44510 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
44511 if (Shift2Amt < BitWidth) {
44512 int Diff = ShAmt - Shift2Amt;
44513 if (Diff == 0)
44514 return TLO.CombineTo(Op, Op0.getOperand(0));
44515
44516 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
44517 SDValue NewShift = TLO.DAG.getNode(
44518 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
44519 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
44520 return TLO.CombineTo(Op, NewShift);
44521 }
44522 }
44523
44524 // If we are only demanding sign bits then we can use the shift source directly.
44525 unsigned NumSignBits =
44526 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
44527 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44528 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
44529 return TLO.CombineTo(Op, Op0);
44530
44531 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44532 TLO, Depth + 1))
44533 return true;
44534
44535 Known <<= ShAmt;
44536
44537 // Low bits known zero.
44538 Known.Zero.setLowBits(ShAmt);
44539
44540 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44541 // Attempt to avoid multi-use ops if we don't need anything from them.
44542 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44543 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44544 SDValue NewOp =
44545 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44546 return TLO.CombineTo(Op, NewOp);
44547 }
44548 }
44549 return false;
44550 }
44551 case X86ISD::VSRLI: {
44552 SDValue Op0 = Op.getOperand(0);
44553 SDValue Op1 = Op.getOperand(1);
44554
44555 unsigned ShAmt = Op1->getAsZExtVal();
44556 if (ShAmt >= BitWidth)
44557 break;
44558
44559 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44560
44561 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44562 TLO, Depth + 1))
44563 return true;
44564
44565 Known >>= ShAmt;
44566
44567 // High bits known zero.
44568 Known.Zero.setHighBits(ShAmt);
44569
44570 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44571 // Attempt to avoid multi-use ops if we don't need anything from them.
44572 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44573 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44574 SDValue NewOp =
44575 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44576 return TLO.CombineTo(Op, NewOp);
44577 }
44578 }
44579 return false;
44580 }
44581 case X86ISD::VSRAI: {
44582 SDValue Op0 = Op.getOperand(0);
44583 SDValue Op1 = Op.getOperand(1);
44584
44585 unsigned ShAmt = Op1->getAsZExtVal();
44586 if (ShAmt >= BitWidth)
44587 break;
44588
44589 APInt DemandedMask = OriginalDemandedBits << ShAmt;
44590
44591 // If we only want bits that already match the signbit then we don't need
44592 // to shift.
44593 unsigned NumHiDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
44594 if (TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1) >=
44595 NumHiDemandedBits)
44596 return TLO.CombineTo(Op, Op0);
44597
44598 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
44599 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
44600 SDValue Op00 = Op0.getOperand(0);
44601 unsigned NumSignBits =
44602 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
44603 if (ShAmt < NumSignBits)
44604 return TLO.CombineTo(Op, Op00);
44605 }
44606
44607 // If any of the demanded bits are produced by the sign extension, we also
44608 // demand the input sign bit.
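// For example (widths chosen for illustration): VSRAI by 8 on i32 elements
// copies bit 31 into result bits 31..24, so if any of those top 8 result
// bits are demanded, bit 31 of the source must be demanded as well.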
44609 if (OriginalDemandedBits.countl_zero() < ShAmt)
44610 DemandedMask.setSignBit();
44611
44612 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
44613 TLO, Depth + 1))
44614 return true;
44615
44616 Known >>= ShAmt;
44617
44618 // If the input sign bit is known to be zero, or if none of the top bits
44619 // are demanded, turn this into an unsigned shift right.
44620 if (Known.Zero[BitWidth - ShAmt - 1] ||
44621 OriginalDemandedBits.countl_zero() >= ShAmt)
44622 return TLO.CombineTo(
44623 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
44624
44625 // High bits are known one.
44626 if (Known.One[BitWidth - ShAmt - 1])
44627 Known.One.setHighBits(ShAmt);
44628
44629 if (!OriginalDemandedBits.isSubsetOf(Known.Zero | Known.One)) {
44630 // Attempt to avoid multi-use ops if we don't need anything from them.
44631 if (SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44632 Op0, DemandedMask, OriginalDemandedElts, TLO.DAG, Depth + 1)) {
44633 SDValue NewOp =
44634 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, DemandedOp0, Op1);
44635 return TLO.CombineTo(Op, NewOp);
44636 }
44637 }
44638 return false;
44639 }
44640 case X86ISD::BLENDI: {
44641 SDValue LHS = Op.getOperand(0);
44642 SDValue RHS = Op.getOperand(1);
44643 APInt Mask = getBLENDIBlendMask(Op);
44644
44645 APInt DemandedEltsLHS = OriginalDemandedElts & ~Mask;
44646 if (SimplifyDemandedBits(LHS, OriginalDemandedBits, DemandedEltsLHS, Known,
44647 TLO, Depth + 1))
44648 return true;
44649
44650 APInt DemandedEltsRHS = OriginalDemandedElts & Mask;
44651 if (SimplifyDemandedBits(RHS, OriginalDemandedBits, DemandedEltsRHS, Known,
44652 TLO, Depth + 1))
44653 return true;
44654
44655 // Attempt to avoid multi-use ops if we don't need anything from them.
44656 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44657 LHS, OriginalDemandedBits, DemandedEltsLHS, TLO.DAG, Depth + 1);
44658 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44659 RHS, OriginalDemandedBits, DemandedEltsRHS, TLO.DAG, Depth + 1);
44660 if (NewLHS || NewRHS) {
44661 NewLHS = NewLHS ? NewLHS : LHS;
44662 NewRHS = NewRHS ? NewRHS : RHS;
44663 return TLO.CombineTo(Op,
44664 TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT,
44665 NewLHS, NewRHS, Op.getOperand(2)));
44666 }
44667 break;
44668 }
44669 case X86ISD::BLENDV: {
44670 SDValue Sel = Op.getOperand(0);
44671 SDValue LHS = Op.getOperand(1);
44672 SDValue RHS = Op.getOperand(2);
44673
44674 APInt SignMask = APInt::getSignMask(BitWidth);
44675 SDValue NewSel = SimplifyMultipleUseDemandedBits(
44676 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
44677 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
44678 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44679 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
44680 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
44681
44682 if (NewSel || NewLHS || NewRHS) {
44683 NewSel = NewSel ? NewSel : Sel;
44684 NewLHS = NewLHS ? NewLHS : LHS;
44685 NewRHS = NewRHS ? NewRHS : RHS;
44686 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
44687 NewSel, NewLHS, NewRHS));
44688 }
44689 break;
44690 }
44691 case X86ISD::PEXTRB:
44692 case X86ISD::PEXTRW: {
44693 SDValue Vec = Op.getOperand(0);
44694 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
44695 MVT VecVT = Vec.getSimpleValueType();
44696 unsigned NumVecElts = VecVT.getVectorNumElements();
44697
44698 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
44699 unsigned Idx = CIdx->getZExtValue();
44700 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
44701
44702 // If we demand no bits from the vector then we must have demanded
44703 // bits from the implicit zext - simplify to zero.
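// e.g. (sketch): PEXTRB yields the byte zero-extended to the i32 result, so
// if none of the low 8 result bits are demanded, every demanded bit comes
// from the zero extension and the whole node folds to constant 0.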
44704 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
44705 if (DemandedVecBits == 0)
44706 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44707
44708 APInt KnownUndef, KnownZero;
44709 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
44710 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
44711 KnownZero, TLO, Depth + 1))
44712 return true;
44713
44714 KnownBits KnownVec;
44715 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
44716 KnownVec, TLO, Depth + 1))
44717 return true;
44718
44719 if (SDValue V = SimplifyMultipleUseDemandedBits(
44720 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
44721 return TLO.CombineTo(
44722 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
44723
44724 Known = KnownVec.zext(BitWidth);
44725 return false;
44726 }
44727 break;
44728 }
44729 case X86ISD::PINSRB:
44730 case X86ISD::PINSRW: {
44731 SDValue Vec = Op.getOperand(0);
44732 SDValue Scl = Op.getOperand(1);
44733 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
44734 MVT VecVT = Vec.getSimpleValueType();
44735
44736 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
44737 unsigned Idx = CIdx->getZExtValue();
44738 if (!OriginalDemandedElts[Idx])
44739 return TLO.CombineTo(Op, Vec);
44740
44741 KnownBits KnownVec;
44742 APInt DemandedVecElts(OriginalDemandedElts);
44743 DemandedVecElts.clearBit(Idx);
44744 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
44745 KnownVec, TLO, Depth + 1))
44746 return true;
44747
44748 KnownBits KnownScl;
44749 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
44750 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
44751 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
44752 return true;
44753
44754 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
44755 Known = KnownVec.intersectWith(KnownScl);
44756 return false;
44757 }
44758 break;
44759 }
44760 case X86ISD::PACKSS:
44761 // PACKSS saturates to MIN/MAX integer values. So if we just want the
44762 // sign bit then we can just ask for the source operands sign bit.
44763 // TODO - add known bits handling.
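// e.g. (sketch): for a v8i16 PACKSS of two v4i32 inputs, demanding only each
// i16 sign bit is equivalent to demanding only the sign bit of the
// corresponding i32 source element, since signed saturation preserves the
// sign.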
44764 if (OriginalDemandedBits.isSignMask()) {
44765 APInt DemandedLHS, DemandedRHS;
44766 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
44767
44768 KnownBits KnownLHS, KnownRHS;
44769 APInt SignMask = APInt::getSignMask(BitWidth * 2);
44770 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
44771 KnownLHS, TLO, Depth + 1))
44772 return true;
44773 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
44774 KnownRHS, TLO, Depth + 1))
44775 return true;
44776
44777 // Attempt to avoid multi-use ops if we don't need anything from them.
44778 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
44779 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
44780 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
44781 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
44782 if (DemandedOp0 || DemandedOp1) {
44783 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
44784 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
44785 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
44786 }
44787 }
44788 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
44789 break;
44790 case X86ISD::VBROADCAST: {
44791 SDValue Src = Op.getOperand(0);
44792 MVT SrcVT = Src.getSimpleValueType();
44793 APInt DemandedElts = APInt::getOneBitSet(
44794 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
44795 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
44796 TLO, Depth + 1))
44797 return true;
44798 // If we don't need the upper bits, attempt to narrow the broadcast source.
44799 // Don't attempt this on AVX512 as it might affect broadcast folding.
44800 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
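// Sketch (types chosen for illustration): a v2i64 broadcast of an i64 scalar
// whose upper 32 bits are not demanded can instead broadcast the truncated
// i32 value as v4i32 and bitcast the result back to v2i64.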
44801 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
44802 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
44803 Src->hasOneUse()) {
44804 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
44805 SDValue NewSrc =
44806 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
44807 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
44808 SDValue NewBcst =
44809 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
44810 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
44811 }
44812 break;
44813 }
44814 case X86ISD::PCMPGT:
44815 // icmp sgt(0, R) == ashr(R, BitWidth-1).
44816 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) {
44817 // iff we only need the signbit then we can use R directly.
44818 if (OriginalDemandedBits.isSignMask())
44819 return TLO.CombineTo(Op, Op.getOperand(1));
44820 // otherwise we just need R's signbit for the comparison.
44821 APInt SignMask = APInt::getSignMask(BitWidth);
44822 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts,
44823 Known, TLO, Depth + 1))
44824 return true;
44825 }
44826 break;
44827 case X86ISD::MOVMSK: {
44828 SDValue Src = Op.getOperand(0);
44829 MVT SrcVT = Src.getSimpleValueType();
44830 unsigned SrcBits = SrcVT.getScalarSizeInBits();
44831 unsigned NumElts = SrcVT.getVectorNumElements();
44832
44833 // If we don't need the sign bits at all just return zero.
44834 if (OriginalDemandedBits.countr_zero() >= NumElts)
44835 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44836
44837 // See if we only demand bits from the lower 128-bit vector.
44838 if (SrcVT.is256BitVector() &&
44839 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
44840 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
44841 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44842 }
44843
44844 // Only demand the vector elements of the sign bits we need.
44845 APInt KnownUndef, KnownZero;
44846 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
44847 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
44848 TLO, Depth + 1))
44849 return true;
44850
44851 Known.Zero = KnownZero.zext(BitWidth);
44852 Known.Zero.setHighBits(BitWidth - NumElts);
44853
44854 // MOVMSK only uses the MSB from each vector element.
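// e.g. MOVMSKPS packs the four f32 sign bits of its source into bits [3:0]
// of the scalar result and zeroes the remaining bits.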
44855 KnownBits KnownSrc;
44856 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
44857 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
44858 Depth + 1))
44859 return true;
44860
44861 if (KnownSrc.One[SrcBits - 1])
44862 Known.One.setLowBits(NumElts);
44863 else if (KnownSrc.Zero[SrcBits - 1])
44864 Known.Zero.setLowBits(NumElts);
44865
44866 // Attempt to avoid multi-use ops if we don't need anything from it.
44867 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
44868 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
44869 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
44870 return false;
44871 }
44872 case X86ISD::TESTP: {
44873 SDValue Op0 = Op.getOperand(0);
44874 SDValue Op1 = Op.getOperand(1);
44875 MVT OpVT = Op0.getSimpleValueType();
44876 assert((OpVT.getVectorElementType() == MVT::f32 ||
44877 OpVT.getVectorElementType() == MVT::f64) &&
44878 "Illegal vector type for X86ISD::TESTP");
44879
44880 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
44881 KnownBits KnownSrc;
44882 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
44883 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
44884 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
44885 AssumeSingleUse) ||
44886 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
44887 AssumeSingleUse);
44888 }
44889 case X86ISD::CMOV: {
44890 KnownBits Known2;
44891 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
44892 OriginalDemandedElts, Known2, TLO, Depth + 1))
44893 return true;
44894 if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
44895 OriginalDemandedElts, Known, TLO, Depth + 1))
44896 return true;
44897
44898 // Only known if known in both the LHS and RHS.
44899 Known = Known.intersectWith(Known2);
44900 return false;
44901 }
44902 case X86ISD::BEXTR:
44903 case X86ISD::BEXTRI: {
44904 SDValue Op0 = Op.getOperand(0);
44905 SDValue Op1 = Op.getOperand(1);
44906
44907 // Only bottom 16-bits of the control bits are required.
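// The control word encodes bits[7:0] = start index and bits[15:8] = length,
// matching the Shift/Length extraction below.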
44908 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
44909 // NOTE: SimplifyDemandedBits won't do this for constants.
44910 uint64_t Val1 = Cst1->getZExtValue();
44911 uint64_t MaskedVal1 = Val1 & 0xFFFF;
44912 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
44913 SDLoc DL(Op);
44914 return TLO.CombineTo(
44915 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
44916 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
44917 }
44918
44919 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
44920 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
44921
44922 // If the length is 0, the result is 0.
44923 if (Length == 0) {
44924 Known.setAllZero();
44925 return false;
44926 }
44927
44928 if ((Shift + Length) <= BitWidth) {
44929 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
44930 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
44931 return true;
44932
44933 Known = Known.extractBits(Length, Shift);
44934 Known = Known.zextOrTrunc(BitWidth);
44935 return false;
44936 }
44937 } else {
44938 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
44939 KnownBits Known1;
44940 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
44941 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
44942 return true;
44943
44944 // If the length is 0, replace with 0.
44945 KnownBits LengthBits = Known1.extractBits(8, 8);
44946 if (LengthBits.isZero())
44947 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
44948 }
44949
44950 break;
44951 }
44952 case X86ISD::PDEP: {
44953 SDValue Op0 = Op.getOperand(0);
44954 SDValue Op1 = Op.getOperand(1);
44955
44956 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
44957 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
44958
44959 // If the demanded bits have leading zeroes, we don't demand those from the
44960 // mask.
44961 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
44962 return true;
44963
44964 // The number of possible 1s in the mask determines the number of LSBs of
44965 // operand 0 used. Undemanded bits from the mask don't matter so filter
44966 // them before counting.
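// e.g. a mask of 0b1010 consumes only the two lowest bits of operand 0,
// depositing them at bit positions 1 and 3 of the result.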
44967 KnownBits Known2;
44968 uint64_t Count = (~Known.Zero & LoMask).popcount();
44969 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
44970 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
44971 return true;
44972
44973 // Zeroes are retained from the mask, but not ones.
44974 Known.One.clearAllBits();
44975 // The result will have at least as many trailing zeros as the non-mask
44976 // operand since bits can only map to the same or higher bit position.
44977 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
44978 return false;
44979 }
44980 case X86ISD::VPMADD52L:
44981 case X86ISD::VPMADD52H: {
44982 KnownBits KnownOp0, KnownOp1, KnownOp2;
44983 SDValue Op0 = Op.getOperand(0);
44984 SDValue Op1 = Op.getOperand(1);
44985 SDValue Op2 = Op.getOperand(2);
44986 // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
44987 // operand 2).
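// VPMADD52L adds bits [51:0] of the 104-bit product to the accumulator,
// VPMADD52H adds bits [103:52], hence the 52-bit mul/mulhu modelling below.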
44988 APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
44989 if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
44990 TLO, Depth + 1))
44991 return true;
44992
44993 if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
44994 TLO, Depth + 1))
44995 return true;
44996
44997 if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
44998 KnownOp2, TLO, Depth + 1))
44999 return true;
45000
45001 KnownBits KnownMul;
45002 KnownOp0 = KnownOp0.trunc(52);
45003 KnownOp1 = KnownOp1.trunc(52);
45004 KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
45005 : KnownBits::mulhu(KnownOp0, KnownOp1);
45006 KnownMul = KnownMul.zext(64);
45007
45008 // lo/hi(X * Y) + Z --> C + Z
45009 if (KnownMul.isConstant()) {
45010 SDLoc DL(Op);
45011 SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
45012 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
45013 }
45014
45015 Known = KnownBits::add(KnownMul, KnownOp2);
45016 return false;
45017 }
45018 }
45019
45020 return TargetLowering::SimplifyDemandedBitsForTargetNode(
45021 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
45022}
45023
45024SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45025 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
45026 SelectionDAG &DAG, unsigned Depth) const {
45027 int NumElts = DemandedElts.getBitWidth();
45028 unsigned Opc = Op.getOpcode();
45029 EVT VT = Op.getValueType();
45030
45031 switch (Opc) {
45032 case X86ISD::PINSRB:
45033 case X86ISD::PINSRW: {
45034 // If we don't demand the inserted element, return the base vector.
45035 SDValue Vec = Op.getOperand(0);
45036 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
45037 MVT VecVT = Vec.getSimpleValueType();
45038 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
45039 !DemandedElts[CIdx->getZExtValue()])
45040 return Vec;
45041 break;
45042 }
45043 case X86ISD::VSHLI: {
45044 // If we are only demanding sign bits then we can use the shift source
45045 // directly.
45046 SDValue Op0 = Op.getOperand(0);
45047 unsigned ShAmt = Op.getConstantOperandVal(1);
45048 unsigned BitWidth = DemandedBits.getBitWidth();
45049 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
45050 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
45051 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
45052 return Op0;
45053 break;
45054 }
45055 case X86ISD::VSRAI:
45056 // iff we only need the sign bit then we can use the source directly.
45057 // TODO: generalize where we only demand extended signbits.
45058 if (DemandedBits.isSignMask())
45059 return Op.getOperand(0);
45060 break;
45061 case X86ISD::PCMPGT:
45062 // icmp sgt(0, R) == ashr(R, BitWidth-1).
45063 // iff we only need the sign bit then we can use R directly.
45064 if (DemandedBits.isSignMask() &&
45065 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
45066 return Op.getOperand(1);
45067 break;
45068 case X86ISD::BLENDV: {
45069 // BLENDV: Cond (MSB) ? LHS : RHS
45070 SDValue Cond = Op.getOperand(0);
45071 SDValue LHS = Op.getOperand(1);
45072 SDValue RHS = Op.getOperand(2);
45073
45074 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
45075 if (CondKnown.isNegative())
45076 return LHS;
45077 if (CondKnown.isNonNegative())
45078 return RHS;
45079 break;
45080 }
45081 case X86ISD::ANDNP: {
45082 // ANDNP = (~LHS & RHS);
45083 SDValue LHS = Op.getOperand(0);
45084 SDValue RHS = Op.getOperand(1);
45085
45086 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
45087 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
45088
45089 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
45090 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
45091 // this context, so return RHS.
45092 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
45093 return RHS;
45094 break;
45095 }
45096 }
45097
45098 APInt ShuffleUndef, ShuffleZero;
45099 SmallVector<int, 16> ShuffleMask;
45100 SmallVector<SDValue, 16> ShuffleOps;
45101 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
45102 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
45103 // If all the demanded elts are from one operand and are inline,
45104 // then we can use the operand directly.
45105 int NumOps = ShuffleOps.size();
45106 if (ShuffleMask.size() == (unsigned)NumElts &&
45107 llvm::all_of(ShuffleOps, [VT](SDValue V) {
45108 return VT.getSizeInBits() == V.getValueSizeInBits();
45109 })) {
45110
45111 if (DemandedElts.isSubsetOf(ShuffleUndef))
45112 return DAG.getUNDEF(VT);
45113 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
45114 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
45115
45116 // Bitmask that indicates which ops have only been accessed 'inline'.
45117 APInt IdentityOp = APInt::getAllOnes(NumOps);
45118 for (int i = 0; i != NumElts; ++i) {
45119 int M = ShuffleMask[i];
45120 if (!DemandedElts[i] || ShuffleUndef[i])
45121 continue;
45122 int OpIdx = M / NumElts;
45123 int EltIdx = M % NumElts;
45124 if (M < 0 || EltIdx != i) {
45125 IdentityOp.clearAllBits();
45126 break;
45127 }
45128 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
45129 if (IdentityOp == 0)
45130 break;
45131 }
45132 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
45133 "Multiple identity shuffles detected");
45134
45135 if (IdentityOp != 0)
45136 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
45137 }
45138 }
45139
45140 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
45141 Op, DemandedBits, DemandedElts, DAG, Depth);
45142}
45143
45144bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45145 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45146 bool PoisonOnly, unsigned Depth) const {
45147 unsigned NumElts = DemandedElts.getBitWidth();
45148
45149 switch (Op.getOpcode()) {
45151 case X86ISD::Wrapper:
45152 case X86ISD::WrapperRIP:
45153 return true;
45154 case X86ISD::PACKSS:
45155 case X86ISD::PACKUS: {
45156 APInt DemandedLHS, DemandedRHS;
45157 getPackDemandedElts(Op.getSimpleValueType(), DemandedElts, DemandedLHS,
45158 DemandedRHS);
45159 return (!DemandedLHS ||
45160 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
45161 PoisonOnly, Depth + 1)) &&
45162 (!DemandedRHS ||
45163 DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
45164 PoisonOnly, Depth + 1));
45165 }
45166 case X86ISD::INSERTPS:
45167 case X86ISD::BLENDI:
45168 case X86ISD::PSHUFB:
45169 case X86ISD::PSHUFD:
45170 case X86ISD::UNPCKL:
45171 case X86ISD::UNPCKH:
45172 case X86ISD::VPERMILPV:
45173 case X86ISD::VPERMILPI:
45174 case X86ISD::VPERMV:
45175 case X86ISD::VPERMV3: {
45176 SmallVector<int, 8> Mask;
45177 SmallVector<SDValue, 2> Ops;
45178 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
45179 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
45180 APInt::getZero(NumElts));
45181 for (auto M : enumerate(Mask)) {
45182 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
45183 continue;
45184 if (M.value() == SM_SentinelUndef)
45185 return false;
45186 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
45187 "Shuffle mask index out of range");
45188 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
45189 }
45190 for (auto Op : enumerate(Ops))
45191 if (!DemandedSrcElts[Op.index()].isZero() &&
45192 !DAG.isGuaranteedNotToBeUndefOrPoison(
45193 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
45194 return false;
45195 return true;
45196 }
45197 break;
45198 }
45199 }
45200 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
45201 Op, DemandedElts, DAG, PoisonOnly, Depth);
45202}
45203
45204bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
45205 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
45206 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
45207
45208 switch (Op.getOpcode()) {
45209 // SSE bit logic.
45210 case X86ISD::FAND:
45211 case X86ISD::FOR:
45212 case X86ISD::FXOR:
45213 case X86ISD::FANDN:
45214 case X86ISD::ANDNP:
45215 case X86ISD::VPTERNLOG:
45216 return false;
45217 // SSE vector insert/extracts use modulo indices.
45218 case X86ISD::PINSRB:
45219 case X86ISD::PINSRW:
45220 case X86ISD::PEXTRB:
45221 case X86ISD::PEXTRW:
45222 return false;
45223 // SSE vector multiplies are either inbounds or saturate.
45224 case X86ISD::VPMADDUBSW:
45225 case X86ISD::VPMADDWD:
45226 return false;
45227 // SSE vector shifts handle out of bounds shift amounts.
45228 case X86ISD::VSHLI:
45229 case X86ISD::VSRLI:
45230 case X86ISD::VSRAI:
45231 return false;
45232 // SSE blends.
45233 case X86ISD::BLENDI:
45234 case X86ISD::BLENDV:
45235 return false;
45236 // SSE packs.
45237 case X86ISD::PACKSS:
45238 case X86ISD::PACKUS:
45239 return false;
45240 // SSE target shuffles.
45241 case X86ISD::INSERTPS:
45242 case X86ISD::PSHUFB:
45243 case X86ISD::PSHUFD:
45244 case X86ISD::UNPCKL:
45245 case X86ISD::UNPCKH:
45246 case X86ISD::VPERMILPV:
45247 case X86ISD::VPERMILPI:
45248 case X86ISD::VPERMV:
45249 case X86ISD::VPERMV3:
45250 return false;
45251 // SSE comparisons handle all icmp/fcmp cases.
45252 // TODO: Add CMPM/MM with test coverage.
45253 case X86ISD::CMPP:
45254 case X86ISD::PCMPEQ:
45255 case X86ISD::PCMPGT:
45256 return false;
45257 // SSE signbit extraction.
45258 case X86ISD::MOVMSK:
45259 return false;
45260 // GFNI instructions.
45261 case X86ISD::GF2P8AFFINEINVQB:
45262 case X86ISD::GF2P8AFFINEQB:
45263 case X86ISD::GF2P8MULB:
45264 return false;
45265 case ISD::INTRINSIC_WO_CHAIN:
45266 switch (Op->getConstantOperandVal(0)) {
45267 case Intrinsic::x86_sse2_pmadd_wd:
45268 case Intrinsic::x86_avx2_pmadd_wd:
45269 case Intrinsic::x86_avx512_pmaddw_d_512:
45270 case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
45271 case Intrinsic::x86_avx2_pmadd_ub_sw:
45272 case Intrinsic::x86_avx512_pmaddubs_w_512:
45273 return false;
45274 case Intrinsic::x86_avx512_vpermi2var_d_128:
45275 case Intrinsic::x86_avx512_vpermi2var_d_256:
45276 case Intrinsic::x86_avx512_vpermi2var_d_512:
45277 case Intrinsic::x86_avx512_vpermi2var_hi_128:
45278 case Intrinsic::x86_avx512_vpermi2var_hi_256:
45279 case Intrinsic::x86_avx512_vpermi2var_hi_512:
45280 case Intrinsic::x86_avx512_vpermi2var_pd_128:
45281 case Intrinsic::x86_avx512_vpermi2var_pd_256:
45282 case Intrinsic::x86_avx512_vpermi2var_pd_512:
45283 case Intrinsic::x86_avx512_vpermi2var_ps_128:
45284 case Intrinsic::x86_avx512_vpermi2var_ps_256:
45285 case Intrinsic::x86_avx512_vpermi2var_ps_512:
45286 case Intrinsic::x86_avx512_vpermi2var_q_128:
45287 case Intrinsic::x86_avx512_vpermi2var_q_256:
45288 case Intrinsic::x86_avx512_vpermi2var_q_512:
45289 case Intrinsic::x86_avx512_vpermi2var_qi_128:
45290 case Intrinsic::x86_avx512_vpermi2var_qi_256:
45291 case Intrinsic::x86_avx512_vpermi2var_qi_512:
45292 return false;
45293 }
45294 }
45295 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
45296 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
45297}
45298
45299bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
45300 const APInt &DemandedElts,
45301 APInt &UndefElts,
45302 const SelectionDAG &DAG,
45303 unsigned Depth) const {
45304 unsigned NumElts = DemandedElts.getBitWidth();
45305 unsigned Opc = Op.getOpcode();
45306
45307 switch (Opc) {
45308 case X86ISD::VBROADCAST:
45309 case X86ISD::VBROADCAST_LOAD:
45310 UndefElts = APInt::getZero(NumElts);
45311 return true;
45312 }
45313
45314 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
45315 DAG, Depth);
45316}
45317
45318// Helper to peek through bitops/trunc/setcc to determine size of source vector.
45319// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
45320static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
45321 bool AllowTruncate, unsigned Depth) {
45322 // Limit recursion.
45323 if (Depth >= SelectionDAG::MaxRecursionDepth)
45324 return false;
45325 switch (Src.getOpcode()) {
45326 case ISD::TRUNCATE:
45327 if (!AllowTruncate)
45328 return false;
45329 [[fallthrough]];
45330 case ISD::SETCC:
45331 return Src.getOperand(0).getValueSizeInBits() == Size;
45332 case ISD::FREEZE:
45333 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45334 Depth + 1);
45335 case ISD::AND:
45336 case ISD::XOR:
45337 case ISD::OR:
45338 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate,
45339 Depth + 1) &&
45340 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45341 Depth + 1);
45342 case ISD::SELECT:
45343 case ISD::VSELECT:
45344 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
45345 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate,
45346 Depth + 1) &&
45347 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate,
45348 Depth + 1);
45349 case ISD::BUILD_VECTOR:
45350 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
45351 ISD::isBuildVectorAllOnes(Src.getNode());
45352 }
45353 return false;
45354}
45355
45356// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
45357static unsigned getAltBitOpcode(unsigned Opcode) {
45358 switch(Opcode) {
45359 // clang-format off
45360 case ISD::AND: return X86ISD::FAND;
45361 case ISD::OR: return X86ISD::FOR;
45362 case ISD::XOR: return X86ISD::FXOR;
45363 case X86ISD::ANDNP: return X86ISD::FANDN;
45364 // clang-format on
45365 }
45366 llvm_unreachable("Unknown bitwise opcode");
45367}
45368
45369// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
45370static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
45371 const SDLoc &DL) {
45372 EVT SrcVT = Src.getValueType();
45373 if (SrcVT != MVT::v4i1)
45374 return SDValue();
45375
45376 switch (Src.getOpcode()) {
45377 case ISD::SETCC:
45378 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
45379 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
45380 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
45381 SDValue Op0 = Src.getOperand(0);
45382 if (ISD::isNormalLoad(Op0.getNode()))
45383 return DAG.getBitcast(MVT::v4f32, Op0);
45384 if (Op0.getOpcode() == ISD::BITCAST &&
45385 Op0.getOperand(0).getValueType() == MVT::v4f32)
45386 return Op0.getOperand(0);
45387 }
45388 break;
45389 case ISD::AND:
45390 case ISD::XOR:
45391 case ISD::OR: {
45392 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
45393 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
45394 if (Op0 && Op1)
45395 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
45396 Op1);
45397 break;
45398 }
45399 }
45400 return SDValue();
45401}
45402
45403// Helper to push sign extension of vXi1 SETCC result through bitops.
45404static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
45405 SDValue Src, const SDLoc &DL) {
45406 switch (Src.getOpcode()) {
45407 case ISD::SETCC:
45408 case ISD::FREEZE:
45409 case ISD::TRUNCATE:
45410 case ISD::BUILD_VECTOR:
45411 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45412 case ISD::AND:
45413 case ISD::XOR:
45414 case ISD::OR:
45415 return DAG.getNode(
45416 Src.getOpcode(), DL, SExtVT,
45417 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
45418 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
45419 case ISD::SELECT:
45420 case ISD::VSELECT:
45421 return DAG.getSelect(
45422 DL, SExtVT, Src.getOperand(0),
45423 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
45424 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
45425 }
45426 llvm_unreachable("Unexpected node type for vXi1 sign extension");
45427}
45428
45429// Try to match patterns such as
45430// (i16 bitcast (v16i1 x))
45431// ->
45432// (i16 movmsk (v16i8 sext (v16i1 x)))
45433// before the illegal vector is scalarized on subtargets that don't have legal
45434// vxi1 types.
45435static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
45436 const SDLoc &DL,
45437 const X86Subtarget &Subtarget) {
45438 EVT SrcVT = Src.getValueType();
45439 if (Subtarget.useSoftFloat() || !SrcVT.isSimple() ||
45440 SrcVT.getScalarType() != MVT::i1)
45441 return SDValue();
45442
45443 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
45444 // legalization destroys the v4i32 type.
45445 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
45446 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
45447 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
45448 DAG.getBitcast(MVT::v4f32, V));
45449 return DAG.getZExtOrTrunc(V, DL, VT);
45450 }
45451 }
45452
45453 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
45454 // movmskb even with avx512. This will be better than truncating to vXi1 and
45455 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
45456 // vpcmpeqb/vpcmpgtb.
45457 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
45458 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
45459 Src.getOperand(0).getValueType() == MVT::v32i8 ||
45460 Src.getOperand(0).getValueType() == MVT::v64i8);
45461
45462 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
45463 // directly with vpmovmskb/vmovmskps/vmovmskpd.
45464 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
45465 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
45466 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
45467 EVT CmpVT = Src.getOperand(0).getValueType();
45468 EVT EltVT = CmpVT.getVectorElementType();
45469 if (CmpVT.getSizeInBits() <= 256 &&
45470 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
45471 PreferMovMsk = true;
45472 }
45473
45474 // With AVX512 vxi1 types are legal and we prefer using k-regs.
45475 // MOVMSK is supported in SSE2 or later.
45476 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
45477 return SDValue();
45478
45479 // If the upper ops of a concatenation are undef, then try to bitcast the
45480 // lower op and extend.
45481 SmallVector<SDValue, 4> SubSrcOps;
45482 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
45483 SubSrcOps.size() >= 2) {
45484 SDValue LowerOp = SubSrcOps[0];
45485 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
45486 if (LowerOp.getOpcode() == ISD::SETCC &&
45487 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
45488 EVT SubVT = VT.getIntegerVT(
45489 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
45490 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
45491 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45492 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
45493 }
45494 }
45495 }
45496
45497 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
45498 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
45499 // v8i16 and v16i16.
45500 // For these two cases, we can shuffle the upper element bytes to a
45501 // consecutive sequence at the start of the vector and treat the results as
45502 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
45503 // for v16i16 this is not the case, because the shuffle is expensive, so we
45504 // avoid sign-extending to this type entirely.
45505 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
45506 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
45507 MVT SExtVT;
45508 bool PropagateSExt = false;
45509 switch (SrcVT.getSimpleVT().SimpleTy) {
45510 default:
45511 return SDValue();
45512 case MVT::v2i1:
45513 SExtVT = MVT::v2i64;
45514 break;
45515 case MVT::v4i1:
45516 SExtVT = MVT::v4i32;
45517 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
45518 // sign-extend to a 256-bit operation to avoid truncation.
45519 if (Subtarget.hasAVX() &&
45520 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2(), 0)) {
45521 SExtVT = MVT::v4i64;
45522 PropagateSExt = true;
45523 }
45524 break;
45525 case MVT::v8i1:
45526 SExtVT = MVT::v8i16;
45527 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
45528 // sign-extend to a 256-bit operation to match the compare.
45529 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
45530 // 256-bit because the shuffle is cheaper than sign extending the result of
45531 // the compare.
45532 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true, 0) ||
45533 checkBitcastSrcVectorSize(Src, 512, true, 0))) {
45534 SExtVT = MVT::v8i32;
45535 PropagateSExt = true;
45536 }
45537 break;
45538 case MVT::v16i1:
45539 SExtVT = MVT::v16i8;
45540 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
45541 // it is not profitable to sign-extend to 256-bit because this will
45542 // require an extra cross-lane shuffle which is more expensive than
45543 // truncating the result of the compare to 128-bits.
45544 break;
45545 case MVT::v32i1:
45546 SExtVT = MVT::v32i8;
45547 break;
45548 case MVT::v64i1:
45549 // If we have AVX512F, but not AVX512BW, and the input is truncated from
45550 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
45551 if (Subtarget.hasAVX512()) {
45552 if (Subtarget.hasBWI())
45553 return SDValue();
45554 SExtVT = MVT::v64i8;
45555 break;
45556 }
45557 // Split if this is a <64 x i8> comparison result.
45558 if (checkBitcastSrcVectorSize(Src, 512, false, 0)) {
45559 SExtVT = MVT::v64i8;
45560 break;
45561 }
45562 return SDValue();
45563 };
45564
45565 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
45566 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
45567
45568 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
45569 V = getPMOVMSKB(DL, V, DAG, Subtarget);
45570 } else {
45571 if (SExtVT == MVT::v8i16) {
45572 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
45573 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
45574 }
45575 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
45576 }
45577
45578 EVT IntVT =
45579 EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
45580 V = DAG.getZExtOrTrunc(V, DL, IntVT);
45581 return DAG.getBitcast(VT, V);
45582}
45583
45584// Convert a vXi1 constant build vector to the same width scalar integer.
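// e.g. a v4i1 constant <1,0,1,1> becomes the i4 immediate 0b1101 (element 0
// maps to bit 0).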
45585static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
45586 EVT SrcVT = Op.getValueType();
45587 assert(SrcVT.getVectorElementType() == MVT::i1 &&
45588 "Expected a vXi1 vector");
45589 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
45590 "Expected a constant build vector");
45591
45592 APInt Imm(SrcVT.getVectorNumElements(), 0);
45593 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
45594 SDValue In = Op.getOperand(Idx);
45595 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
45596 Imm.setBit(Idx);
45597 }
45598 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
45599 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
45600}
45601
45602static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
45603 TargetLowering::DAGCombinerInfo &DCI,
45604 const X86Subtarget &Subtarget) {
45605 using namespace SDPatternMatch;
45606 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
45607
45608 if (!DCI.isBeforeLegalizeOps())
45609 return SDValue();
45610
45611 // Only do this if we have k-registers.
45612 if (!Subtarget.hasAVX512())
45613 return SDValue();
45614
45615 EVT DstVT = N->getValueType(0);
45616 SDValue Op = N->getOperand(0);
45617 EVT SrcVT = Op.getValueType();
45618
45619 // Make sure we have a bitcast between mask registers and a scalar type.
45620 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
45621 DstVT.isScalarInteger()) &&
45622 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
45623 SrcVT.isScalarInteger()))
45624 return SDValue();
45625
45626 SDValue LHS, RHS;
45627
45628 // Look for logic ops.
45630 return SDValue();
45631
45632 // If either operand was bitcast from DstVT, then perform logic with DstVT (at
45633 // least one of the getBitcast() will fold away).
45634 if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
45635 sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
45636 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45637 DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
45638
45639 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
45640 // Most of these have to move a constant from the scalar domain anyway.
45641 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
45642 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45643 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
45644 DAG.getBitcast(DstVT, LHS), RHS);
45645 }
45646
45647 return SDValue();
45648}
45649
45650static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
45651 const X86Subtarget &Subtarget) {
45652 SDLoc DL(BV);
45653 unsigned NumElts = BV->getNumOperands();
45654 SDValue Splat = BV->getSplatValue();
45655
45656 // Build MMX element from integer GPR or SSE float values.
45657 auto CreateMMXElement = [&](SDValue V) {
45658 if (V.isUndef())
45659 return DAG.getUNDEF(MVT::x86mmx);
45660 if (V.getValueType().isFloatingPoint()) {
45661 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
45662 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
45663 V = DAG.getBitcast(MVT::v2i64, V);
45664 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
45665 }
45666 V = DAG.getBitcast(MVT::i32, V);
45667 } else {
45668 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
45669 }
45670 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
45671 };
45672
45673 // Convert build vector ops to MMX data in the bottom elements.
45674 SmallVector<SDValue, 8> Ops;
45675
45676 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45677
45678 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
45679 if (Splat) {
45680 if (Splat.isUndef())
45681 return DAG.getUNDEF(MVT::x86mmx);
45682
45683 Splat = CreateMMXElement(Splat);
45684
45685 if (Subtarget.hasSSE1()) {
45686 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
45687 if (NumElts == 8)
45688 Splat = DAG.getNode(
45689 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45690 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
45691 TLI.getPointerTy(DAG.getDataLayout())),
45692 Splat, Splat);
45693
45694 // Use PSHUFW to repeat 16-bit elements.
45695 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
45696 return DAG.getNode(
45697 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
45698 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
45699 TLI.getPointerTy(DAG.getDataLayout())),
45700 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
45701 }
45702 Ops.append(NumElts, Splat);
45703 } else {
45704 for (unsigned i = 0; i != NumElts; ++i)
45705 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
45706 }
45707
45708 // Use tree of PUNPCKLs to build up general MMX vector.
45709 while (Ops.size() > 1) {
45710 unsigned NumOps = Ops.size();
45711 unsigned IntrinOp =
45712 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
45713 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
45714 : Intrinsic::x86_mmx_punpcklbw));
45715 SDValue Intrin = DAG.getTargetConstant(
45716 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
45717 for (unsigned i = 0; i != NumOps; i += 2)
45718 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
45719 Ops[i], Ops[i + 1]);
45720 Ops.resize(NumOps / 2);
45721 }
45722
45723 return Ops[0];
45724}
45725
45726// Recursive function that attempts to find if a bool vector node was originally
45727// a vector/float/double that got truncated/extended/bitcast to/from a scalar
45728// integer. If so, replace the scalar ops with bool vector equivalents back down
45729// the chain.
45730static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
45731 SelectionDAG &DAG,
45732 const X86Subtarget &Subtarget,
45733 unsigned Depth = 0) {
45734 if (Depth >= SelectionDAG::MaxRecursionDepth)
45735 return SDValue(); // Limit search depth.
45736
45737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45738 unsigned Opc = V.getOpcode();
45739 switch (Opc) {
45740 case ISD::BITCAST: {
45741 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
45742 SDValue Src = V.getOperand(0);
45743 EVT SrcVT = Src.getValueType();
45744 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
45745 return DAG.getBitcast(VT, Src);
45746 break;
45747 }
45748 case ISD::Constant: {
45749 auto *C = cast<ConstantSDNode>(V);
45750 if (C->isZero())
45751 return DAG.getConstant(0, DL, VT);
45752 if (C->isAllOnes())
45753 return DAG.getAllOnesConstant(DL, VT);
45754 break;
45755 }
45756 case ISD::TRUNCATE: {
45757 // If we find a suitable source, a truncated scalar becomes a subvector.
45758 SDValue Src = V.getOperand(0);
45759 EVT NewSrcVT =
45760 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
45761 if (TLI.isTypeLegal(NewSrcVT))
45762 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45763 Subtarget, Depth + 1))
45764 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
45765 DAG.getVectorIdxConstant(0, DL));
45766 break;
45767 }
45768 case ISD::ANY_EXTEND:
45769 case ISD::ZERO_EXTEND: {
45770 // If we find a suitable source, an extended scalar becomes a subvector.
45771 SDValue Src = V.getOperand(0);
45772 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
45773 Src.getScalarValueSizeInBits());
45774 if (TLI.isTypeLegal(NewSrcVT))
45775 if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
45776 Subtarget, Depth + 1))
45777 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
45778 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
45779 : DAG.getConstant(0, DL, VT),
45780 N0, DAG.getVectorIdxConstant(0, DL));
45781 break;
45782 }
45783 case ISD::OR:
45784 case ISD::XOR: {
45785 // If we find suitable sources, we can just move the op to the vector
45786 // domain.
45787 if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
45788 Subtarget, Depth + 1))
45789 if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
45790 Subtarget, Depth + 1))
45791 return DAG.getNode(Opc, DL, VT, N0, N1);
45792 break;
45793 }
45794 case ISD::SHL: {
45795 // If we find a suitable source, a SHL becomes a KSHIFTL.
45796 SDValue Src0 = V.getOperand(0);
45797 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
45798 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
45799 break;
45800
45801 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
45802 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
45803 Depth + 1))
45804 return DAG.getNode(
45805 X86ISD::KSHIFTL, DL, VT, N0,
45806 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
45807 break;
45808 }
45809 }
45810
45811 // Does the inner bitcast already exist?
45812 if (Depth > 0)
45813 if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
45814 return SDValue(Alt, 0);
45815
45816 return SDValue();
45817}
45818
45819static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
45820 TargetLowering::DAGCombinerInfo &DCI,
45821 const X86Subtarget &Subtarget) {
45822 SDValue N0 = N->getOperand(0);
45823 EVT VT = N->getValueType(0);
45824 EVT SrcVT = N0.getValueType();
45825 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45826
45827 // Try to match patterns such as
45828 // (i16 bitcast (v16i1 x))
45829 // ->
45830 // (i16 movmsk (v16i8 sext (v16i1 x)))
45831 // before the setcc result is scalarized on subtargets that don't have legal
45832 // vxi1 types.
45833 if (DCI.isBeforeLegalize()) {
45834 SDLoc dl(N);
45835 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
45836 return V;
45837
45838 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45839 // type, widen both sides to avoid a trip through memory.
45840 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
45841 Subtarget.hasAVX512()) {
45842 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
45843 N0 = DAG.getBitcast(MVT::v8i1, N0);
45844 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
45845 DAG.getVectorIdxConstant(0, dl));
45846 }
45847
45848 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
45849 // type, widen both sides to avoid a trip through memory.
45850 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
45851 Subtarget.hasAVX512()) {
45852 // Use zeros for the widening if we already have some zeroes. This can
45853 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
45854 // stream of this.
45855 // FIXME: It might make sense to detect a concat_vectors with a mix of
45856 // zeroes and undef and turn it into insert_subvector for i1 vectors as
45857 // a separate combine. What we can't do is canonicalize the operands of
45858 // such a concat or we'll get into a loop with SimplifyDemandedBits.
45859 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
45860 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
45861 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
45862 SrcVT = LastOp.getValueType();
45863 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45864 SmallVector<SDValue, 4> Ops(N0->ops());
45865 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
45866 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45867 N0 = DAG.getBitcast(MVT::i8, N0);
45868 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45869 }
45870 }
45871
45872 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
45873 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
45874 Ops[0] = N0;
45875 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
45876 N0 = DAG.getBitcast(MVT::i8, N0);
45877 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
45878 }
45879 } else if (DCI.isAfterLegalizeDAG()) {
45880 // If we're bitcasting from iX to vXi1, see if the integer originally
45881 // began as a vXi1 and whether we can remove the bitcast entirely.
45882 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
45883 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
45884 if (SDValue V =
45885 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
45886 return V;
45887 }
45888 }
45889
45890 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
45891 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
45892 // due to insert_subvector legalization on KNL. By promoting the copy to i16
45893 // we can help with known bits propagation from the vXi1 domain to the
45894 // scalar domain.
45895 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
45896 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45897 N0.getOperand(0).getValueType() == MVT::v16i1 &&
45898 isNullConstant(N0.getOperand(1)))
45899 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
45900 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
45901
45902 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
45903 // and the vbroadcast_load are both integer or both fp. In some cases this
45904 // will remove the bitcast entirely.
45905 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
45906 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
45907 auto *BCast = cast<MemIntrinsicSDNode>(N0);
45908 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
45909 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
45910 // Don't swap i8/i16 since we don't have fp types of that size.
45911 if (MemSize >= 32) {
45912 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
45913 : MVT::getIntegerVT(MemSize);
45914 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
45915 : MVT::getIntegerVT(SrcVTSize);
45916 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
45917
45918 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
45919 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
45920 SDValue ResNode =
45921 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
45922 MemVT, BCast->getMemOperand());
45923 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
45924 return DAG.getBitcast(VT, ResNode);
45925 }
45926 }
45927
45928 // Attempt to peek through f16 bitcasted extractions hidden by truncation.
45929 if (VT == MVT::f16 && SrcVT == MVT::i16) {
45930 SDValue Src = peekThroughTruncates(N0);
45931 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45932 Src.getOperand(0).getValueSizeInBits() == 128 &&
45933 isNullConstant(Src.getOperand(1))) {
45934 SDLoc DL(N);
45935 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45936 DAG.getBitcast(MVT::v8f16, Src.getOperand(0)),
45937 DAG.getVectorIdxConstant(0, DL));
45938 }
45939 }
45940
45941 // Since MMX types are special and don't usually play with other vector types,
45942 // it's better to handle them early to be sure we emit efficient code by
45943 // avoiding store-load conversions.
45944 if (VT == MVT::x86mmx) {
45945 // Detect MMX constant vectors.
45946 APInt UndefElts;
45947 SmallVector<APInt, 1> EltBits;
45948 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
45949 /*AllowWholeUndefs*/ true,
45950 /*AllowPartialUndefs*/ true)) {
45951 SDLoc DL(N0);
45952 // Handle zero-extension of i32 with MOVD.
45953 if (EltBits[0].countl_zero() >= 32)
45954 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
45955 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
45956 // Else, bitcast to a double.
45957 // TODO - investigate supporting sext 32-bit immediates on x86_64.
45958 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
45959 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
45960 }
45961
45962 // Detect bitcasts to x86mmx low word.
45963 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45964 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
45965 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
45966 bool LowUndef = true, AllUndefOrZero = true;
45967 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
45968 SDValue Op = N0.getOperand(i);
45969 LowUndef &= Op.isUndef() || (i >= e/2);
45970 AllUndefOrZero &= isNullConstantOrUndef(Op);
45971 }
45972 if (AllUndefOrZero) {
45973 SDValue N00 = N0.getOperand(0);
45974 SDLoc dl(N00);
45975 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
45976 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
45977 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
45978 }
45979 }
45980
45981 // Detect bitcasts of 64-bit build vectors and convert to a
45982 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
45983 // lowest element.
45984 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
45985 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
45986 SrcVT == MVT::v8i8))
45987 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
45988
45989 // Detect bitcasts between element or subvector extraction to x86mmx.
45990 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
45991 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
45992 isNullConstant(N0.getOperand(1))) {
45993 SDValue N00 = N0.getOperand(0);
45994 if (N00.getValueType().is128BitVector())
45995 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
45996 DAG.getBitcast(MVT::v2i64, N00));
45997 }
45998
45999 // Detect bitcasts from FP_TO_SINT to x86mmx.
46000 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
46001 SDLoc DL(N0);
46002 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
46003 DAG.getUNDEF(MVT::v2i32));
46004 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
46005 DAG.getBitcast(MVT::v2i64, Res));
46006 }
46007 }
46008
46009 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
46010 // most of these to scalar anyway.
46011 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
46012 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
46013 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
46014 return combinevXi1ConstantToInteger(N0, DAG);
46015 }
46016
46017 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
46018 VT.getVectorElementType() == MVT::i1) {
46019 if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
46020 if (C->isAllOnes())
46021 return DAG.getConstant(1, SDLoc(N0), VT);
46022 if (C->isZero())
46023 return DAG.getConstant(0, SDLoc(N0), VT);
46024 }
46025 }
46026
46027 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
46028 // Turn it into a sign bit compare that produces a k-register. This avoids
46029 // a trip through a GPR.
46030 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
46031 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
46032 isPowerOf2_32(VT.getVectorNumElements())) {
46033 unsigned NumElts = VT.getVectorNumElements();
46034 SDValue Src = N0;
46035
46036 // Peek through truncate.
46037 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
46038 Src = N0.getOperand(0);
46039
46040 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
46041 SDValue MovmskIn = Src.getOperand(0);
46042 MVT MovmskVT = MovmskIn.getSimpleValueType();
46043 unsigned MovMskElts = MovmskVT.getVectorNumElements();
46044
46045 // We allow extra bits of the movmsk to be used since they are known zero.
46046 // We can't convert a VPMOVMSKB without avx512bw.
46047 if (MovMskElts <= NumElts &&
46048 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
46049 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
46050 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
46051 SDLoc dl(N);
46052 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
46053 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
46054 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
46055 if (EVT(CmpVT) == VT)
46056 return Cmp;
46057
46058 // Pad with zeroes up to original VT to replace the zeroes that were
46059 // being used from the MOVMSK.
46060 unsigned NumConcats = NumElts / MovMskElts;
46061 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
46062 Ops[0] = Cmp;
46063 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
46064 }
46065 }
46066 }
46067
46068 // Try to remove bitcasts from input and output of mask arithmetic to
46069 // remove GPR<->K-register crossings.
46070 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
46071 return V;
46072
46073 // bitcast(v1Ty insert_vector_elt(X, Y, 0)) --> Y
46074 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT && SrcVT.getScalarType() == VT &&
46075 SrcVT.getVectorNumElements() == 1)
46076 return N0.getOperand(1);
46077
46078 // Convert a bitcasted integer logic operation that has one bitcasted
46079 // floating-point operand into a floating-point logic operation. This may
46080 // create a load of a constant, but that is cheaper than materializing the
46081 // constant in an integer register and transferring it to an SSE register or
46082 // transferring the SSE operand to integer register and back.
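// e.g. with X : f32, (f32 bitcast (and (i32 bitcast X), C)) becomes
// (FAND X, (f32 bitcast C)).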
46083 unsigned FPOpcode;
46084 switch (N0.getOpcode()) {
46085 // clang-format off
46086 case ISD::AND: FPOpcode = X86ISD::FAND; break;
46087 case ISD::OR: FPOpcode = X86ISD::FOR; break;
46088 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
46089 default: return SDValue();
46090 // clang-format on
46091 }
46092
46093 // Check if we have a bitcast from another integer type as well.
46094 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
46095 (Subtarget.hasSSE2() && VT == MVT::f64) ||
46096 (Subtarget.hasFP16() && VT == MVT::f16) ||
46097 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
46098 TLI.isTypeLegal(VT))))
46099 return SDValue();
46100
46101 SDValue LogicOp0 = N0.getOperand(0);
46102 SDValue LogicOp1 = N0.getOperand(1);
46103 SDLoc DL0(N0);
46104
46105 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
46106 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
46107 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
46108 LogicOp0.getOperand(0).getValueType() == VT &&
46109 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
46110 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
46111 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46112 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
46113 }
46114 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
46115 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
46116 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
46117 LogicOp1.getOperand(0).getValueType() == VT &&
46118 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
46119 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
46120 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
46121 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
46122 }
46123
46124 return SDValue();
46125}
46126
46127// (mul (zext a), (sext b))
46128static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
46129 SDValue &Op1) {
46130 Op0 = Mul.getOperand(0);
46131 Op1 = Mul.getOperand(1);
46132
46133 // Canonicalize so that operand 1 is the sign-extended value.
46134 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
46135 std::swap(Op0, Op1);
46136
46137 auto IsFreeTruncation = [](SDValue &Op) -> bool {
46138 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
46139 Op.getOpcode() == ISD::SIGN_EXTEND) &&
46140 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
46141 return true;
46142
46143 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
46144 return (BV && BV->isConstant());
46145 };
46146
46147 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
46148 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
46149 // signed value, so we just check the sign bits.
46150 if ((IsFreeTruncation(Op0) &&
46151 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
46152 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
46153 return true;
46154
46155 return false;
46156}
46157
46158static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
46159 unsigned &LogBias, const SDLoc &DL,
46160 const X86Subtarget &Subtarget) {
46161 // Extend or truncate to MVT::i8 first.
46162 MVT Vi8VT =
46163 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
46164 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
46165 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
46166
46167 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
46168 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
46169 // The src A, B element type is i8, but the dst C element type is i32.
46170 // When we calculate the reduction stages we use the src vector type vXi8,
46171 // so we need a LogBias of 2 to avoid 2 extra stages.
46172 LogBias = 2;
46173
46174 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
46175 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
46176 RegSize = std::max(512u, RegSize);
46177
46178 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
46179 // fill in the missing vector elements with 0.
46180 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
46181 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
46182 Ops[0] = LHS;
46183 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46184 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46185 Ops[0] = RHS;
46186 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46187
46188 // Actually build the DotProduct, split as 256/512 bits for
46189 // AVXVNNI/AVX512VNNI.
46190 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46191 ArrayRef<SDValue> Ops) {
46192 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
46193 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
46194 };
46195 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
46196 SDValue Zero = DAG.getConstant(0, DL, DpVT);
46197
46198 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
46199 DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
46200}
46201
46202// Create a PSADBW given two sources representable as zexts of vXi8.
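// PSADBW sums the absolute differences of the corresponding byte pairs into
// each 64-bit lane, reducing eight i8 results to a single i64 per lane.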
46203static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &N0,
46204 const SDValue &N1, const SDLoc &DL, const X86Subtarget &Subtarget) {
46205 // Find the appropriate width for the PSADBW.
46206 EVT DstVT = N0.getValueType();
46207 EVT SrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8,
46208 DstVT.getVectorElementCount());
46209 unsigned RegSize = std::max(128u, (unsigned)SrcVT.getSizeInBits());
46210
46211 // Widen the vXi8 vectors, padding with zero vector elements.
46212 unsigned NumConcat = RegSize / SrcVT.getSizeInBits();
46213 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, SrcVT));
46214 Ops[0] = DAG.getZExtOrTrunc(N0, DL, SrcVT);
46215 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
46216 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46217 Ops[0] = DAG.getZExtOrTrunc(N1, DL, SrcVT);
46218 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
46219
46220 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
46221 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
46222 ArrayRef<SDValue> Ops) {
46223 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
46224 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
46225 };
46226 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
46227 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
46228 PSADBWBuilder);
46229}
46230
46231// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
46232// PHMINPOSUW.
46233static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
46234 const X86Subtarget &Subtarget) {
46235 // Bail without SSE41.
46236 if (!Subtarget.hasSSE41())
46237 return SDValue();
46238
46239 EVT ExtractVT = Extract->getValueType(0);
46240 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
46241 return SDValue();
46242
46243 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
46244 ISD::NodeType BinOp;
46245 SDValue Src = DAG.matchBinOpReduction(
46246 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
46247 if (!Src)
46248 return SDValue();
46249
46250 EVT SrcVT = Src.getValueType();
46251 EVT SrcSVT = SrcVT.getScalarType();
46252 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
46253 return SDValue();
46254
46255 SDLoc DL(Extract);
46256 SDValue MinPos = Src;
46257
46258 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
46259 while (SrcVT.getSizeInBits() > 128) {
46260 SDValue Lo, Hi;
46261 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
46262 SrcVT = Lo.getValueType();
46263 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
46264 }
46265 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
46266 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
46267 "Unexpected value type");
46268
46269 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
46270 // to flip the value accordingly.
46271 SDValue Mask;
46272 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
46273 if (BinOp == ISD::SMAX)
46274 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
46275 else if (BinOp == ISD::SMIN)
46276 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
46277 else if (BinOp == ISD::UMAX)
46278 Mask = DAG.getAllOnesConstant(DL, SrcVT);
46279
46280 if (Mask)
46281 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46282
46283 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
46284 // shuffling each upper element down and inserting zeros. This means that the
46285 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
46286 // ready for the PHMINPOS.
46287 if (ExtractVT == MVT::i8) {
46288 SDValue Upper = DAG.getVectorShuffle(
46289 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
46290 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
46291 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
46292 }
46293
46294 // Perform the PHMINPOS on a v8i16 vector,
46295 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
46296 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
46297 MinPos = DAG.getBitcast(SrcVT, MinPos);
46298
46299 if (Mask)
46300 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
46301
46302 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
46303 DAG.getVectorIdxConstant(0, DL));
46304}
46305
46306// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
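// MOVMSK packs the per-element sign bits into a scalar mask, so all_of becomes
// a compare of the mask against all-ones, any_of a compare against zero, and
// parity a parity test of the mask bits.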
46307static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
46308 const X86Subtarget &Subtarget) {
46309 // Bail without SSE2.
46310 if (!Subtarget.hasSSE2())
46311 return SDValue();
46312
46313 EVT ExtractVT = Extract->getValueType(0);
46314 unsigned BitWidth = ExtractVT.getSizeInBits();
46315 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
46316 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
46317 return SDValue();
46318
46319 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
46320 ISD::NodeType BinOp;
46321 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
46322 if (!Match && ExtractVT == MVT::i1)
46323 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
46324 if (!Match)
46325 return SDValue();
46326
46327 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
46328 // which we can't support here for now.
46329 if (Match.getScalarValueSizeInBits() != BitWidth)
46330 return SDValue();
46331
46332 SDValue Movmsk;
46333 SDLoc DL(Extract);
46334 EVT MatchVT = Match.getValueType();
46335 unsigned NumElts = MatchVT.getVectorNumElements();
46336 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
46337 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46338 LLVMContext &Ctx = *DAG.getContext();
46339
46340 if (ExtractVT == MVT::i1) {
46341 // Special case for (pre-legalization) vXi1 reductions.
46342 if (NumElts > 64 || !isPowerOf2_32(NumElts))
46343 return SDValue();
46344 if (Match.getOpcode() == ISD::SETCC) {
46345 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
46346 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
46347 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
46348 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
46349 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
46350 X86::CondCode X86CC;
46351 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
46352 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
46353 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
46354 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
46355 DAG, X86CC))
46356 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
46357 getSETCC(X86CC, V, DL, DAG));
46358 }
46359 }
46360 if (TLI.isTypeLegal(MatchVT)) {
46361 // If this is a legal AVX512 predicate type then we can just bitcast.
46362 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46363 Movmsk = DAG.getBitcast(MovmskVT, Match);
46364 } else {
46365 // Use combineBitcastvxi1 to create the MOVMSK.
46366 while (NumElts > MaxElts) {
46367 SDValue Lo, Hi;
46368 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46369 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46370 NumElts /= 2;
46371 }
46372 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
46373 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
46374 }
46375 if (!Movmsk)
46376 return SDValue();
46377 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
46378 } else {
46379 // FIXME: Better handling of k-registers or 512-bit vectors?
46380 unsigned MatchSizeInBits = Match.getValueSizeInBits();
46381 if (!(MatchSizeInBits == 128 ||
46382 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
46383 return SDValue();
46384
46385 // Make sure this isn't a vector of 1 element. The perf win from using
46386 // MOVMSK diminishes with fewer elements in the reduction, but it is
46387 // generally better to get the comparison over to the GPRs as soon as
46388 // possible to reduce the number of vector ops.
46389 if (Match.getValueType().getVectorNumElements() < 2)
46390 return SDValue();
46391
46392 // Check that we are extracting a reduction of all sign bits.
46393 if (DAG.ComputeNumSignBits(Match) != BitWidth)
46394 return SDValue();
46395
46396 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
46397 SDValue Lo, Hi;
46398 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
46399 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
46400 MatchSizeInBits = Match.getValueSizeInBits();
46401 }
46402
46403 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
46404 MVT MaskSrcVT;
46405 if (64 == BitWidth || 32 == BitWidth)
46406 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
46407 MatchSizeInBits / BitWidth);
46408 else
46409 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
46410
46411 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
46412 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
46413 NumElts = MaskSrcVT.getVectorNumElements();
46414 }
46415 assert((NumElts <= 32 || NumElts == 64) &&
46416 "Not expecting more than 64 elements");
46417
46418 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
46419 if (BinOp == ISD::XOR) {
46420 // parity -> (PARITY(MOVMSK X))
46421 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
46422 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
46423 }
46424
46425 SDValue CmpC;
46426 ISD::CondCode CondCode;
46427 if (BinOp == ISD::OR) {
46428 // any_of -> MOVMSK != 0
46429 CmpC = DAG.getConstant(0, DL, CmpVT);
46430 CondCode = ISD::CondCode::SETNE;
46431 } else {
46432 // all_of -> MOVMSK == ((1 << NumElts) - 1)
46433 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
46434 DL, CmpVT);
46435 CondCode = ISD::CondCode::SETEQ;
46436 }
46437
46438 // The setcc produces an i8 of 0/1, so extend that to the result width and
46439 // negate to get the final 0/-1 mask value.
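// E.g. for an all_of reduction of v4i32 sign-masks: MOVMSKPS == 0xf yields a
// setcc of 0/1, which is zero-extended and then negated into the expected
// 0 / -1 (all-ones) scalar result.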
46440 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
46441 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
46442 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
46443 return DAG.getNegative(Zext, DL, ExtractVT);
46444}
46445
46446 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
46447 const X86Subtarget &Subtarget) {
46448 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
46449 return SDValue();
46450
46451 EVT ExtractVT = Extract->getValueType(0);
46452 // Verify the type we're extracting is i32, as the output element type of
46453 // vpdpbusd is i32.
46454 if (ExtractVT != MVT::i32)
46455 return SDValue();
46456
46457 EVT VT = Extract->getOperand(0).getValueType();
46458 if (!isPowerOf2_32(VT.getVectorNumElements()))
46459 return SDValue();
46460
46461 // Match shuffle + add pyramid.
46462 ISD::NodeType BinOp;
46463 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46464
46465 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
46466 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
46467 // before adding into the accumulator.
46468 // TODO:
46469 // We also need to verify that the multiply has at least 2x the number of bits
46470 // of the input. We shouldn't match
46471 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
46472 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
46473 // Root = Root.getOperand(0);
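// Per 32-bit lane, VPDPBUSD computes
//   acc += zext(a[4i+0])*sext(b[4i+0]) + ... + zext(a[4i+3])*sext(b[4i+3]),
// i.e. four unsigned-by-signed byte products summed into a dword accumulator.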
46474
46475 // If there was a match, we want Root to be a mul.
46476 if (!Root || Root.getOpcode() != ISD::MUL)
46477 return SDValue();
46478
46479 // Check whether we have an extend and mul pattern
46480 SDValue LHS, RHS;
46481 if (!detectExtMul(DAG, Root, LHS, RHS))
46482 return SDValue();
46483
46484 // Create the dot product instruction.
46485 SDLoc DL(Extract);
46486 unsigned StageBias;
46487 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
46488
46489 // If the original vector was wider than 4 elements, sum over the results
46490 // in the DP vector.
46491 unsigned Stages = Log2_32(VT.getVectorNumElements());
46492 EVT DpVT = DP.getValueType();
46493
46494 if (Stages > StageBias) {
46495 unsigned DpElems = DpVT.getVectorNumElements();
46496
46497 for (unsigned i = Stages - StageBias; i > 0; --i) {
46498 SmallVector<int, 16> Mask(DpElems, -1);
46499 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46500 Mask[j] = MaskEnd + j;
46501
46502 SDValue Shuffle =
46503 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
46504 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
46505 }
46506 }
46507
46508 // Return the lowest ExtractSizeInBits bits.
46509 EVT ResVT =
46510 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46511 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
46512 DP = DAG.getBitcast(ResVT, DP);
46513 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
46514 Extract->getOperand(1));
46515}
46516
46517 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
46518 const X86Subtarget &Subtarget) {
46519 using namespace SDPatternMatch;
46520
46521 // PSADBW is only supported on SSE2 and up.
46522 if (!Subtarget.hasSSE2())
46523 return SDValue();
46524
46525 EVT ExtractVT = Extract->getValueType(0);
46526 if (ExtractVT != MVT::i8 && ExtractVT != MVT::i16 && ExtractVT != MVT::i32 &&
46527 ExtractVT != MVT::i64)
46528 return SDValue();
46529
46530 EVT VT = Extract->getOperand(0).getValueType();
46531 if (!isPowerOf2_32(VT.getVectorNumElements()))
46532 return SDValue();
46533
46534 // Match shuffle + add pyramid.
46535 ISD::NodeType BinOp;
46536 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
46537 if (!Root)
46538 return SDValue();
46539
46540 // The operand is expected to be zero extended from i8.
46541 // In order to convert to i64 and above, an additional any/zero/sign
46542 // extend is expected.
46543 // The zero extend from 32 bits has no mathematical effect on the result.
46544 // The sign extend is also effectively a zero extend
46545 // (it extends the sign bit, which is zero).
46546 // So it is correct to skip the sign/zero extend instruction.
46547 if (Root.getOpcode() == ISD::SIGN_EXTEND ||
46548 Root.getOpcode() == ISD::ZERO_EXTEND ||
46549 Root.getOpcode() == ISD::ANY_EXTEND)
46550 Root = Root.getOperand(0);
46551
46552 // Check whether we have a vXi8 abdu pattern.
46553 // TODO: Just match ISD::ABDU once the DAG is topologically sorted.
46554 SDValue Src0, Src1;
46555 if (!sd_match(
46556 Root,
46557 m_AnyOf(
46558 m_SpecificVectorElementVT(
46559 MVT::i8, m_c_BinOp(ISD::ABDU, m_Value(Src0), m_Value(Src1))),
46560 m_SpecificVectorElementVT(
46561 MVT::i8, m_Sub(m_UMax(m_Value(Src0), m_Value(Src1)),
46562 m_UMin(m_Deferred(Src0), m_Deferred(Src1)))),
46563 m_Abs(
46564 m_Sub(m_AllOf(m_Value(Src0),
46565 m_ZExt(m_SpecificVectorElementVT(MVT::i8))),
46566 m_AllOf(m_Value(Src1),
46567 m_ZExt(m_SpecificVectorElementVT(MVT::i8))))))))
46568 return SDValue();
46569
46570 // Create the SAD instruction.
46571 SDLoc DL(Extract);
46572 SDValue SAD = createPSADBW(DAG, Src0, Src1, DL, Subtarget);
46573
46574 // If the original vector was wider than 8 elements, sum over the results
46575 // in the SAD vector.
46576 unsigned Stages = Log2_32(VT.getVectorNumElements());
46577 EVT SadVT = SAD.getValueType();
46578 if (Stages > 3) {
46579 unsigned SadElems = SadVT.getVectorNumElements();
46580
46581 for (unsigned i = Stages - 3; i > 0; --i) {
46582 SmallVector<int, 16> Mask(SadElems, -1);
46583 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
46584 Mask[j] = MaskEnd + j;
46585
46586 SDValue Shuffle =
46587 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
46588 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
46589 }
46590 }
46591
46592 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
46593 // Return the lowest ExtractSizeInBits bits.
46594 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
46595 SadVT.getSizeInBits() / ExtractSizeInBits);
46596 SAD = DAG.getBitcast(ResVT, SAD);
46597 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
46598 Extract->getOperand(1));
46599}
46600
46601// If this extract is from a loaded vector value and will be used as an
46602// integer, that requires a potentially expensive XMM -> GPR transfer.
46603// Additionally, if we can convert to a scalar integer load, that will likely
46604// be folded into a subsequent integer op.
46605// Note: SrcVec might not have a VecVT type, but it must be the same size.
46606// Note: Unlike the related fold for this in DAGCombiner, this is not limited
46607// to a single-use of the loaded vector. For the reasons above, we
46608// expect this to be profitable even if it creates an extra load.
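// E.g. (i32 (extract_elt (load <4 x i32> %p), 2)) becomes a scalar i32 load
// from %p + 8, reusing the original load's chain, alignment and AA info.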
46609 static SDValue
46610 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
46611 const SDLoc &dl, SelectionDAG &DAG,
46612 TargetLowering::DAGCombinerInfo &DCI) {
46613 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
46614 "Only EXTRACT_VECTOR_ELT supported so far");
46615
46616 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46617 EVT VT = N->getValueType(0);
46618
46619 bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) {
46620 return Use->getOpcode() == ISD::STORE ||
46621 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
46622 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
46623 });
46624
46625 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
46626 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
46627 VecVT.getVectorElementType() == VT &&
46628 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
46629 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
46630 SDValue NewPtr = TLI.getVectorElementPointer(
46631 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
46632 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
46633 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
46634 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
46635 SDValue Load =
46636 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
46637 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
46638 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
46639 return Load;
46640 }
46641
46642 return SDValue();
46643}
46644
46645// Attempt to peek through a target shuffle and extract the scalar from the
46646 // source.
46647 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
46648 TargetLowering::DAGCombinerInfo &DCI,
46649 const X86Subtarget &Subtarget) {
46650 if (DCI.isBeforeLegalizeOps())
46651 return SDValue();
46652
46653 SDLoc dl(N);
46654 SDValue Src = N->getOperand(0);
46655 SDValue Idx = N->getOperand(1);
46656
46657 EVT VT = N->getValueType(0);
46658 EVT SrcVT = Src.getValueType();
46659 EVT SrcSVT = SrcVT.getVectorElementType();
46660 unsigned SrcEltBits = SrcSVT.getSizeInBits();
46661 unsigned NumSrcElts = SrcVT.getVectorNumElements();
46662
46663 // Don't attempt this for boolean mask vectors or unknown extraction indices.
46664 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
46665 return SDValue();
46666
46667 const APInt &IdxC = N->getConstantOperandAPInt(1);
46668 if (IdxC.uge(NumSrcElts))
46669 return SDValue();
46670
46671 SDValue SrcBC = peekThroughBitcasts(Src);
46672
46673 // Handle extract(bitcast(broadcast(scalar_value))).
46674 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
46675 SDValue SrcOp = SrcBC.getOperand(0);
46676 EVT SrcOpVT = SrcOp.getValueType();
46677 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
46678 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
46679 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
46680 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
46681 // TODO support non-zero offsets.
46682 if (Offset == 0) {
46683 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
46684 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
46685 return SrcOp;
46686 }
46687 }
46688 }
46689
46690 // If we're extracting a single element from a broadcast load and there are
46691 // no other users, just create a single load.
46692 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && IdxC == 0 &&
46693 SrcBC.hasOneUse()) {
46694 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
46695 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
46696 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
46697 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
46698 SDValue Load =
46699 DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(),
46700 MemIntr->getPointerInfo(), MemIntr->getBaseAlign(),
46701 MemIntr->getMemOperand()->getFlags());
46702 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
46703 return Load;
46704 }
46705 }
46706
46707 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
46708 // TODO: Move to DAGCombine?
46709 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
46710 SrcBC.getValueType().isInteger() &&
46711 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
46712 SrcBC.getScalarValueSizeInBits() ==
46713 SrcBC.getOperand(0).getValueSizeInBits()) {
46714 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
46715 if (IdxC.ult(Scale)) {
46716 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
46717 SDValue Scl = SrcBC.getOperand(0);
46718 EVT SclVT = Scl.getValueType();
46719 if (Offset) {
46720 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
46721 DAG.getShiftAmountConstant(Offset, SclVT, dl));
46722 }
46723 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
46724 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
46725 return Scl;
46726 }
46727 }
46728
46729 // Handle extract(truncate(x)) for 0'th index.
46730 // TODO: Treat this as a faux shuffle?
46731 // TODO: When can we use this for general indices?
46732 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
46733 (SrcVT.getSizeInBits() % 128) == 0) {
46734 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
46735 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
46736 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
46737 Idx);
46738 }
46739
46740 // We can only legally extract other elements from 128-bit vectors and in
46741 // certain circumstances, depending on SSE-level.
46742 // TODO: Investigate float/double extraction if it will be just stored.
46743 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
46744 unsigned Idx) {
46745 EVT VecSVT = VecVT.getScalarType();
46746 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
46747 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
46748 VecSVT == MVT::i64)) {
46749 unsigned EltSizeInBits = VecSVT.getSizeInBits();
46750 unsigned NumEltsPerLane = 128 / EltSizeInBits;
46751 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
46752 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
46753 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
46754 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
46755 Idx &= (NumEltsPerLane - 1);
46756 }
46757 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
46758 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
46759 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
46760 DAG.getBitcast(VecVT, Vec),
46761 DAG.getVectorIdxConstant(Idx, dl));
46762 }
46763 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
46764 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
46765 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
46766 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
46767 DAG.getTargetConstant(Idx, dl, MVT::i8));
46768 }
46769 return SDValue();
46770 };
46771
46772 // Resolve the target shuffle inputs and mask.
46773 SmallVector<int, 16> Mask;
46774 SmallVector<SDValue, 2> Ops;
46775 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
46776 return SDValue();
46777
46778 // Shuffle inputs must be the same size as the result.
46779 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
46780 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
46781 }))
46782 return SDValue();
46783
46784 // Attempt to narrow/widen the shuffle mask to the correct size.
46785 if (Mask.size() != NumSrcElts) {
46786 if ((NumSrcElts % Mask.size()) == 0) {
46787 SmallVector<int, 16> ScaledMask;
46788 int Scale = NumSrcElts / Mask.size();
46789 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
46790 Mask = std::move(ScaledMask);
46791 } else if ((Mask.size() % NumSrcElts) == 0) {
46792 // Simplify Mask based on demanded element.
46793 int ExtractIdx = (int)IdxC.getZExtValue();
46794 int Scale = Mask.size() / NumSrcElts;
46795 int Lo = Scale * ExtractIdx;
46796 int Hi = Scale * (ExtractIdx + 1);
46797 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
46798 if (i < Lo || Hi <= i)
46799 Mask[i] = SM_SentinelUndef;
46800
46801 SmallVector<int, 16> WidenedMask;
46802 while (Mask.size() > NumSrcElts &&
46803 canWidenShuffleElements(Mask, WidenedMask))
46804 Mask = std::move(WidenedMask);
46805 }
46806 }
46807
46808 // If narrowing/widening failed, see if we can extract+zero-extend.
46809 int ExtractIdx;
46810 EVT ExtractVT;
46811 if (Mask.size() == NumSrcElts) {
46812 ExtractIdx = Mask[IdxC.getZExtValue()];
46813 ExtractVT = SrcVT;
46814 } else {
46815 unsigned Scale = Mask.size() / NumSrcElts;
46816 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
46817 return SDValue();
46818 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
46819 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
46820 return SDValue();
46821 ExtractIdx = Mask[ScaledIdx];
46822 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
46823 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
46824 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
46825 "Failed to widen vector type");
46826 }
46827
46828 // If the shuffle source element is undef/zero then we can just accept it.
46829 if (ExtractIdx == SM_SentinelUndef)
46830 return DAG.getUNDEF(VT);
46831
46832 if (ExtractIdx == SM_SentinelZero)
46833 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
46834 : DAG.getConstant(0, dl, VT);
46835
46836 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
46837 ExtractIdx = ExtractIdx % Mask.size();
46838 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
46839 return DAG.getZExtOrTrunc(V, dl, VT);
46840
46841 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
46842 if (SDValue V = combineExtractFromVectorLoad(
46843 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
46844 return V;
46845
46846 return SDValue();
46847}
46848
46849/// Extracting a scalar FP value from vector element 0 is free, so extract each
46850 /// operand first, then perform the math as a scalar op.
46851 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
46852 const X86Subtarget &Subtarget,
46853 TargetLowering::DAGCombinerInfo &DCI) {
46854 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
46855 SDValue Vec = ExtElt->getOperand(0);
46856 SDValue Index = ExtElt->getOperand(1);
46857 EVT VT = ExtElt->getValueType(0);
46858 EVT VecVT = Vec.getValueType();
46859
46860 // TODO: If this is a unary/expensive/expand op, allow extraction from a
46861 // non-zero element because the shuffle+scalar op will be cheaper?
46862 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
46863 return SDValue();
46864
46865 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
46866 // extract, the condition code), so deal with those as a special-case.
46867 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
46868 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
46869 if (OpVT != MVT::f32 && OpVT != MVT::f64)
46870 return SDValue();
46871
46872 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
46873 SDLoc DL(ExtElt);
46874 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46875 Vec.getOperand(0), Index);
46876 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
46877 Vec.getOperand(1), Index);
46878 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
46879 }
46880
46881 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
46882 VT != MVT::f64)
46883 return SDValue();
46884
46885 // Vector FP selects don't fit the pattern of FP math ops (because the
46886 // condition has a different type and we have to change the opcode), so deal
46887 // with those here.
46888 // FIXME: This is restricted to pre type legalization. If we loosen this we
46889 // need to convert vector bool to a scalar bool.
46890 if (DCI.isBeforeLegalize() && Vec.getOpcode() == ISD::VSELECT &&
46891 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
46892 Vec.getOperand(0).getOperand(0).getValueType() == VecVT &&
46893 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1) {
46894 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
46895 SDLoc DL(ExtElt);
46896 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
46897 Vec.getOperand(0).getValueType().getScalarType(),
46898 Vec.getOperand(0), Index);
46899 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46900 Vec.getOperand(1), Index);
46901 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
46902 Vec.getOperand(2), Index);
46903 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
46904 }
46905
46906 // TODO: This switch could include FNEG and the x86-specific FP logic ops
46907 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
46908 // missed load folding and fma+fneg combining.
46909 switch (Vec.getOpcode()) {
46910 case ISD::FMA: // Begin 3 operands
46911 case ISD::FMAD:
46912 case ISD::FADD: // Begin 2 operands
46913 case ISD::FSUB:
46914 case ISD::FMUL:
46915 case ISD::FDIV:
46916 case ISD::FREM:
46917 case ISD::FCOPYSIGN:
46918 case ISD::FMINNUM:
46919 case ISD::FMAXNUM:
46920 case ISD::FMINNUM_IEEE:
46921 case ISD::FMAXNUM_IEEE:
46922 case ISD::FMAXIMUM:
46923 case ISD::FMINIMUM:
46924 case ISD::FMAXIMUMNUM:
46925 case ISD::FMINIMUMNUM:
46926 case X86ISD::FMAX:
46927 case X86ISD::FMIN:
46928 case ISD::FABS: // Begin 1 operand
46929 case ISD::FSQRT:
46930 case ISD::FRINT:
46931 case ISD::FCEIL:
46932 case ISD::FTRUNC:
46933 case ISD::FNEARBYINT:
46934 case ISD::FROUNDEVEN:
46935 case ISD::FROUND:
46936 case ISD::FFLOOR:
46937 case X86ISD::FRCP:
46938 case X86ISD::FRSQRT: {
46939 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
46940 SDLoc DL(ExtElt);
46941 SmallVector<SDValue, 4> ExtOps;
46942 for (SDValue Op : Vec->ops())
46943 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
46944 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
46945 }
46946 default:
46947 return SDValue();
46948 }
46949 llvm_unreachable("All opcodes should return within switch");
46950}
46951
46952/// Try to convert a vector reduction sequence composed of binops and shuffles
46953 /// into horizontal ops.
46954 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
46955 const X86Subtarget &Subtarget) {
46956 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
46957
46958 // We need at least SSE2 to do anything here.
46959 if (!Subtarget.hasSSE2())
46960 return SDValue();
46961
46962 ISD::NodeType Opc;
46963 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
46964 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
46965 if (!Rdx)
46966 return SDValue();
46967
46968 SDValue Index = ExtElt->getOperand(1);
46969 assert(isNullConstant(Index) &&
46970 "Reduction doesn't end in an extract from index 0");
46971
46972 EVT VT = ExtElt->getValueType(0);
46973 EVT VecVT = Rdx.getValueType();
46974 if (VecVT.getScalarType() != VT)
46975 return SDValue();
46976
46977 SDLoc DL(ExtElt);
46978 unsigned NumElts = VecVT.getVectorNumElements();
46979 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
46980
46981 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
46982 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
46983 if (V.getValueType() == MVT::v4i8) {
46984 if (ZeroExtend && Subtarget.hasSSE41()) {
46985 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
46986 DAG.getConstant(0, DL, MVT::v4i32),
46987 DAG.getBitcast(MVT::i32, V),
46988 DAG.getVectorIdxConstant(0, DL));
46989 return DAG.getBitcast(MVT::v16i8, V);
46990 }
46991 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
46992 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
46993 : DAG.getUNDEF(MVT::v4i8));
46994 }
46995 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
46996 DAG.getUNDEF(MVT::v8i8));
46997 };
46998
46999 // vXi8 mul reduction - promote to vXi16 mul reduction.
47000 if (Opc == ISD::MUL) {
47001 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
47002 return SDValue();
47003 if (VecVT.getSizeInBits() >= 128) {
47004 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
47005 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47006 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
47007 Lo = DAG.getBitcast(WideVT, Lo);
47008 Hi = DAG.getBitcast(WideVT, Hi);
47009 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
47010 while (Rdx.getValueSizeInBits() > 128) {
47011 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47012 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
47013 }
47014 } else {
47015 Rdx = WidenToV16I8(Rdx, false);
47016 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
47017 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
47018 }
47019 if (NumElts >= 8)
47020 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47021 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47022 {4, 5, 6, 7, -1, -1, -1, -1}));
47023 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47024 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47025 {2, 3, -1, -1, -1, -1, -1, -1}));
47026 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
47027 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
47028 {1, -1, -1, -1, -1, -1, -1, -1}));
47029 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47030 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47031 }
47032
47033 // vXi8 add reduction - sub-128-bit vector.
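// PSADBW against an all-zeros vector sums the 8 unsigned bytes of each 64-bit
// lane, so a single PSADBW performs the whole horizontal byte add here.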
47034 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
47035 Rdx = WidenToV16I8(Rdx, true);
47036 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47037 DAG.getConstant(0, DL, MVT::v16i8));
47038 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47039 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47040 }
47041
47042 // Must be a >=128-bit vector with pow2 elements.
47043 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
47044 return SDValue();
47045
47046 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
47047 if (VT == MVT::i8) {
47048 while (Rdx.getValueSizeInBits() > 128) {
47049 SDValue Lo, Hi;
47050 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47051 VecVT = Lo.getValueType();
47052 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47053 }
47054 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
47055
47056 SDValue Hi = DAG.getVectorShuffle(
47057 MVT::v16i8, DL, Rdx, Rdx,
47058 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
47059 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
47060 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
47061 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
47062 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
47063 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47064 }
47065
47066 // See if we can use vXi8 PSADBW add reduction for larger zext types.
47067 // If the source vector values are 0-255, then we can use PSADBW to
47068 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
47069 // TODO: See if it's worth avoiding vXi16/i32 truncations?
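// E.g. a v16i16 add reduction whose elements are known to be <= 255 is
// truncated to v16i8, summed with PSADBW against zero into v2i64 partial
// sums, and the two 64-bit halves are added before the final extract.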
47070 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
47071 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
47072 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
47073 Subtarget.hasAVX512())) {
47074 if (Rdx.getValueType() == MVT::v8i16) {
47075 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
47076 DAG.getUNDEF(MVT::v8i16));
47077 } else {
47078 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
47079 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
47080 if (ByteVT.getSizeInBits() < 128)
47081 Rdx = WidenToV16I8(Rdx, true);
47082 }
47083
47084 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
47085 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47086 ArrayRef<SDValue> Ops) {
47087 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
47088 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
47089 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
47090 };
47091 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
47092 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
47093
47094 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
47095 while (Rdx.getValueSizeInBits() > 128) {
47096 SDValue Lo, Hi;
47097 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
47098 VecVT = Lo.getValueType();
47099 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
47100 }
47101 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
47102
47103 if (NumElts > 8) {
47104 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
47105 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
47106 }
47107
47108 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
47109 Rdx = DAG.getBitcast(VecVT, Rdx);
47110 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47111 }
47112
47113 // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
47114 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
47115 return SDValue();
47116
47117 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
47118
47119 // 256-bit horizontal instructions operate on 128-bit chunks rather than
47120 // across the whole vector, so we need an extract + hop preliminary stage.
47121 // This is the only step where the operands of the hop are not the same value.
47122 // TODO: We could extend this to handle 512-bit or even longer vectors.
47123 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
47124 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
47125 unsigned NumElts = VecVT.getVectorNumElements();
47126 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
47127 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
47128 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
47129 VecVT = Rdx.getValueType();
47130 }
47131 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
47132 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
47133 return SDValue();
47134
47135 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
47136 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
47137 for (unsigned i = 0; i != ReductionSteps; ++i)
47138 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
47139
47140 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
47141}
47142
47143/// Detect vector gather/scatter index generation and convert it from being a
47144/// bunch of shuffles and extracts into a somewhat faster sequence.
47145/// For i686, the best sequence is apparently storing the value and loading
47146 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
47147 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
47148 TargetLowering::DAGCombinerInfo &DCI,
47149 const X86Subtarget &Subtarget) {
47150 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
47151 return NewOp;
47152
47153 SDValue InputVector = N->getOperand(0);
47154 SDValue EltIdx = N->getOperand(1);
47155 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
47156
47157 EVT SrcVT = InputVector.getValueType();
47158 EVT VT = N->getValueType(0);
47159 SDLoc dl(InputVector);
47160 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
47161 unsigned NumSrcElts = SrcVT.getVectorNumElements();
47162 unsigned NumEltBits = VT.getScalarSizeInBits();
47163 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47164
47165 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
47166 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47167
47168 // Integer Constant Folding.
47169 if (CIdx && VT.isInteger()) {
47170 APInt UndefVecElts;
47171 SmallVector<APInt, 16> EltBits;
47172 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
47173 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
47174 EltBits, /*AllowWholeUndefs*/ true,
47175 /*AllowPartialUndefs*/ false)) {
47176 uint64_t Idx = CIdx->getZExtValue();
47177 if (UndefVecElts[Idx])
47178 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
47179 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
47180 }
47181
47182 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
47183 // Improves lowering of bool masks on Rust, which splits them into a byte array.
47184 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
47185 SDValue Src = peekThroughBitcasts(InputVector);
47186 if (Src.getValueType().getScalarType() == MVT::i1 &&
47187 TLI.isTypeLegal(Src.getValueType())) {
47188 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
47189 SDValue Sub = DAG.getNode(
47190 ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
47191 DAG.getVectorIdxConstant(CIdx->getZExtValue() * NumEltBits, dl));
47192 return DAG.getBitcast(VT, Sub);
47193 }
47194 }
47195 }
47196
47197 if (IsPextr) {
47198 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
47199 DCI))
47200 return SDValue(N, 0);
47201
47202 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
47203 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
47204 InputVector.getOpcode() == X86ISD::PINSRW) &&
47205 InputVector.getOperand(2) == EltIdx) {
47206 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
47207 "Vector type mismatch");
47208 SDValue Scl = InputVector.getOperand(1);
47209 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
47210 return DAG.getZExtOrTrunc(Scl, dl, VT);
47211 }
47212
47213 // TODO - Remove this once we can handle the implicit zero-extension of
47214 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
47215 // combineBasicSADPattern.
47216 return SDValue();
47217 }
47218
47219 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
47220 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
47221 InputVector.getOpcode() == ISD::BITCAST &&
47222 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47223 isNullConstant(EltIdx) && InputVector.hasOneUse())
47224 return DAG.getBitcast(VT, InputVector);
47225
47226 // Detect mmx to i32 conversion through a v2i32 elt extract.
47227 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
47228 InputVector.getOpcode() == ISD::BITCAST &&
47229 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
47230 isNullConstant(EltIdx) && InputVector.hasOneUse())
47231 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
47232 InputVector.getOperand(0));
47233
47234 // Check whether this extract is the root of a sum of absolute differences
47235 // pattern. This has to be done here because we really want it to happen
47236 // pre-legalization.
47237 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
47238 return SAD;
47239
47240 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
47241 return VPDPBUSD;
47242
47243 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
47244 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
47245 return Cmp;
47246
47247 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
47248 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
47249 return MinMax;
47250
47251 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
47252 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
47253 return V;
47254
47255 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget, DCI))
47256 return V;
47257
47258 if (CIdx)
47259 if (SDValue V = combineExtractFromVectorLoad(
47260 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
47261 dl, DAG, DCI))
47262 return V;
47263
47264 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
47265 // and then testing the relevant element.
47266 //
47267 // Note that we only combine extracts on the *same* result number, i.e.
47268 // t0 = merge_values a0, a1, a2, a3
47269 // i1 = extract_vector_elt t0, Constant:i64<2>
47270 // i1 = extract_vector_elt t0, Constant:i64<3>
47271 // but not
47272 // i1 = extract_vector_elt t0:1, Constant:i64<2>
47273 // since the latter would need its own MOVMSK.
47274 if (SrcVT.getScalarType() == MVT::i1) {
47275 bool IsVar = !CIdx;
47276 SmallVector<SDNode *, 16> BoolExtracts;
47277 unsigned ResNo = InputVector.getResNo();
47278 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
47279 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
47280 Use->getOperand(0).getResNo() == ResNo &&
47281 Use->getValueType(0) == MVT::i1) {
47282 BoolExtracts.push_back(Use);
47283 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
47284 return true;
47285 }
47286 return false;
47287 };
47288 // TODO: Can we drop the oneuse check for constant extracts?
47289 if (all_of(InputVector->users(), IsBoolExtract) &&
47290 (IsVar || BoolExtracts.size() > 1)) {
47291 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
47292 if (SDValue BC =
47293 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
47294 for (SDNode *Use : BoolExtracts) {
47295 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
47296 // Mask = 1 << MaskIdx
47297 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
47298 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
47299 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
47300 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
47301 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
47302 DCI.CombineTo(Use, Res);
47303 }
47304 return SDValue(N, 0);
47305 }
47306 }
47307 }
47308
47309 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
47310 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
47311 SDValue TruncSrc = InputVector.getOperand(0);
47312 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
47313 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
47314 SDValue NewExt =
47315 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
47316 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
47317 }
47318 }
47319
47320 return SDValue();
47321}
47322
47323// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
47324// This is more or less the reverse of combineBitcastvxi1.
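// E.g. (v16i8 sext (v16i1 bitcast (i16 X))) becomes: broadcast X, AND each
// lane with the single-bit mask selecting that lane's bit, SETEQ against the
// same mask, then sign-extend (or shift right for the zero-extend variants).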
47325 static SDValue combineToExtendBoolVectorInReg(
47326 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
47327 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
47328 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
47329 Opcode != ISD::ANY_EXTEND)
47330 return SDValue();
47331 if (!DCI.isBeforeLegalizeOps())
47332 return SDValue();
47333 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
47334 return SDValue();
47335
47336 EVT SVT = VT.getScalarType();
47337 EVT InSVT = N0.getValueType().getScalarType();
47338 unsigned EltSizeInBits = SVT.getSizeInBits();
47339
47340 // Input type must be extending a bool vector (bit-casted from a scalar
47341 // integer) to legal integer types.
47342 if (!VT.isVector())
47343 return SDValue();
47344 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
47345 return SDValue();
47346 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
47347 return SDValue();
47348
47349 SDValue N00 = N0.getOperand(0);
47350 EVT SclVT = N00.getValueType();
47351 if (!SclVT.isScalarInteger())
47352 return SDValue();
47353
47354 SDValue Vec;
47355 SmallVector<int> ShuffleMask;
47356 unsigned NumElts = VT.getVectorNumElements();
47357 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
47358
47359 // Broadcast the scalar integer to the vector elements.
47360 if (NumElts > EltSizeInBits) {
47361 // If the scalar integer is greater than the vector element size, then we
47362 // must split it down into sub-sections for broadcasting. For example:
47363 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
47364 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
47365 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
47366 unsigned Scale = NumElts / EltSizeInBits;
47367 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
47368 bool UseBroadcast = Subtarget.hasInt256() &&
47369 (!BroadcastVT.is128BitVector() || isa<LoadSDNode>(N00));
47370 Vec = UseBroadcast
47371 ? DAG.getSplat(BroadcastVT, DL, N00)
47372 : DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
47373 Vec = DAG.getBitcast(VT, Vec);
47374
47375 for (unsigned i = 0; i != Scale; ++i) {
47376 int Offset = UseBroadcast ? (i * EltSizeInBits) : 0;
47377 ShuffleMask.append(EltSizeInBits, i + Offset);
47378 }
47379 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
47380 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
47381 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
47382 // If we have register broadcast instructions, use the scalar size as the
47383 // element type for the shuffle. Then cast to the wider element type. The
47384 // widened bits won't be used, and this might allow the use of a broadcast
47385 // load.
47386 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
47387 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT,
47388 (NumElts * EltSizeInBits) / NumElts);
47389 Vec = DAG.getBitcast(VT, DAG.getSplat(BroadcastVT, DL, N00));
47390 } else {
47391 // For smaller scalar integers, we can simply any-extend it to the vector
47392 // element size (we don't care about the upper bits) and broadcast it to all
47393 // elements.
47394 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
47395 }
47396
47397 // Now, mask the relevant bit in each element.
47398 SmallVector<SDValue, 32> Bits;
47399 for (unsigned i = 0; i != NumElts; ++i) {
47400 int BitIdx = (i % EltSizeInBits);
47401 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
47402 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
47403 }
47404 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
47405 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
47406
47407 // Compare against the bitmask and extend the result.
47408 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
47409 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
47410 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
47411
47412 // For SEXT, this is now done, otherwise shift the result down for
47413 // zero-extension.
47414 if (Opcode == ISD::SIGN_EXTEND)
47415 return Vec;
47416 return DAG.getNode(ISD::SRL, DL, VT, Vec,
47417 DAG.getConstant(EltSizeInBits - 1, DL, VT));
47418}
47419
47420/// If both arms of a vector select are concatenated vectors, split the select,
47421/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
47422/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
47423 /// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
47424 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47425 const X86Subtarget &Subtarget) {
47426 unsigned Opcode = N->getOpcode();
47427 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
47428 return SDValue();
47429
47430 // TODO: Split 512-bit vectors too?
47431 EVT VT = N->getValueType(0);
47432 if (!VT.is256BitVector())
47433 return SDValue();
47434
47435 // TODO: Split as long as any 2 of the 3 operands are concatenated?
47436 SDValue Cond = N->getOperand(0);
47437 SDValue TVal = N->getOperand(1);
47438 SDValue FVal = N->getOperand(2);
47439 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
47440 !isFreeToSplitVector(TVal, DAG) || !isFreeToSplitVector(FVal, DAG))
47441 return SDValue();
47442
47443 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
47444 ArrayRef<SDValue> Ops) {
47445 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
47446 };
47447 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Cond, TVal, FVal}, makeBlend,
47448 /*CheckBWI*/ false);
47449}
47450
47451 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG,
47452 const SDLoc &DL) {
47453 SDValue Cond = N->getOperand(0);
47454 SDValue LHS = N->getOperand(1);
47455 SDValue RHS = N->getOperand(2);
47456
47457 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
47458 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
47459 if (!TrueC || !FalseC)
47460 return SDValue();
47461
47462 // Don't do this for crazy integer types.
47463 EVT VT = N->getValueType(0);
47464 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
47465 return SDValue();
47466
47467 // We're going to use the condition bit in math or logic ops. We could allow
47468 // this with a wider condition value (post-legalization it becomes an i8),
47469 // but if nothing is creating selects that late, it doesn't matter.
47470 if (Cond.getValueType() != MVT::i1)
47471 return SDValue();
47472
47473 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
47474 // 3, 5, or 9 with i32/i64, so those get transformed too.
47475 // TODO: For constants that overflow or do not differ by power-of-2 or small
47476 // multiplier, convert to 'and' + 'add'.
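// E.g. (select Cond, 7, 3) -> (zext(Cond) * 4) + 3, where the multiply later
// lowers to a shift, and (select Cond, 5, 2) -> (zext(Cond) * 3) + 2 via LEA.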
47477 const APInt &TrueVal = TrueC->getAPIntValue();
47478 const APInt &FalseVal = FalseC->getAPIntValue();
47479
47480 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
47481 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
47482 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
47483 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47484 if (CC == ISD::SETEQ || CC == ISD::SETNE)
47485 return SDValue();
47486 }
47487
47488 bool OV;
47489 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
47490 if (OV)
47491 return SDValue();
47492
47493 APInt AbsDiff = Diff.abs();
47494 if (AbsDiff.isPowerOf2() ||
47495 ((VT == MVT::i32 || VT == MVT::i64) &&
47496 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
47497
47498 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
47499 // of the condition can usually be folded into a compare predicate, but even
47500 // without that, the sequence should be cheaper than a CMOV alternative.
47501 if (TrueVal.slt(FalseVal)) {
47502 Cond = DAG.getNOT(DL, Cond, MVT::i1);
47503 std::swap(TrueC, FalseC);
47504 }
47505
47506 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
47507 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
47508
47509 // Multiply condition by the difference if non-one.
47510 if (!AbsDiff.isOne())
47511 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
47512
47513 // Add the base if non-zero.
47514 if (!FalseC->isZero())
47515 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
47516
47517 return R;
47518 }
47519
47520 return SDValue();
47521}
47522
47523/// If this is a *dynamic* select (non-constant condition) and we can match
47524/// this node with one of the variable blend instructions, restructure the
47525/// condition so that blends can use the high (sign) bit of each element.
47526/// This function will also call SimplifyDemandedBits on already created
47527/// BLENDV to perform additional simplifications.
47528 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
47529 const SDLoc &DL,
47530 TargetLowering::DAGCombinerInfo &DCI,
47531 const X86Subtarget &Subtarget) {
47532 SDValue Cond = N->getOperand(0);
47533 if ((N->getOpcode() != ISD::VSELECT &&
47534 N->getOpcode() != X86ISD::BLENDV) ||
47535 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
47536 return SDValue();
47537
47538 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47539 unsigned BitWidth = Cond.getScalarValueSizeInBits();
47540 EVT VT = N->getValueType(0);
47541
47542 // We can only handle the cases where VSELECT is directly legal on the
47543 // subtarget. We custom lower VSELECT nodes with constant conditions and
47544 // this makes it hard to see whether a dynamic VSELECT will correctly
47545 // lower, so we both check the operation's status and explicitly handle the
47546 // cases where a *dynamic* blend will fail even though a constant-condition
47547 // blend could be custom lowered.
47548 // FIXME: We should find a better way to handle this class of problems.
47549 // Potentially, we should combine constant-condition vselect nodes
47550 // pre-legalization into shuffles and not mark as many types as custom
47551 // lowered.
47552 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
47553 return SDValue();
47554 // FIXME: We don't support i16-element blends currently. We could and
47555 // should support them by making *all* the bits in the condition be set
47556 // rather than just the high bit and using an i8-element blend.
47557 if (VT.getVectorElementType() == MVT::i16)
47558 return SDValue();
47559 // Dynamic blending was only available from SSE4.1 onward.
47560 if (VT.is128BitVector() && !Subtarget.hasSSE41())
47561 return SDValue();
47562 // Byte blends are only available in AVX2
47563 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
47564 return SDValue();
47565 // There are no 512-bit blend instructions that use sign bits.
47566 if (VT.is512BitVector())
47567 return SDValue();
47568
47569 // Don't optimize before the condition has been transformed to a legal type
47570 // and don't ever optimize vector selects that map to AVX512 mask-registers.
47571 if (BitWidth < 8 || BitWidth > 64)
47572 return SDValue();
47573
47574 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
47575 for (SDUse &Use : Cond->uses())
47576 if ((Use.getUser()->getOpcode() != ISD::VSELECT &&
47577 Use.getUser()->getOpcode() != X86ISD::BLENDV) ||
47578 Use.getOperandNo() != 0)
47579 return false;
47580
47581 return true;
47582 };
47583
47584 APInt DemandedBits(APInt::getSignMask(BitWidth));
47585
47586 if (OnlyUsedAsSelectCond(Cond)) {
47587 KnownBits Known;
47588 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
47589 !DCI.isBeforeLegalizeOps());
47590 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
47591 return SDValue();
47592
47593 // If we changed the computation somewhere in the DAG, this change will
47594 // affect all users of Cond. Update all the nodes so that we do not use
47595 // the generic VSELECT anymore. Otherwise, we may perform wrong
47596 // optimizations as we messed with the actual expectation for the vector
47597 // boolean values.
47598 for (SDNode *U : Cond->users()) {
47599 if (U->getOpcode() == X86ISD::BLENDV)
47600 continue;
47601
47602 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
47603 Cond, U->getOperand(1), U->getOperand(2));
47604 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
47605 DCI.AddToWorklist(U);
47606 }
47607 DCI.CommitTargetLoweringOpt(TLO);
47608 return SDValue(N, 0);
47609 }
47610
47611 // Otherwise we can still at least try to simplify multiple use bits.
47612 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
47613 return DAG.getNode(X86ISD::BLENDV, DL, N->getValueType(0), V,
47614 N->getOperand(1), N->getOperand(2));
47615
47616 return SDValue();
47617}
47618
47619// Try to match:
47620// (or (and (M, (sub 0, X)), (pandn M, X)))
47621// which is a special case of:
47622// (select M, (sub 0, X), X)
47623// Per:
47624// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
47625// We know that, if fNegate is 0 or 1:
47626// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
47627//
47628// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
47629// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
47630// ( M ? -X : X) == ((X ^ M ) + (M & 1))
47631// This lets us transform our vselect to:
47632// (add (xor X, M), (and M, 1))
47633// And further to:
47634// (sub (xor X, M), M)
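// Checking against the two possible mask values:
//   M == -1 (all-ones): (X ^ -1) - (-1) == ~X + 1 == -X
//   M ==  0:            (X ^  0) -  0   ==  X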
47635 static SDValue combineLogicBlendIntoConditionalNegate(
47636 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
47637 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
47638 using namespace SDPatternMatch;
47639 EVT MaskVT = Mask.getValueType();
47640 assert(MaskVT.isInteger() &&
47641 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
47642 "Mask must be zero/all-bits");
47643
47644 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
47646 return SDValue();
47647
47648 SDValue V;
47649 if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
47650 !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
47651 return SDValue();
47652
47653 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
47654 SDValue SubOp2 = Mask;
47655
47656 // If the negate was on the false side of the select, then
47657 // the operands of the SUB need to be swapped. PR 27251.
47658 // This is because the pattern being matched above is
47659 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
47660 // but if the pattern matched was
47661 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
47662 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
47663 // pattern also needs to be a negation of the replacement pattern above.
47664 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
47665 // sub accomplishes the negation of the replacement pattern.
47666 if (V == Y)
47667 std::swap(SubOp1, SubOp2);
47668
47669 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
47670 return DAG.getBitcast(VT, Res);
47671}
47672
47673 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
47674 const X86Subtarget &Subtarget) {
47675 using namespace SDPatternMatch;
47676 if (!Subtarget.hasAVX512())
47677 return SDValue();
47678
47679 ISD::CondCode CC;
47680 SDValue Cond, X, Y, LHS, RHS;
47681 if (!sd_match(N, m_VSelect(m_AllOf(m_Value(Cond),
47682 m_OneUse(m_SetCC(m_Value(X), m_Value(Y),
47683 m_CondCode(CC)))),
47684 m_Value(LHS), m_Value(RHS))))
47685 return SDValue();
47686
47687 if (canCombineAsMaskOperation(LHS, Subtarget) ||
47688 !canCombineAsMaskOperation(RHS, Subtarget))
47689 return SDValue();
47690
47691 // Commute LHS and RHS to create opportunity to select mask instruction.
47692 // (vselect M, L, R) -> (vselect ~M, R, L)
47693 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, X.getValueType());
47694 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), X, Y, NewCC);
47695 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
47696}
47697
47698 /// Do target-specific dag combines on SELECT and VSELECT nodes.
47699 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
47700 TargetLowering::DAGCombinerInfo &DCI,
47701 const X86Subtarget &Subtarget) {
47702 SDLoc DL(N);
47703 SDValue Cond = N->getOperand(0);
47704 SDValue LHS = N->getOperand(1);
47705 SDValue RHS = N->getOperand(2);
47706
47707 // Try simplification again because we use this function to optimize
47708 // BLENDV nodes that are not handled by the generic combiner.
47709 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
47710 return V;
47711
47712 // When AVX512 is available, the LHS operand of a select instruction can be
47713 // folded with a mask instruction, while the RHS operand can't. Commute the
47714 // LHS and RHS of the select instruction to create the opportunity for
47715 // folding.
47716 if (SDValue V = commuteSelect(N, DAG, DL, Subtarget))
47717 return V;
47718
47719 EVT VT = LHS.getValueType();
47720 EVT CondVT = Cond.getValueType();
47721 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47722 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
47723
47724 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
47725 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
47726 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
47727 if (CondVT.isVector() && CondVT.isInteger() &&
47728 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
47729 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
47730 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
47731 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
47732 DL, DAG, Subtarget))
47733 return V;
47734
47735 // If the sign bit is known then BLENDV can be folded away.
47736 if (N->getOpcode() == X86ISD::BLENDV) {
47737 KnownBits KnownCond = DAG.computeKnownBits(Cond);
47738 if (KnownCond.isNegative())
47739 return LHS;
47740 if (KnownCond.isNonNegative())
47741 return RHS;
47742 }
47743
47744 if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
47745 SmallVector<int, 64> CondMask;
47746 if (createShuffleMaskFromVSELECT(CondMask, Cond,
47747 N->getOpcode() == X86ISD::BLENDV)) {
47748 // Convert vselects with constant condition into shuffles.
47749 if (DCI.isBeforeLegalizeOps())
47750 return DAG.getVectorShuffle(VT, DL, LHS, RHS, CondMask);
47751
47752 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
47753 // by forcing the unselected elements to zero.
47754 // TODO: Can we handle more shuffles with this?
47755 if (LHS.hasOneUse() && RHS.hasOneUse()) {
47756 SmallVector<SDValue, 1> LHSOps, RHSOps;
47757 SmallVector<int, 64> LHSMask, RHSMask, ByteMask;
47758 SDValue LHSShuf = peekThroughOneUseBitcasts(LHS);
47759 SDValue RHSShuf = peekThroughOneUseBitcasts(RHS);
47760 if (LHSShuf.getOpcode() == X86ISD::PSHUFB &&
47761 RHSShuf.getOpcode() == X86ISD::PSHUFB &&
47762 scaleShuffleMaskElts(VT.getSizeInBits() / 8, CondMask, ByteMask) &&
47763 getTargetShuffleMask(LHSShuf, true, LHSOps, LHSMask) &&
47764 getTargetShuffleMask(RHSShuf, true, RHSOps, RHSMask)) {
47765 assert(ByteMask.size() == LHSMask.size() &&
47766 ByteMask.size() == RHSMask.size() && "Shuffle mask mismatch");
47767 for (auto [I, M] : enumerate(ByteMask)) {
47768 // getConstVector sets negative shuffle mask values as undef, so
47769 // ensure we hardcode SM_SentinelZero values to zero (0x80).
47770 if (M < (int)ByteMask.size()) {
47771 LHSMask[I] = isUndefOrZero(LHSMask[I]) ? 0x80 : LHSMask[I];
47772 RHSMask[I] = 0x80;
47773 } else {
47774 LHSMask[I] = 0x80;
47775 RHSMask[I] = isUndefOrZero(RHSMask[I]) ? 0x80 : RHSMask[I];
47776 }
47777 }
47778 MVT ByteVT = LHSShuf.getSimpleValueType();
47779 LHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, LHSOps[0],
47780 getConstVector(LHSMask, ByteVT, DAG, DL, true));
47781 RHS = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, RHSOps[0],
47782 getConstVector(RHSMask, ByteVT, DAG, DL, true));
47783 return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LHS, RHS));
47784 }
47785 }
47786
47787 // Attempt to combine as shuffle.
47788 SDValue Op(N, 0);
47789 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47790 return Res;
47791 }
47792 }
47793
47794 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
47795 // instructions match the semantics of the common C idiom x<y?x:y but not
47796 // x<=y?x:y, because of how they handle negative zero (which can be
47797 // ignored in unsafe-math mode).
47798 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
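// Worked example (illustrative): for 'x < y ? x : y', MINSS returns its second
// operand whenever the compare is false or unordered, so NaN inputs and the
// (+0.0, -0.0) pair behave exactly like the C expression. For 'x <= y ? x : y'
// with x = -0.0 and y = +0.0 the C expression yields -0.0 but MINSS(-0.0, +0.0)
// yields +0.0, which is why that form is only safe when signed zeros are
// ignorable.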
47799 if ((Cond.getOpcode() == ISD::SETCC ||
47800 Cond.getOpcode() == ISD::STRICT_FSETCCS) &&
47801 VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 &&
47802 !isSoftF16(VT, Subtarget) && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
47803 ((VT != MVT::v8f16 && VT != MVT::v16f16) || Subtarget.hasVLX()) &&
47804 (Subtarget.hasSSE2() ||
47805 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
47806 bool IsStrict = Cond->isStrictFPOpcode();
47807 ISD::CondCode CC =
47808 cast<CondCodeSDNode>(Cond.getOperand(IsStrict ? 3 : 2))->get();
47809 SDValue Op0 = Cond.getOperand(IsStrict ? 1 : 0);
47810 SDValue Op1 = Cond.getOperand(IsStrict ? 2 : 1);
47811
47812 unsigned Opcode = 0;
47813 // Check for x CC y ? x : y.
47814 if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
47815 switch (CC) {
47816 default: break;
47817 case ISD::SETULT:
47818 // Converting this to a min would handle NaNs incorrectly, and swapping
47819 // the operands would cause it to handle comparisons between positive
47820 // and negative zero incorrectly.
47821 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47822 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47823 !(DAG.isKnownNeverZeroFloat(LHS) ||
47824 DAG.isKnownNeverZeroFloat(RHS)))
47825 break;
47826 std::swap(LHS, RHS);
47827 }
47828 Opcode = X86ISD::FMIN;
47829 break;
47830 case ISD::SETOLE:
47831 // Converting this to a min would handle comparisons between positive
47832 // and negative zero incorrectly.
47833 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47834 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47835 break;
47836 Opcode = X86ISD::FMIN;
47837 break;
47838 case ISD::SETULE:
47839 // Converting this to a min would handle both negative zeros and NaNs
47840 // incorrectly, but we can swap the operands to fix both.
47841 std::swap(LHS, RHS);
47842 [[fallthrough]];
47843 case ISD::SETOLT:
47844 case ISD::SETLT:
47845 case ISD::SETLE:
47846 Opcode = X86ISD::FMIN;
47847 break;
47848
47849 case ISD::SETOGE:
47850 // Converting this to a max would handle comparisons between positive
47851 // and negative zero incorrectly.
47852 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47853 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
47854 break;
47855 Opcode = X86ISD::FMAX;
47856 break;
47857 case ISD::SETUGT:
47858 // Converting this to a max would handle NaNs incorrectly, and swapping
47859 // the operands would cause it to handle comparisons between positive
47860 // and negative zero incorrectly.
47861 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
47862 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47863 !(DAG.isKnownNeverZeroFloat(LHS) ||
47864 DAG.isKnownNeverZeroFloat(RHS)))
47865 break;
47866 std::swap(LHS, RHS);
47867 }
47868 Opcode = X86ISD::FMAX;
47869 break;
47870 case ISD::SETUGE:
47871 // Converting this to a max would handle both negative zeros and NaNs
47872 // incorrectly, but we can swap the operands to fix both.
47873 std::swap(LHS, RHS);
47874 [[fallthrough]];
47875 case ISD::SETOGT:
47876 case ISD::SETGT:
47877 case ISD::SETGE:
47878 Opcode = X86ISD::FMAX;
47879 break;
47880 }
47881 // Check for x CC y ? y : x -- a min/max with reversed arms.
47882 } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
47883 switch (CC) {
47884 default: break;
47885 case ISD::SETOGE:
47886 // Converting this to a min would handle comparisons between positive
47887 // and negative zero incorrectly, and swapping the operands would
47888 // cause it to handle NaNs incorrectly.
47889 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47890 !(DAG.isKnownNeverZeroFloat(LHS) ||
47891 DAG.isKnownNeverZeroFloat(RHS))) {
47892 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47893 break;
47894 std::swap(LHS, RHS);
47895 }
47896 Opcode = X86ISD::FMIN;
47897 break;
47898 case ISD::SETUGT:
47899 // Converting this to a min would handle NaNs incorrectly.
47900 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47901 break;
47902 Opcode = X86ISD::FMIN;
47903 break;
47904 case ISD::SETUGE:
47905 // Converting this to a min would handle both negative zeros and NaNs
47906 // incorrectly, but we can swap the operands to fix both.
47907 std::swap(LHS, RHS);
47908 [[fallthrough]];
47909 case ISD::SETOGT:
47910 case ISD::SETGT:
47911 case ISD::SETGE:
47912 Opcode = X86ISD::FMIN;
47913 break;
47914
47915 case ISD::SETULT:
47916 // Converting this to a max would handle NaNs incorrectly.
47917 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47918 break;
47919 Opcode = X86ISD::FMAX;
47920 break;
47921 case ISD::SETOLE:
47922 // Converting this to a max would handle comparisons between positive
47923 // and negative zero incorrectly, and swapping the operands would
47924 // cause it to handle NaNs incorrectly.
47925 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
47926 !DAG.isKnownNeverZeroFloat(LHS) &&
47927 !DAG.isKnownNeverZeroFloat(RHS)) {
47928 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
47929 break;
47930 std::swap(LHS, RHS);
47931 }
47932 Opcode = X86ISD::FMAX;
47933 break;
47934 case ISD::SETULE:
47935 // Converting this to a max would handle both negative zeros and NaNs
47936 // incorrectly, but we can swap the operands to fix both.
47937 std::swap(LHS, RHS);
47938 [[fallthrough]];
47939 case ISD::SETOLT:
47940 case ISD::SETLT:
47941 case ISD::SETLE:
47942 Opcode = X86ISD::FMAX;
47943 break;
47944 }
47945 }
47946
47947 if (Opcode) {
47948 if (IsStrict) {
47949 SDValue Ret = DAG.getNode(Opcode == X86ISD::FMIN ? X86ISD::STRICT_FMIN
47950 : X86ISD::STRICT_FMAX,
47951 DL, {N->getValueType(0), MVT::Other},
47952 {Cond.getOperand(0), LHS, RHS});
47953 DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Ret.getValue(1));
47954 return Ret;
47955 }
47956 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
47957 }
47958 }
47959
47960 // Some mask scalar intrinsics rely on checking if only one bit is set
47961 // and implement it in C code like this:
47962 // A[0] = (U & 1) ? A[0] : W[0];
47963 // This creates some redundant instructions that break pattern matching.
47964 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
47965 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
47966 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
47967 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
47968 SDValue AndNode = Cond.getOperand(0);
47969 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
47970 isNullConstant(Cond.getOperand(1)) &&
47971 isOneConstant(AndNode.getOperand(1))) {
47972 // LHS and RHS swapped due to
47973 // setcc outputting 1 when AND resulted in 0 and vice versa.
47974 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
47975 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
47976 }
47977 }
47978
47979 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
47980 // lowering on KNL. In this case we convert it to
47981 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
47982 // The same situation applies to all vectors of i8 and i16 without BWI.
47983 // Make sure we extend these even before type legalization gets a chance to
47984 // split wide vectors.
47985 // Since SKX these selects have a proper lowering.
47986 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
47987 CondVT.getVectorElementType() == MVT::i1 &&
47988 (VT.getVectorElementType() == MVT::i8 ||
47989 VT.getVectorElementType() == MVT::i16)) {
47990 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
47991 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
47992 }
47993
47994 // AVX512 - Extend select to merge with target shuffle.
47995 // select(mask, extract_subvector(shuffle(x)), y) -->
47996 // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
47997 // TODO - support non target shuffles as well with canCombineAsMaskOperation.
47998 if (Subtarget.hasAVX512() && CondVT.isVector() &&
47999 CondVT.getVectorElementType() == MVT::i1) {
48000 auto SelectableOp = [&TLI](SDValue Op, SDValue Alt) {
48001 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
48002 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
48003 isNullConstant(Op.getOperand(1)) &&
48004 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
48005 Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
48006 (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
48007 ISD::isBuildVectorAllZeros(Alt.getNode()));
48008 };
48009
48010 bool SelectableLHS = SelectableOp(LHS, RHS);
48011 bool SelectableRHS = SelectableOp(RHS, LHS);
48012 if (SelectableLHS || SelectableRHS) {
48013 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
48014 : RHS.getOperand(0).getValueType();
48015 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
48016 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
48017 VT.getSizeInBits());
48018 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
48019 VT.getSizeInBits());
48020 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
48021 DAG.getUNDEF(SrcCondVT), Cond,
48022 DAG.getVectorIdxConstant(0, DL));
48023 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
48024 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
48025 }
48026 }
48027
48028 if (SDValue V = combineSelectOfTwoConstants(N, DAG, DL))
48029 return V;
48030
48031 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
48032 Cond.hasOneUse()) {
48033 EVT CondVT = Cond.getValueType();
48034 SDValue Cond0 = Cond.getOperand(0);
48035 SDValue Cond1 = Cond.getOperand(1);
48036 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
48037
48038 // Canonicalize min/max:
48039 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
48040 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
48041 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
48042 // the need for an extra compare against zero. e.g.
48043 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
48044 // subl %esi, %edi
48045 // testl %edi, %edi
48046 // movl $0, %eax
48047 // cmovgl %edi, %eax
48048 // =>
48049 // xorl %eax, %eax
48050 // subl %esi, %edi
48051 // cmovsl %eax, %edi
48052 //
48053 // We can also canonicalize
48054 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
48055 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
48056 // This allows the use of a test instruction for the compare.
48057 if (LHS == Cond0 && RHS == Cond1) {
48058 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
48059 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
48060 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
48061 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48062 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48063 }
48064 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
48065 ISD::CondCode NewCC = ISD::SETUGE;
48066 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
48067 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
48068 }
48069 }
48070
48071 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
48072 // fold eq + gt/lt nested selects into ge/le selects
48073 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
48074 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
48075 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
48076 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
48077 // .. etc ..
48078 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
48079 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
48080 SDValue InnerSetCC = RHS.getOperand(0);
48081 ISD::CondCode InnerCC =
48082 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
48083 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
48084 Cond0 == InnerSetCC.getOperand(0) &&
48085 Cond1 == InnerSetCC.getOperand(1)) {
48086 ISD::CondCode NewCC;
48087 switch (CC == ISD::SETEQ ? InnerCC : CC) {
48088 // clang-format off
48089 case ISD::SETGT: NewCC = ISD::SETGE; break;
48090 case ISD::SETLT: NewCC = ISD::SETLE; break;
48091 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
48092 case ISD::SETULT: NewCC = ISD::SETULE; break;
48093 default: NewCC = ISD::SETCC_INVALID; break;
48094 // clang-format on
48095 }
48096 if (NewCC != ISD::SETCC_INVALID) {
48097 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
48098 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
48099 }
48100 }
48101 }
48102 }
48103
48104 // Check if the first operand is all zeros and Cond type is vXi1.
48105 // If this is an avx512 target, we can improve the use of zero masking by
48106 // swapping the operands and inverting the condition.
48107 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48108 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48109 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48110 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
48111 // Invert the cond to not(cond) : xor(op,allones)=not(op)
48112 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48113 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48114 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48115 }
48116
48117 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
48118 // get split by legalization.
48119 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
48120 CondVT.getVectorElementType() == MVT::i1 &&
48121 TLI.isTypeLegal(VT.getScalarType())) {
48122 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
48123 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
48124 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
48125 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
48126 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
48127 }
48128 }
48129
48130 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
48131 // with well-defined behaviour for out-of-bounds shift amounts.
48132
48133 // Unlike general shift instructions (SHL/SRL), AVX2's VSHLV/VSRLV handle
48134 // shift amounts that equal or exceed the element bitwidth: for such amounts
48135 // both the variable left shift and the variable logical right shift produce
48136 // zero, matching the zero operand of the select patterns folded below, so
48137 // the compare-and-select wrapper can be dropped.
48138 if (N->getOpcode() == ISD::VSELECT) {
48139 using namespace llvm::SDPatternMatch;
48140 // fold select(icmp_ult(amt,BW),shl(x,amt),0) -> avx2 psllv(x,amt)
48141 // fold select(icmp_ult(amt,BW),srl(x,amt),0) -> avx2 psrlv(x,amt)
48142 if ((LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SHL) &&
48143 supportedVectorVarShift(VT, Subtarget, LHS.getOpcode()) &&
48144 ISD::isBuildVectorAllZeros(RHS.getNode()) &&
48145 sd_match(Cond, m_SetCC(m_Specific(LHS.getOperand(1)),
48146 m_SpecificInt(VT.getScalarSizeInBits()),
48147 m_SpecificCondCode(ISD::SETULT)))) {
48148 return DAG.getNode(LHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48149 : X86ISD::VSHLV,
48150 DL, VT, LHS.getOperand(0), LHS.getOperand(1));
48151 }
48152 // fold select(icmp_uge(amt,BW),0,shl(x,amt)) -> avx2 psllv(x,amt)
48153 // fold select(icmp_uge(amt,BW),0,srl(x,amt)) -> avx2 psrlv(x,amt)
48154 if ((RHS.getOpcode() == ISD::SRL || RHS.getOpcode() == ISD::SHL) &&
48155 supportedVectorVarShift(VT, Subtarget, RHS.getOpcode()) &&
48156 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48157 sd_match(Cond, m_SetCC(m_Specific(RHS.getOperand(1)),
48158 m_SpecificInt(VT.getScalarSizeInBits()),
48159 m_SpecificCondCode(ISD::SETUGE)))) {
48160 return DAG.getNode(RHS.getOpcode() == ISD::SRL ? X86ISD::VSRLV
48161 : X86ISD::VSHLV,
48162 DL, VT, RHS.getOperand(0), RHS.getOperand(1));
48163 }
48164 }
48165
48166 // Early exit check
48167 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
48168 return SDValue();
48169
48170 if (SDValue V = combineVSelectToBLENDV(N, DAG, DL, DCI, Subtarget))
48171 return V;
48172
48173 if (SDValue V = narrowVectorSelect(N, DAG, DL, Subtarget))
48174 return V;
48175
48176 // select(~Cond, X, Y) -> select(Cond, Y, X)
48177 if (CondVT.getScalarType() != MVT::i1) {
48178 if (SDValue CondNot = IsNOT(Cond, DAG))
48179 return DAG.getNode(N->getOpcode(), DL, VT,
48180 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
48181
48182 // select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
48183 if (Cond.getOpcode() == X86ISD::PCMPEQ &&
48184 Cond.getOperand(0).getOpcode() == ISD::AND &&
48185 ISD::isBuildVectorAllZeros(Cond.getOperand(1).getNode()) &&
48186 isConstantPowerOf2(Cond.getOperand(0).getOperand(1),
48187 Cond.getScalarValueSizeInBits(),
48188 /*AllowUndefs=*/true) &&
48189 Cond.hasOneUse()) {
48190 Cond = DAG.getNode(X86ISD::PCMPEQ, DL, CondVT, Cond.getOperand(0),
48191 Cond.getOperand(0).getOperand(1));
48192 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48193 }
48194
48195 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
48196 // signbit.
48197 if (Cond.getOpcode() == X86ISD::PCMPGT &&
48198 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
48199 Cond.hasOneUse()) {
48200 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
48201 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
48202 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
48203 }
48204 }
48205
48206 // Try to optimize vXi1 selects if both operands are either all constants or
48207 // bitcasts from scalar integer type. In that case we can convert the operands
48208 // to integer and use an integer select which will be converted to a CMOV.
48209 // We need to take a little bit of care to avoid creating an i64 type after
48210 // type legalization.
48211 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
48212 VT.getVectorElementType() == MVT::i1 &&
48213 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
48214 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
48215 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
48216 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
48217 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
48218
48219 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
48220 LHS.getOperand(0).getValueType() == IntVT)) &&
48221 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
48222 RHS.getOperand(0).getValueType() == IntVT))) {
48223 if (LHSIsConst)
48224 LHS = combinevXi1ConstantToInteger(LHS, DAG);
48225 else
48226 LHS = LHS.getOperand(0);
48227
48228 if (RHSIsConst)
48229 RHS = combinevXi1ConstantToInteger(RHS, DAG);
48230 else
48231 RHS = RHS.getOperand(0);
48232
48233 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
48234 return DAG.getBitcast(VT, Select);
48235 }
48236 }
48237 }
48238
48239 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
48240 // single bits, then invert the predicate and swap the select operands.
48241 // This can lower using a vector shift bit-hack rather than mask and compare.
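// Illustrative sketch: for a v4i32 mask constant of <1,2,4,8>, shifting the
// lanes left by <31,30,29,28> moves each tested bit into the sign bit, so the
// condition becomes a '(shl X, C') < 0' signbit test that a sign-bit blend can
// consume directly instead of an AND plus a compare against zero.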
48242 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
48243 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
48244 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
48245 Cond.getOperand(0).getOpcode() == ISD::AND &&
48246 isNullOrNullSplat(Cond.getOperand(1)) &&
48247 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
48248 Cond.getOperand(0).getValueType() == VT) {
48249 // The 'and' mask must be composed of power-of-2 constants.
48250 SDValue And = Cond.getOperand(0);
48251 auto *C = isConstOrConstSplat(And.getOperand(1));
48252 if (C && C->getAPIntValue().isPowerOf2()) {
48253 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
48254 SDValue NotCond =
48255 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
48256 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
48257 }
48258
48259 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
48260 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
48261 // 16-bit lacks a proper blendv.
48262 unsigned EltBitWidth = VT.getScalarSizeInBits();
48263 bool CanShiftBlend =
48264 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
48265 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
48266 (Subtarget.hasXOP()));
48267 if (CanShiftBlend &&
48268 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
48269 return C->getAPIntValue().isPowerOf2();
48270 })) {
48271 // Create a left-shift constant to get the mask bits over to the sign-bit.
48272 SDValue Mask = And.getOperand(1);
48273 SmallVector<int, 32> ShlVals;
48274 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
48275 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
48276 ShlVals.push_back(EltBitWidth - 1 -
48277 MaskVal->getAPIntValue().exactLogBase2());
48278 }
48279 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
48280 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
48281 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
48282 SDValue NewCond =
48283 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
48284 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
48285 }
48286 }
48287
48288 return SDValue();
48289}
48290
48291/// Combine:
48292/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
48293/// to:
48294/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
48295/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
48296/// Note that this is only legal for some op/cc combinations.
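/// Illustrative example: for 'if (atomic_fetch_add(&x, 1) < 0)' the COND_S
/// test of the loaded value can instead reuse the EFLAGS of the LOCK-prefixed
/// add with COND_LE (the condition-code change also accounts for signed
/// overflow), so the old value need not stay live for a separate compare.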
48297static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
48298 SelectionDAG &DAG,
48299 const X86Subtarget &Subtarget) {
48300 // This combine only operates on CMP-like nodes.
48301 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48302 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48303 return SDValue();
48304
48305 // Can't replace the cmp if it has more uses than the one we're looking at.
48306 // FIXME: We would like to be able to handle this, but would need to make sure
48307 // all uses were updated.
48308 if (!Cmp.hasOneUse())
48309 return SDValue();
48310
48311 // This only applies to variations of the common case:
48312 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
48313 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
48314 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
48315 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
48316 // Using the proper condcodes (see below), overflow is checked for.
48317
48318 // FIXME: We can generalize both constraints:
48319 // - XOR/OR/AND (if they were made to survive AtomicExpand)
48320 // - LHS != 1
48321 // if the result is compared.
48322
48323 SDValue CmpLHS = Cmp.getOperand(0);
48324 SDValue CmpRHS = Cmp.getOperand(1);
48325 EVT CmpVT = CmpLHS.getValueType();
48326
48327 if (!CmpLHS.hasOneUse())
48328 return SDValue();
48329
48330 unsigned Opc = CmpLHS.getOpcode();
48331 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
48332 return SDValue();
48333
48334 SDValue OpRHS = CmpLHS.getOperand(2);
48335 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
48336 if (!OpRHSC)
48337 return SDValue();
48338
48339 APInt Addend = OpRHSC->getAPIntValue();
48340 if (Opc == ISD::ATOMIC_LOAD_SUB)
48341 Addend = -Addend;
48342
48343 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
48344 if (!CmpRHSC)
48345 return SDValue();
48346
48347 APInt Comparison = CmpRHSC->getAPIntValue();
48348 APInt NegAddend = -Addend;
48349
48350 // See if we can adjust the CC to make the comparison match the negated
48351 // addend.
48352 if (Comparison != NegAddend) {
48353 APInt IncComparison = Comparison + 1;
48354 if (IncComparison == NegAddend) {
48355 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
48356 Comparison = IncComparison;
48357 CC = X86::COND_AE;
48358 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
48359 Comparison = IncComparison;
48360 CC = X86::COND_L;
48361 }
48362 }
48363 APInt DecComparison = Comparison - 1;
48364 if (DecComparison == NegAddend) {
48365 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
48366 Comparison = DecComparison;
48367 CC = X86::COND_A;
48368 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
48369 Comparison = DecComparison;
48370 CC = X86::COND_LE;
48371 }
48372 }
48373 }
48374
48375 // If the addend is the negation of the comparison value, then we can do
48376 // a full comparison by emitting the atomic arithmetic as a locked sub.
48377 if (Comparison == NegAddend) {
48378 // The CC is fine, but we need to rewrite the LHS of the comparison as an
48379 // atomic sub.
48380 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
48381 auto AtomicSub = DAG.getAtomic(
48382 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
48383 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
48384 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
48385 AN->getMemOperand());
48386 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
48387 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48388 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48389 return LockOp;
48390 }
48391
48392 // We can handle comparisons with zero in a number of cases by manipulating
48393 // the CC used.
48394 if (!Comparison.isZero())
48395 return SDValue();
48396
48397 if (CC == X86::COND_S && Addend == 1)
48398 CC = X86::COND_LE;
48399 else if (CC == X86::COND_NS && Addend == 1)
48400 CC = X86::COND_G;
48401 else if (CC == X86::COND_G && Addend == -1)
48402 CC = X86::COND_GE;
48403 else if (CC == X86::COND_LE && Addend == -1)
48404 CC = X86::COND_L;
48405 else
48406 return SDValue();
48407
48408 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
48409 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
48410 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
48411 return LockOp;
48412}
48413
48414// Check whether we're just testing the signbit, and whether we can simplify
48415// this by tracking where the signbit came from.
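// Illustrative example: 'cmp (sra X, 31), 0' followed by a COND_S branch only
// observes the sign bit of X, so it can become 'test X, 0x80000000' with
// COND_NE, removing the shift from the dependency chain.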
48416static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
48417 SelectionDAG &DAG) {
48418 if (CC != X86::COND_S && CC != X86::COND_NS)
48419 return SDValue();
48420
48421 if (!Cmp.hasOneUse())
48422 return SDValue();
48423
48424 SDValue Src;
48425 if (Cmp.getOpcode() == X86ISD::CMP) {
48426 // CMP(X,0) -> signbit test
48427 if (!isNullConstant(Cmp.getOperand(1)))
48428 return SDValue();
48429 Src = Cmp.getOperand(0);
48430 // Peek through a SRA node as we just need the signbit.
48431 // TODO: Remove one use limit once sdiv-fix regressions are fixed.
48432 // TODO: Use SimplifyDemandedBits instead of just SRA?
48433 if (Src.getOpcode() != ISD::SRA || !Src.hasOneUse())
48434 return SDValue();
48435 Src = Src.getOperand(0);
48436 } else if (Cmp.getOpcode() == X86ISD::OR) {
48437 // OR(X,Y) -> see if only one operand contributes to the signbit.
48438 // TODO: XOR(X,Y) -> see if only one operand contributes to the signbit.
48439 if (DAG.SignBitIsZero(Cmp.getOperand(0)))
48440 Src = Cmp.getOperand(1);
48441 else if (DAG.SignBitIsZero(Cmp.getOperand(1)))
48442 Src = Cmp.getOperand(0);
48443 else
48444 return SDValue();
48445 } else {
48446 return SDValue();
48447 }
48448
48449 // Replace with a TEST on the MSB.
48450 SDLoc DL(Cmp);
48451 MVT SrcVT = Src.getSimpleValueType();
48452 APInt BitMask = APInt::getSignMask(SrcVT.getScalarSizeInBits());
48453
48454 // If Src came from a SIGN_EXTEND_INREG or SHL (probably from an expanded
48455 // SIGN_EXTEND_INREG), then peek through and adjust the TEST bit.
48456 if (Src.getOpcode() == ISD::SHL) {
48457 if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
48458 Src = Src.getOperand(0);
48459 BitMask.lshrInPlace(*ShiftAmt);
48460 }
48461 } else if (Src.getOpcode() == ISD::SIGN_EXTEND_INREG) {
48462 EVT ExtVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
48463 Src = Src.getOperand(0);
48464 BitMask.lshrInPlace(BitMask.getBitWidth() - ExtVT.getScalarSizeInBits());
48465 }
48466
48467 SDValue Mask = DAG.getNode(ISD::AND, DL, SrcVT, Src,
48468 DAG.getConstant(BitMask, DL, SrcVT));
48469 CC = CC == X86::COND_S ? X86::COND_NE : X86::COND_E;
48470 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Mask,
48471 DAG.getConstant(0, DL, SrcVT));
48472}
48473
48474// Check whether a boolean test is testing a boolean value generated by
48475// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
48476// code.
48477//
48478// Simplify the following patterns:
48479// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
48480// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
48481// to (Op EFLAGS Cond)
48482//
48483// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
48484// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
48485// to (Op EFLAGS !Cond)
48486//
48487// where Op could be BRCOND or CMOV.
48488//
48489static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
48490 // This combine only operates on CMP-like nodes.
48491 if (!(Cmp.getOpcode() == X86ISD::CMP ||
48492 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
48493 return SDValue();
48494
48495 // Quit if not used as a boolean value.
48496 if (CC != X86::COND_E && CC != X86::COND_NE)
48497 return SDValue();
48498
48499 // Check CMP operands. One of them should be 0 or 1 and the other should be
48500 // a SetCC or extended from it.
48501 SDValue Op1 = Cmp.getOperand(0);
48502 SDValue Op2 = Cmp.getOperand(1);
48503
48504 SDValue SetCC;
48505 const ConstantSDNode* C = nullptr;
48506 bool needOppositeCond = (CC == X86::COND_E);
48507 bool checkAgainstTrue = false; // Is it a comparison against 1?
48508
48509 if ((C = dyn_cast<ConstantSDNode>(Op1)))
48510 SetCC = Op2;
48511 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
48512 SetCC = Op1;
48513 else // Quit if neither operand is a constant.
48514 return SDValue();
48515
48516 if (C->getZExtValue() == 1) {
48517 needOppositeCond = !needOppositeCond;
48518 checkAgainstTrue = true;
48519 } else if (C->getZExtValue() != 0)
48520 // Quit if the constant is neither 0 nor 1.
48521 return SDValue();
48522
48523 bool truncatedToBoolWithAnd = false;
48524 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
48525 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
48526 SetCC.getOpcode() == ISD::TRUNCATE ||
48527 SetCC.getOpcode() == ISD::AND) {
48528 if (SetCC.getOpcode() == ISD::AND) {
48529 int OpIdx = -1;
48530 if (isOneConstant(SetCC.getOperand(0)))
48531 OpIdx = 1;
48532 if (isOneConstant(SetCC.getOperand(1)))
48533 OpIdx = 0;
48534 if (OpIdx < 0)
48535 break;
48536 SetCC = SetCC.getOperand(OpIdx);
48537 truncatedToBoolWithAnd = true;
48538 } else
48539 SetCC = SetCC.getOperand(0);
48540 }
48541
48542 switch (SetCC.getOpcode()) {
48543 case X86ISD::SETCC_CARRY:
48544 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
48545 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
48546 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
48547 // truncated to i1 using 'and'.
48548 if (checkAgainstTrue && !truncatedToBoolWithAnd)
48549 break;
48551 "Invalid use of SETCC_CARRY!");
48552 [[fallthrough]];
48553 case X86ISD::SETCC:
48554 // Set the condition code or opposite one if necessary.
48555 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
48556 if (needOppositeCond)
48557 CC = X86::GetOppositeBranchCondition(CC);
48558 return SetCC.getOperand(1);
48559 case X86ISD::CMOV: {
48560 // Check whether false/true value has canonical one, i.e. 0 or 1.
48561 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
48562 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
48563 // Quit if true value is not a constant.
48564 if (!TVal)
48565 return SDValue();
48566 // Quit if false value is not a constant.
48567 if (!FVal) {
48568 SDValue Op = SetCC.getOperand(0);
48569 // Skip 'zext' or 'trunc' node.
48570 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
48571 Op.getOpcode() == ISD::TRUNCATE)
48572 Op = Op.getOperand(0);
48573 // A special case for rdrand/rdseed, where 0 is set if false cond is
48574 // found.
48575 if ((Op.getOpcode() != X86ISD::RDRAND &&
48576 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
48577 return SDValue();
48578 }
48579 // Quit if false value is not the constant 0 or 1.
48580 bool FValIsFalse = true;
48581 if (FVal && FVal->getZExtValue() != 0) {
48582 if (FVal->getZExtValue() != 1)
48583 return SDValue();
48584 // If FVal is 1, opposite cond is needed.
48585 needOppositeCond = !needOppositeCond;
48586 FValIsFalse = false;
48587 }
48588 // Quit if TVal is not the constant opposite of FVal.
48589 if (FValIsFalse && TVal->getZExtValue() != 1)
48590 return SDValue();
48591 if (!FValIsFalse && TVal->getZExtValue() != 0)
48592 return SDValue();
48593 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
48594 if (needOppositeCond)
48595 CC = X86::GetOppositeBranchCondition(CC);
48596 return SetCC.getOperand(3);
48597 }
48598 }
48599
48600 return SDValue();
48601}
48602
48603/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
48604/// Match:
48605/// (X86or (X86setcc) (X86setcc))
48606/// (X86cmp (and (X86setcc) (X86setcc)), 0)
48607static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
48608 X86::CondCode &CC1, SDValue &Flags,
48609 bool &isAnd) {
48610 if (Cond->getOpcode() == X86ISD::CMP) {
48611 if (!isNullConstant(Cond->getOperand(1)))
48612 return false;
48613
48614 Cond = Cond->getOperand(0);
48615 }
48616
48617 isAnd = false;
48618
48619 SDValue SetCC0, SetCC1;
48620 switch (Cond->getOpcode()) {
48621 default: return false;
48622 case ISD::AND:
48623 case X86ISD::AND:
48624 isAnd = true;
48625 [[fallthrough]];
48626 case ISD::OR:
48627 case X86ISD::OR:
48628 SetCC0 = Cond->getOperand(0);
48629 SetCC1 = Cond->getOperand(1);
48630 break;
48631 };
48632
48633 // Make sure we have SETCC nodes, using the same flags value.
48634 if (SetCC0.getOpcode() != X86ISD::SETCC ||
48635 SetCC1.getOpcode() != X86ISD::SETCC ||
48636 SetCC0->getOperand(1) != SetCC1->getOperand(1))
48637 return false;
48638
48639 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
48640 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
48641 Flags = SetCC0->getOperand(1);
48642 return true;
48643}
48644
48645// When legalizing carry, we create carries via add X, -1
48646// If that comes from an actual carry, via setcc, we use the
48647// carry directly.
48648static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
48649 if (EFLAGS.getOpcode() == X86ISD::ADD) {
48650 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
48651 bool FoundAndLSB = false;
48652 SDValue Carry = EFLAGS.getOperand(0);
48653 while (Carry.getOpcode() == ISD::TRUNCATE ||
48654 Carry.getOpcode() == ISD::ZERO_EXTEND ||
48655 (Carry.getOpcode() == ISD::AND &&
48656 isOneConstant(Carry.getOperand(1)))) {
48657 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
48658 Carry = Carry.getOperand(0);
48659 }
48660 if (Carry.getOpcode() == X86ISD::SETCC ||
48661 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
48662 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
48663 uint64_t CarryCC = Carry.getConstantOperandVal(0);
48664 SDValue CarryOp1 = Carry.getOperand(1);
48665 if (CarryCC == X86::COND_B)
48666 return CarryOp1;
48667 if (CarryCC == X86::COND_A) {
48668 // Try to convert COND_A into COND_B in an attempt to facilitate
48669 // materializing "setb reg".
48670 //
48671 // Do not flip "e > c", where "c" is a constant, because Cmp
48672 // instruction cannot take an immediate as its first operand.
48673 //
48674 if (CarryOp1.getOpcode() == X86ISD::SUB &&
48675 CarryOp1.getNode()->hasOneUse() &&
48676 CarryOp1.getValueType().isInteger() &&
48677 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
48678 SDValue SubCommute =
48679 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
48680 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
48681 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
48682 }
48683 }
48684 // If this is a check of the z flag of an add with 1, switch to the
48685 // C flag.
48686 if (CarryCC == X86::COND_E &&
48687 CarryOp1.getOpcode() == X86ISD::ADD &&
48688 isOneConstant(CarryOp1.getOperand(1)))
48689 return CarryOp1;
48690 } else if (FoundAndLSB) {
48691 SDLoc DL(Carry);
48692 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
48693 if (Carry.getOpcode() == ISD::SRL) {
48694 BitNo = Carry.getOperand(1);
48695 Carry = Carry.getOperand(0);
48696 }
48697 return getBT(Carry, BitNo, DL, DAG);
48698 }
48699 }
48700 }
48701
48702 return SDValue();
48703}
48704
48705 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
48706/// to avoid the inversion.
48707static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
48708 SelectionDAG &DAG,
48709 const X86Subtarget &Subtarget) {
48710 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
48711 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
48712 EFLAGS.getOpcode() != X86ISD::TESTP)
48713 return SDValue();
48714
48715 // PTEST/TESTP sets EFLAGS as:
48716 // TESTZ: ZF = (Op0 & Op1) == 0
48717 // TESTC: CF = (~Op0 & Op1) == 0
48718 // TESTNZC: ZF == 0 && CF == 0
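// For example, TESTZ(X,~Y) and TESTC(Y,X) test the same predicate, since
// ZF = ((X & ~Y) == 0) is exactly CF = ((~Y & X) == 0); this is why several of
// the rewrites below can trade a NOT on an operand for a condition-code change.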
48719 MVT VT = EFLAGS.getSimpleValueType();
48720 SDValue Op0 = EFLAGS.getOperand(0);
48721 SDValue Op1 = EFLAGS.getOperand(1);
48722 MVT OpVT = Op0.getSimpleValueType();
48723 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48724
48725 // TEST*(~X,Y) == TEST*(X,Y)
48726 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
48727 X86::CondCode InvCC;
48728 switch (CC) {
48729 case X86::COND_B:
48730 // testc -> testz.
48731 InvCC = X86::COND_E;
48732 break;
48733 case X86::COND_AE:
48734 // !testc -> !testz.
48735 InvCC = X86::COND_NE;
48736 break;
48737 case X86::COND_E:
48738 // testz -> testc.
48739 InvCC = X86::COND_B;
48740 break;
48741 case X86::COND_NE:
48742 // !testz -> !testc.
48743 InvCC = X86::COND_AE;
48744 break;
48745 case X86::COND_A:
48746 case X86::COND_BE:
48747 // testnzc -> testnzc (no change).
48748 InvCC = CC;
48749 break;
48750 default:
48751 InvCC = X86::COND_INVALID;
48752 break;
48753 }
48754
48755 if (InvCC != X86::COND_INVALID) {
48756 CC = InvCC;
48757 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48758 DAG.getBitcast(OpVT, NotOp0), Op1);
48759 }
48760 }
48761
48762 if (CC == X86::COND_B || CC == X86::COND_AE) {
48763 // TESTC(X,~X) == TESTC(X,-1)
48764 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48765 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
48766 SDLoc DL(EFLAGS);
48767 return DAG.getNode(
48768 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
48769 DAG.getBitcast(OpVT,
48770 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
48771 }
48772 }
48773 // PTESTC(PCMPEQ(X,0),-1) == PTESTZ(X,X)
48774 if (EFLAGS.getOpcode() == X86ISD::PTEST &&
48775 ISD::isBuildVectorAllOnes(Op1.getNode())) {
48776 SDValue BC0 = peekThroughBitcasts(Op0);
48777 if (BC0.getOpcode() == X86ISD::PCMPEQ &&
48778 ISD::isBuildVectorAllZeros(BC0.getOperand(1).getNode())) {
48779 SDLoc DL(EFLAGS);
48780 CC = (CC == X86::COND_B ? X86::COND_E : X86::COND_NE);
48781 SDValue X = DAG.getBitcast(OpVT, BC0.getOperand(0));
48782 return DAG.getNode(EFLAGS.getOpcode(), DL, VT, X, X);
48783 }
48784 }
48785 }
48786
48787 if (CC == X86::COND_E || CC == X86::COND_NE) {
48788 // TESTZ(X,~Y) == TESTC(Y,X)
48789 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
48790 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48791 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48792 DAG.getBitcast(OpVT, NotOp1), Op0);
48793 }
48794
48795 if (Op0 == Op1) {
48796 SDValue BC = peekThroughBitcasts(Op0);
48797 EVT BCVT = BC.getValueType();
48798
48799 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
48800 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
48801 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48802 DAG.getBitcast(OpVT, BC.getOperand(0)),
48803 DAG.getBitcast(OpVT, BC.getOperand(1)));
48804 }
48805
48806 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
48807 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
48808 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
48809 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48810 DAG.getBitcast(OpVT, BC.getOperand(0)),
48811 DAG.getBitcast(OpVT, BC.getOperand(1)));
48812 }
48813
48814 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
48815 // to more efficiently extract the sign bits and compare that.
48816 // TODO: Handle TESTC with comparison inversion.
48817 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
48818 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
48819 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
48820 unsigned EltBits = BCVT.getScalarSizeInBits();
48821 if (DAG.ComputeNumSignBits(BC) == EltBits) {
48822 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
48823 APInt SignMask = APInt::getSignMask(EltBits);
48824 if (SDValue Res =
48825 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
48826 // For vXi16 cases we need to use pmovmskb and extract every other
48827 // sign bit.
48828 SDLoc DL(EFLAGS);
48829 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
48830 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
48831 MVT FloatVT =
48832 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
48833 Res = DAG.getBitcast(FloatVT, Res);
48834 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
48835 } else if (EltBits == 16) {
48836 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
48837 Res = DAG.getBitcast(MovmskVT, Res);
48838 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48839 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
48840 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
48841 } else {
48842 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
48843 }
48844 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
48845 DAG.getConstant(0, DL, MVT::i32));
48846 }
48847 }
48848 }
48849 }
48850
48851 // TESTZ(-1,X) == TESTZ(X,X)
48852 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
48853 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
48854
48855 // TESTZ(X,-1) == TESTZ(X,X)
48856 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
48857 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
48858
48859 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
48860 // TODO: Add COND_NE handling?
48861 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
48862 SDValue Src0 = peekThroughBitcasts(Op0);
48863 SDValue Src1 = peekThroughBitcasts(Op1);
48864 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
48865 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
48866 peekThroughBitcasts(Src0.getOperand(1)), true);
48867 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
48868 peekThroughBitcasts(Src1.getOperand(1)), true);
48869 if (Src0 && Src1) {
48870 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
48871 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
48872 DAG.getBitcast(OpVT2, Src0),
48873 DAG.getBitcast(OpVT2, Src1));
48874 }
48875 }
48876 }
48877 }
48878
48879 return SDValue();
48880}
48881
48882// Attempt to simplify the MOVMSK input based on the comparison type.
48883static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
48884 SelectionDAG &DAG,
48885 const X86Subtarget &Subtarget) {
48886 // Handle eq/ne against zero (any_of).
48887 // Handle eq/ne against -1 (all_of).
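// Illustrative mapping: for a 4-element MOVMSK result M, 'M != 0' is an
// any_of(signbit set) test and 'M == 0xF' is an all_of test; e.g.
// 'movmskps %xmm0, %eax; cmpl $15, %eax; je' checks that every lane has its
// sign bit set, which for a preceding compare means every lane was true.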
48888 if (!(CC == X86::COND_E || CC == X86::COND_NE))
48889 return SDValue();
48890 if (EFLAGS.getValueType() != MVT::i32)
48891 return SDValue();
48892 unsigned CmpOpcode = EFLAGS.getOpcode();
48893 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
48894 return SDValue();
48895 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
48896 if (!CmpConstant)
48897 return SDValue();
48898 const APInt &CmpVal = CmpConstant->getAPIntValue();
48899
48900 SDValue CmpOp = EFLAGS.getOperand(0);
48901 unsigned CmpBits = CmpOp.getValueSizeInBits();
48902 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
48903
48904 // Peek through any truncate.
48905 if (CmpOp.getOpcode() == ISD::TRUNCATE)
48906 CmpOp = CmpOp.getOperand(0);
48907
48908 // Bail if we don't find a MOVMSK.
48909 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
48910 return SDValue();
48911
48912 SDValue Vec = CmpOp.getOperand(0);
48913 MVT VecVT = Vec.getSimpleValueType();
48914 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
48915 "Unexpected MOVMSK operand");
48916 unsigned NumElts = VecVT.getVectorNumElements();
48917 unsigned NumEltBits = VecVT.getScalarSizeInBits();
48918
48919 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
48920 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
48921 NumElts <= CmpBits && CmpVal.isMask(NumElts);
48922 if (!IsAnyOf && !IsAllOf)
48923 return SDValue();
48924
48925 // TODO: Check more cases where this combine applies.
48926 // We use the number of uses of the CMP to decide whether to combine.
48927 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
48928 // patterns are known to benefit from this one-use constraint.
48929 bool IsOneUse = CmpOp.getNode()->hasOneUse();
48930
48931 // See if we can peek through to a vector with a wider element type, if the
48932 // signbits extend down to all the sub-elements as well.
48933 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
48934 // potential SimplifyDemandedBits/Elts cases.
48935 // If we looked through a truncate that discarded bits, we can't do this
48936 // transform.
48937 // FIXME: We could do this transform for truncates that discarded bits by
48938 // inserting an AND mask between the new MOVMSK and the CMP.
48939 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
48940 SDValue BC = peekThroughBitcasts(Vec);
48941 MVT BCVT = BC.getSimpleValueType();
48942 unsigned BCNumElts = BCVT.getVectorNumElements();
48943 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
48944 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
48945 BCNumEltBits > NumEltBits &&
48946 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
48947 SDLoc DL(EFLAGS);
48948 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
48949 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48950 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
48951 DAG.getConstant(CmpMask, DL, MVT::i32));
48952 }
48953 }
48954
48955 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
48956 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
48957 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
48958 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
48959 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
48960 SmallVector<SDValue> Ops;
48961 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
48962 Ops.size() == 2) {
48963 SDLoc DL(EFLAGS);
48964 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
48965 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
48966 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
48967 DAG.getBitcast(SubVT, Ops[0]),
48968 DAG.getBitcast(SubVT, Ops[1]));
48969 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
48970 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
48971 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
48972 DAG.getConstant(CmpMask, DL, MVT::i32));
48973 }
48974 }
48975
48976 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
48977 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
48978 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
48979 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
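// Illustrative reasoning: a lane of PCMPEQ(X,Y) is all-ones exactly when X ^ Y
// is zero in that lane, so an all_of test of the compare mask is the same as
// asking whether X ^ Y is entirely zero, i.e. PTESTZ(XOR(X,Y),XOR(X,Y)).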
48980 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
48981 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
48982 SDValue BC = peekThroughBitcasts(Vec);
48983 // Ensure MOVMSK was testing every signbit of BC.
48984 if (BC.getValueType().getVectorNumElements() <= NumElts) {
48985 if (BC.getOpcode() == X86ISD::PCMPEQ) {
48986 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
48987 BC.getOperand(0), BC.getOperand(1));
48988 V = DAG.getBitcast(TestVT, V);
48989 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
48990 }
48991 // Check for 256-bit split vector cases.
48992 if (BC.getOpcode() == ISD::AND &&
48993 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
48994 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
48995 SDValue LHS = BC.getOperand(0);
48996 SDValue RHS = BC.getOperand(1);
48997 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
48998 LHS.getOperand(0), LHS.getOperand(1));
48999 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
49000 RHS.getOperand(0), RHS.getOperand(1));
49001 LHS = DAG.getBitcast(TestVT, LHS);
49002 RHS = DAG.getBitcast(TestVT, RHS);
49003 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
49004 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49005 }
49006 }
49007 }
49008
49009 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
49010 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
49011 // sign bits prior to the comparison with zero unless we know that
49012 // the vXi16 splats the sign bit down to the lower i8 half.
49013 // TODO: Handle all_of patterns.
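// Illustrative detail: after bitcasting a v8i16 value to v16i8, each i16
// lane's sign bit sits in its high (odd-indexed, little-endian) byte, so
// PMOVMSKB bits 1,3,5,... carry the i16 sign bits; that is where the 0xAAAA
// masks below come from when the low bytes are not known to replicate the
// sign.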
49014 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
49015 SDValue VecOp0 = Vec.getOperand(0);
49016 SDValue VecOp1 = Vec.getOperand(1);
49017 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
49018 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
49019 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
49020 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
49021 SDLoc DL(EFLAGS);
49022 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
49023 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49024 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
49025 if (!SignExt0) {
49026 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
49027 DAG.getConstant(0xAAAA, DL, MVT::i16));
49028 }
49029 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49030 DAG.getConstant(0, DL, MVT::i16));
49031 }
49032 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
49033 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
49034 if (CmpBits >= 16 && Subtarget.hasInt256() &&
49035 (IsAnyOf || (SignExt0 && SignExt1))) {
49036 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
49037 SDLoc DL(EFLAGS);
49038 SDValue Result = peekThroughBitcasts(Src);
49039 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
49040 Result.getValueType().getVectorNumElements() <= NumElts) {
49041 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
49042 Result.getOperand(0), Result.getOperand(1));
49043 V = DAG.getBitcast(MVT::v4i64, V);
49044 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
49045 }
49046 Result = DAG.getBitcast(MVT::v32i8, Result);
49047 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49048 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
49049 if (!SignExt0 || !SignExt1) {
49050 assert(IsAnyOf &&
49051 "Only perform v16i16 signmasks for any_of patterns");
49052 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
49053 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
49054 }
49055 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
49056 DAG.getConstant(CmpMask, DL, MVT::i32));
49057 }
49058 }
49059 }
49060
49061 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
49062 // Since we peek through a bitcast, we need to be careful if the base vector
49063 // type has smaller elements than the MOVMSK type. In that case, even if
49064 // all the elements are demanded by the shuffle mask, only the "high"
49065 // elements which have highbits that align with highbits in the MOVMSK vec
49066 // elements are actually demanded. A simplification of spurious operations
49067 // on the "low" elements takes place during other simplifications.
49068 //
49069 // For example:
49070 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
49071 // demanded, the result can change because the shuffle swaps them around.
49072 //
49073 // To address this, we check that we can scale the shuffle mask to MOVMSK
49074 // element width (this will ensure "high" elements match). It's slightly overly
49075 // conservative, but fine for an edge case fold.
49076 SmallVector<int, 32> ShuffleMask;
49077 SmallVector<SDValue, 2> ShuffleInputs;
49078 if (NumElts <= CmpBits &&
49079 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
49080 ShuffleMask, DAG) &&
49081 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
49082 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
49083 canScaleShuffleElements(ShuffleMask, NumElts)) {
49084 SDLoc DL(EFLAGS);
49085 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
49086 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49087 Result =
49088 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
49089 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
49090 }
49091
49092 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
49093 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
49094 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
49095 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
49096 // iff every element is referenced.
49097 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
49098 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
49099 (NumEltBits == 32 || NumEltBits == 64)) {
49100 SDLoc DL(EFLAGS);
49101 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
49102 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
49103 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
49104 SDValue LHS = Vec;
49105 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
49106 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
49107 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
49108 DAG.getBitcast(FloatVT, LHS),
49109 DAG.getBitcast(FloatVT, RHS));
49110 }
49111
49112 return SDValue();
49113}
49114
49115/// Optimize an EFLAGS definition used according to the condition code \p CC
49116/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
49117/// uses of chain values.
49118static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
49119 SelectionDAG &DAG,
49120 const X86Subtarget &Subtarget) {
49121 if (CC == X86::COND_B)
49122 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
49123 return Flags;
49124
49125 if (SDValue R = checkSignTestSetCCCombine(EFLAGS, CC, DAG))
49126 return R;
49127
49128 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
49129 return R;
49130
49131 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
49132 return R;
49133
49134 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
49135 return R;
49136
49137 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
49138}
49139
49140/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
49141static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
49142 TargetLowering::DAGCombinerInfo &DCI,
49143 const X86Subtarget &Subtarget) {
49144 SDLoc DL(N);
49145 EVT VT = N->getValueType(0);
49146 SDValue FalseOp = N->getOperand(0);
49147 SDValue TrueOp = N->getOperand(1);
49148 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
49149 SDValue Cond = N->getOperand(3);
49150
49151 // cmov X, X, ?, ? --> X
49152 if (TrueOp == FalseOp)
49153 return TrueOp;
49154
49155 // Try to simplify the EFLAGS and condition code operands.
49156 // We can't always do this as FCMOV only supports a subset of X86 cond.
49157 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
49158 if (!(FalseOp.getValueType() == MVT::f80 ||
49159 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
49160 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
49161 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
49162 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
49163 Flags};
49164 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49165 }
49166 }
49167
49168 // If this is a select between two integer constants, try to do some
49169 // optimizations. Note that the operands are ordered the opposite of SELECT
49170 // operands.
49171 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
49172 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
49173 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
49174 // larger than FalseC (the false value).
49175 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
49176 CC = X86::GetOppositeBranchCondition(CC);
49177 std::swap(TrueC, FalseC);
49178 std::swap(TrueOp, FalseOp);
49179 }
49180
49181 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
49182 // This is efficient for any integer data type (including i8/i16) and
49183 // shift amount.
49184 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
49185 Cond = getSETCC(CC, Cond, DL, DAG);
49186
49187 // Zero extend the condition if needed.
49188 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
49189
49190 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
49191 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
49192 DAG.getConstant(ShAmt, DL, MVT::i8));
49193 return Cond;
49194 }
49195
49196 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
49197 // for any integer data type, including i8/i16.
49198 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
49199 Cond = getSETCC(CC, Cond, DL, DAG);
49200
49201 // Zero extend the condition if needed.
49202 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
49203 FalseC->getValueType(0), Cond);
49204 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49205 SDValue(FalseC, 0));
49206 return Cond;
49207 }
49208
49209 // Optimize cases that will turn into an LEA instruction. This requires
49210 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
49211 if (VT == MVT::i32 || VT == MVT::i64) {
49212 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
49213 assert(Diff.getBitWidth() == VT.getSizeInBits() &&
49214 "Implicit constant truncation");
49215
49216 bool isFastMultiplier = false;
49217 if (Diff.ult(10)) {
49218 switch (Diff.getZExtValue()) {
49219 default: break;
49220 case 1: // result = add base, cond
49221 case 2: // result = lea base( , cond*2)
49222 case 3: // result = lea base(cond, cond*2)
49223 case 4: // result = lea base( , cond*4)
49224 case 5: // result = lea base(cond, cond*4)
49225 case 8: // result = lea base( , cond*8)
49226 case 9: // result = lea base(cond, cond*8)
49227 isFastMultiplier = true;
49228 break;
49229 }
49230 }
49231
49232 if (isFastMultiplier) {
49233 Cond = getSETCC(CC, Cond, DL ,DAG);
49234 // Zero extend the condition if needed.
49235 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
49236 Cond);
49237 // Scale the condition by the difference.
49238 if (Diff != 1)
49239 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
49240 DAG.getConstant(Diff, DL, Cond.getValueType()));
49241
49242 // Add the base if non-zero.
49243 if (FalseC->getAPIntValue() != 0)
49244 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
49245 SDValue(FalseC, 0));
49246 return Cond;
49247 }
49248 }
49249 }
49250 }
49251
49252 // Handle these cases:
49253 // (select (x != c), e, c) -> (select (x != c), e, x),
49254 // (select (x == c), c, e) -> (select (x == c), x, e)
49255 // where c is an integer constant, and the "select" is the combination
49256 // of CMOV and CMP.
49257 //
49258 // The rationale for this change is that a conditional move from a constant
49259 // needs two instructions, whereas a conditional move from a register needs
49260 // only one instruction.
49261 //
49262 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
49263 // some instruction-combining opportunities. This opt needs to be
49264 // postponed as late as possible.
49265 //
49266 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
49267 // the DCI.xxxx conditions are provided to postpone the optimization as
49268 // late as possible.
49269
49270 ConstantSDNode *CmpAgainst = nullptr;
49271 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
49272 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
49273 !isa<ConstantSDNode>(Cond.getOperand(0))) {
49274
49275 if (CC == X86::COND_NE &&
49276 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
49277 CC = X86::COND_E;
49278 std::swap(TrueOp, FalseOp);
49279 }
49280
49281 if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
49282 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
49283 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
49284 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49285 }
49286 }
49287 }
49288
49289 // Transform:
49290 //
49291 // (cmov 1 T (uge T 2))
49292 //
49293 // to:
49294 //
49295 // (adc T 0 (sub T 1))
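// Illustrative check of the transform above (added): the CMOV yields T when
// T >= 2 (unsigned) and 1 otherwise. SUB T, 1 sets CF only when T == 0, so
// ADC T, 0 computes T + CF, i.e. 0 -> 1, 1 -> 1, and T -> T for T >= 2,
// matching the original select.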
49296 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
49297 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
49298 SDValue Cond0 = Cond.getOperand(0);
49299 if (Cond0.getOpcode() == ISD::TRUNCATE)
49300 Cond0 = Cond0.getOperand(0);
49301 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
49302 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
49303 EVT CondVT = Cond->getValueType(0);
49304 // Subtract 1 and generate a carry.
49305 SDValue NewSub =
49306 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
49307 DAG.getConstant(1, DL, CondVT));
49308 SDValue EFLAGS(NewSub.getNode(), 1);
49309 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), TrueOp,
49310 DAG.getConstant(0, DL, VT), EFLAGS);
49311 }
49312 }
49313
49314 // Fold and/or of setcc's to double CMOV:
49315 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
49316 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
49317 //
49318 // This combine lets us generate:
49319 // cmovcc1 (jcc1 if we don't have CMOV)
49320 // cmovcc2 (same)
49321 // instead of:
49322 // setcc1
49323 // setcc2
49324 // and/or
49325 // cmovne (jne if we don't have CMOV)
49326 // When we can't use the CMOV instruction, it might increase branch
49327 // mispredicts.
49328 // When we can use CMOV, or when there is no mispredict, this improves
49329 // throughput and reduces register pressure.
49330 //
49331 if (CC == X86::COND_NE) {
49332 SDValue Flags;
49333 X86::CondCode CC0, CC1;
49334 bool isAndSetCC;
49335 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
49336 if (isAndSetCC) {
49337 std::swap(FalseOp, TrueOp);
49338 CC0 = X86::GetOppositeBranchCondition(CC0);
49339 CC1 = X86::GetOppositeBranchCondition(CC1);
49340 }
49341
49342 SDValue LOps[] = {FalseOp, TrueOp,
49343 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
49344 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, VT, LOps);
49345 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
49346 Flags};
49347 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
49348 return CMOV;
49349 }
49350 }
49351
49352 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
49353 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
49354 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
49355 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
49356 // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
49357 // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
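// Worked example (added, illustrative): with C1 = 32 and C2 = 1,
//   select(X != 0, cttz(X) + 1, 32)
// becomes
//   select(X != 0, cttz(X), 32 - 1) + 1,
// so the constant addend is applied once outside the CMOV and the CMOV itself
// only selects between CTTZ and a plain constant.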
49358 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
49359 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
49360 SDValue Add = TrueOp;
49361 SDValue Const = FalseOp;
49362 // Canonicalize the condition code for easier matching and output.
49363 if (CC == X86::COND_E)
49364 std::swap(Add, Const);
49365
49366 // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
49367 if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
49368 Add.getResNo() == 0 && Add.hasOneUse() &&
49369 Add.getOperand(1) == Cond.getOperand(0)) {
49370 return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
49371 Add.getOperand(1));
49372 }
49373
49374 // We might have replaced the constant in the cmov with the LHS of the
49375 // compare. If so change it to the RHS of the compare.
49376 if (Const == Cond.getOperand(0))
49377 Const = Cond.getOperand(1);
49378
49379 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
49380 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
49381 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
49382 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
49383 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
49384 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
49385 // This should constant fold.
49386 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
49387 SDValue CMov =
49388 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
49389 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
49390 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
49391 }
49392 }
49393
49394 return SDValue();
49395}
49396
49397/// Different mul shrinking modes.
49398 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
49399 
49400 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
49401 EVT VT = N->getOperand(0).getValueType();
49402 if (VT.getScalarSizeInBits() != 32)
49403 return false;
49404
49405 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
49406 unsigned SignBits[2] = {1, 1};
49407 bool IsPositive[2] = {false, false};
49408 for (unsigned i = 0; i < 2; i++) {
49409 SDValue Opd = N->getOperand(i);
49410
49411 SignBits[i] = DAG.ComputeNumSignBits(Opd);
49412 IsPositive[i] = DAG.SignBitIsZero(Opd);
49413 }
49414
49415 bool AllPositive = IsPositive[0] && IsPositive[1];
49416 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
49417 // When ranges are from -128 ~ 127, use MULS8 mode.
49418 if (MinSignBits >= 25)
49419 Mode = ShrinkMode::MULS8;
49420 // When ranges are from 0 ~ 255, use MULU8 mode.
49421 else if (AllPositive && MinSignBits >= 24)
49422 Mode = ShrinkMode::MULU8;
49423 // When ranges are from -32768 ~ 32767, use MULS16 mode.
49424 else if (MinSignBits >= 17)
49425 Mode = ShrinkMode::MULS16;
49426 // When ranges are from 0 ~ 65535, use MULU16 mode.
49427 else if (AllPositive && MinSignBits >= 16)
49428 Mode = ShrinkMode::MULU16;
49429 else
49430 return false;
49431 return true;
49432}
49433
49434/// When the operands of vector mul are extended from smaller size values,
49435 /// like i8 and i16, the type of mul may be shrunk to generate more
49436/// efficient code. Two typical patterns are handled:
49437/// Pattern1:
49438/// %2 = sext/zext <N x i8> %1 to <N x i32>
49439/// %4 = sext/zext <N x i8> %3 to <N x i32>
49440 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49441/// %5 = mul <N x i32> %2, %4
49442///
49443/// Pattern2:
49444/// %2 = zext/sext <N x i16> %1 to <N x i32>
49445/// %4 = zext/sext <N x i16> %3 to <N x i32>
49446/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
49447/// %5 = mul <N x i32> %2, %4
49448///
49449/// There are four mul shrinking modes:
49450/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
49451/// -128 to 128, and the scalar value range of %4 is also -128 to 128,
49452/// generate pmullw+sext32 for it (MULS8 mode).
49453/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
49454/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
49455/// generate pmullw+zext32 for it (MULU8 mode).
49456/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
49457/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
49458/// generate pmullw+pmulhw for it (MULS16 mode).
49459/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
49460/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
49461/// generate pmullw+pmulhuw for it (MULU16 mode).
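///
/// Illustrative example (added): if %a and %b are <8 x i8> values and
///   %2 = zext <8 x i8> %a to <8 x i32>
///   %4 = zext <8 x i8> %b to <8 x i32>
///   %5 = mul <8 x i32> %2, %4
/// then both operands lie in 0..255, the MULU8 criteria above hold, and the
/// multiply is shrunk to an <8 x i16> pmullw (255 * 255 < 65536) followed by a
/// zero-extension back to <8 x i32>.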
49462 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49463 const X86Subtarget &Subtarget) {
49464 // Check for legality
49465 // pmullw/pmulhw are not available before SSE2.
49466 if (!Subtarget.hasSSE2())
49467 return SDValue();
49468
49469 // Check for profitability
49470 // pmulld is supported since SSE41. It is better to use pmulld
49471 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
49472 // the expansion.
49473 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
49474 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
49475 return SDValue();
49476
49477 ShrinkMode Mode;
49478 if (!canReduceVMulWidth(N, DAG, Mode))
49479 return SDValue();
49480
49481 SDValue N0 = N->getOperand(0);
49482 SDValue N1 = N->getOperand(1);
49483 EVT VT = N->getOperand(0).getValueType();
49484 unsigned NumElts = VT.getVectorNumElements();
49485 if ((NumElts % 2) != 0)
49486 return SDValue();
49487
49488 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
49489
49490 // Shrink the operands of mul.
49491 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
49492 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
49493
49494 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
49495 // lower part is needed.
49496 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
49497 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
49498 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
49499 : ISD::SIGN_EXTEND,
49500 DL, VT, MulLo);
49501
49502 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
49503 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
49504 // the higher part is also needed.
49505 SDValue MulHi =
49506 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
49507 ReducedVT, NewN0, NewN1);
49508
49509 // Repack the lower part and higher part result of mul into a wider
49510 // result.
49511 // Generate shuffle functioning as punpcklwd.
49512 SmallVector<int, 16> ShuffleMask(NumElts);
49513 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49514 ShuffleMask[2 * i] = i;
49515 ShuffleMask[2 * i + 1] = i + NumElts;
49516 }
49517 SDValue ResLo =
49518 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49519 ResLo = DAG.getBitcast(ResVT, ResLo);
49520 // Generate shuffle functioning as punpckhwd.
49521 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
49522 ShuffleMask[2 * i] = i + NumElts / 2;
49523 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
49524 }
49525 SDValue ResHi =
49526 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
49527 ResHi = DAG.getBitcast(ResVT, ResHi);
49528 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
49529}
49530
49531 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
49532 EVT VT, const SDLoc &DL) {
49533
49534 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
49535 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49536 DAG.getConstant(Mult, DL, VT));
49537 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
49538 DAG.getConstant(Shift, DL, MVT::i8));
49539 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49540 N->getOperand(0));
49541 return Result;
49542 };
49543
49544 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
49545 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49546 DAG.getConstant(Mul1, DL, VT));
49547 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
49548 DAG.getConstant(Mul2, DL, VT));
49549 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
49550 N->getOperand(0));
49551 return Result;
49552 };
49553
49554 switch (MulAmt) {
49555 default:
49556 break;
49557 case 11:
49558 // mul x, 11 => add ((shl (mul x, 5), 1), x)
49559 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
49560 case 21:
49561 // mul x, 21 => add ((shl (mul x, 5), 2), x)
49562 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
49563 case 41:
49564 // mul x, 41 => add ((shl (mul x, 5), 3), x)
49565 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
49566 case 22:
49567 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
49568 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49569 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
49570 case 19:
49571 // mul x, 19 => add ((shl (mul x, 9), 1), x)
49572 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
49573 case 37:
49574 // mul x, 37 => add ((shl (mul x, 9), 2), x)
49575 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
49576 case 73:
49577 // mul x, 73 => add ((shl (mul x, 9), 3), x)
49578 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
49579 case 13:
49580 // mul x, 13 => add ((shl (mul x, 3), 2), x)
49581 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
49582 case 23:
49583 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
49584 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
49585 case 26:
49586 // mul x, 26 => add ((mul (mul x, 5), 5), x)
49587 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
49588 case 28:
49589 // mul x, 28 => add ((mul (mul x, 9), 3), x)
49590 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
49591 case 29:
49592 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
49593 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
49594 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
49595 }
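  // Illustrative arithmetic for the cases above (added): 11 = 5*2 + 1,
  // 22 = (5*4 + 1) + 1, 23 = 3*8 - 1, 26 = 5*5 + 1 and 29 = (9*3 + 1) + 1,
  // so every handled amount decomposes into LEA-friendly factors (3, 5, 9),
  // shifts, and one or two ADD/SUBs.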
49596
49597 // Another trick. If this is a power of 2 plus 2, 4 or 8, we can use a shift
49598 // followed by a single LEA.
49599 // First check if this is a sum of two powers of 2 because that's easy. Then
49600 // count how many zeros there are up to the first set bit.
49601 // TODO: We can do this even without LEA at a cost of two shifts and an add.
49602 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
49603 unsigned ScaleShift = llvm::countr_zero(MulAmt);
49604 if (ScaleShift >= 1 && ScaleShift < 4) {
49605 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
49606 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49607 DAG.getConstant(ShiftAmt, DL, MVT::i8));
49608 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49609 DAG.getConstant(ScaleShift, DL, MVT::i8));
49610 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
49611 }
49612 }
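  // Worked example (added, illustrative): a multiply amount of 34 = 32 + 2
  // that reaches this point passes the check above (34 & 33 == 32 is a power
  // of two), giving ScaleShift = 1 and ShiftAmt = 5, so the multiply becomes
  // (x << 5) + (x << 1) = 34*x, i.e. one shift plus one LEA-style add.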
49613
49614 return SDValue();
49615}
49616
49617 // If the upper 17 bits of either operand's elements are zero and the other
49618 // operand's elements are all zero/sign bits, then we can use PMADDWD, which
49619 // is always at least as quick as PMULLD, except on KNL.
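// Illustrative example (added): per i32 lane, PMADDWD computes
// lo16(a)*lo16(b) + hi16(a)*hi16(b). If at least one operand has its high i16
// halves zero and both operands fit in a signed i16, the second term vanishes
// and the remaining 16x16->32 product equals the full 32-bit multiply.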
49620 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
49621 SelectionDAG &DAG,
49622 const X86Subtarget &Subtarget) {
49623 if (!Subtarget.hasSSE2())
49624 return SDValue();
49625
49626 if (Subtarget.isPMADDWDSlow())
49627 return SDValue();
49628
49629 EVT VT = N->getValueType(0);
49630
49631 // Only support vXi32 vectors.
49632 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
49633 return SDValue();
49634
49635 // Make sure the type is legal or can split/widen to a legal type.
49636 // With AVX512 but without BWI, we would need to split v32i16.
49637 unsigned NumElts = VT.getVectorNumElements();
49638 if (NumElts == 1 || !isPowerOf2_32(NumElts))
49639 return SDValue();
49640
49641 // With AVX512 but without BWI, we would need to split v32i16.
49642 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
49643 return SDValue();
49644
49645 SDValue N0 = N->getOperand(0);
49646 SDValue N1 = N->getOperand(1);
49647
49648 // If we are zero/sign extending two steps without SSE4.1, it's better to
49649 // reduce the vmul width instead.
49650 if (!Subtarget.hasSSE41() &&
49651 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
49652 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49653 (N1.getOpcode() == ISD::ZERO_EXTEND &&
49654 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
49655 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
49656 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
49657 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49658 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
49659 return SDValue();
49660
49661 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
49662 // the vmul width instead.
49663 if (!Subtarget.hasSSE41() &&
49664 (N0.getOpcode() == ISD::SIGN_EXTEND &&
49665 N0.getOperand(0).getValueSizeInBits() > 128) &&
49666 (N1.getOpcode() == ISD::SIGN_EXTEND &&
49667 N1.getOperand(0).getValueSizeInBits() > 128))
49668 return SDValue();
49669
49670 // Sign bits must extend down to the lowest i16.
49671 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
49672 DAG.ComputeMaxSignificantBits(N0) > 16)
49673 return SDValue();
49674
49675 // At least one of the elements must be zero in the upper 17 bits, or can be
49676 // safely made zero without altering the final result.
49677 auto GetZeroableOp = [&](SDValue Op) {
49678 APInt Mask17 = APInt::getHighBitsSet(32, 17);
49679 if (DAG.MaskedValueIsZero(Op, Mask17))
49680 return Op;
49681 // Mask off upper 16-bits of sign-extended constants.
49682 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
49683 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
49684 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
49685 SDValue Src = Op.getOperand(0);
49686 // Convert sext(vXi16) to zext(vXi16).
49687 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
49688 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49689 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
49690 // which will expand the extension.
49691 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
49692 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
49693 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
49694 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
49695 }
49696 }
49697 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
49698 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
49699 N->isOnlyUserOf(Op.getNode())) {
49700 SDValue Src = Op.getOperand(0);
49701 if (Src.getScalarValueSizeInBits() == 16)
49702 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
49703 }
49704 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
49705 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
49706 N->isOnlyUserOf(Op.getNode())) {
49707 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
49708 Op.getOperand(1));
49709 }
49710 return SDValue();
49711 };
49712 SDValue ZeroN0 = GetZeroableOp(N0);
49713 SDValue ZeroN1 = GetZeroableOp(N1);
49714 if (!ZeroN0 && !ZeroN1)
49715 return SDValue();
49716 N0 = ZeroN0 ? ZeroN0 : N0;
49717 N1 = ZeroN1 ? ZeroN1 : N1;
49718
49719 // Use SplitOpsAndApply to handle AVX splitting.
49720 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49721 ArrayRef<SDValue> Ops) {
49722 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
49723 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
49724 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
49725 DAG.getBitcast(OpVT, Ops[0]),
49726 DAG.getBitcast(OpVT, Ops[1]));
49727 };
49728 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
49729}
49730
49731 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
49732 const X86Subtarget &Subtarget) {
49733 if (!Subtarget.hasSSE2())
49734 return SDValue();
49735
49736 EVT VT = N->getValueType(0);
49737
49738 // Only support vXi64 vectors.
49739 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
49740 VT.getVectorNumElements() < 2 ||
49741 !DAG.getTargetLoweringInfo().isTypeLegal(VT))
49742 return SDValue();
49743
49744 SDValue N0 = N->getOperand(0);
49745 SDValue N1 = N->getOperand(1);
49746
49747 // MULDQ returns the 64-bit result of the signed multiplication of the lower
49748 // 32-bits. We can lower with this if the sign bits stretch that far.
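  // Illustrative example (added): if both v2i64 operands are sign-extensions
  // from i32, each 64-bit value is fully determined by its low 32 bits, so
  // the signed 32x32->64 multiply performed by PMULDQ reproduces the exact
  // 64-bit product.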
49749 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
49750 DAG.ComputeNumSignBits(N1) > 32) {
49751 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49752 ArrayRef<SDValue> Ops) {
49753 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
49754 };
49755 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
49756 /*CheckBWI*/ false);
49757 }
49758
49759 // If the upper bits are zero we can use a single pmuludq.
49760 APInt Mask = APInt::getHighBitsSet(64, 32);
49761 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
49762 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
49763 ArrayRef<SDValue> Ops) {
49764 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
49765 };
49766 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
49767 /*CheckBWI*/ false);
49768 }
49769
49770 return SDValue();
49771}
49772
49773 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
49774 TargetLowering::DAGCombinerInfo &DCI,
49775 const X86Subtarget &Subtarget) {
49776 EVT VT = N->getValueType(0);
49777 SDLoc DL(N);
49778
49779 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
49780 return V;
49781
49782 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
49783 return V;
49784
49785 if (DCI.isBeforeLegalize() && VT.isVector())
49786 return reduceVMULWidth(N, DL, DAG, Subtarget);
49787
49788 if (VT != MVT::i64 && VT != MVT::i32 &&
49789 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
49790 return SDValue();
49791
49792 KnownBits Known1 = DAG.computeKnownBits(N->getOperand(1));
49793 if (!Known1.isConstant())
49794 return SDValue();
49795
49796 const APInt &C = Known1.getConstant();
49797 if (C.isZero())
49798 return DAG.getConstant(0, DL, VT);
49799
49800 if (C.isAllOnes())
49801 return DAG.getNegative(N->getOperand(0), DL, VT);
49802
49803 if (isPowerOf2_64(C.getZExtValue()))
49804 return SDValue();
49805
49806 // Optimize a single multiply with constant into two operations in order to
49807 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
49808 if (!MulConstantOptimization)
49809 return SDValue();
49810
49811 // An imul is usually smaller than the alternative sequence.
49812 if (DAG.getMachineFunction().getFunction().hasMinSize())
49813 return SDValue();
49814
49815 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
49816 return SDValue();
49817
49818 int64_t SignMulAmt = C.getSExtValue();
49819 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
49820 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
49821
49822 SDValue NewMul = SDValue();
49823 if (VT == MVT::i64 || VT == MVT::i32) {
49824 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
49825 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49826 DAG.getConstant(AbsMulAmt, DL, VT));
49827 if (SignMulAmt < 0)
49828 NewMul = DAG.getNegative(NewMul, DL, VT);
49829
49830 return NewMul;
49831 }
49832
49833 uint64_t MulAmt1 = 0;
49834 uint64_t MulAmt2 = 0;
49835 if ((AbsMulAmt % 9) == 0) {
49836 MulAmt1 = 9;
49837 MulAmt2 = AbsMulAmt / 9;
49838 } else if ((AbsMulAmt % 5) == 0) {
49839 MulAmt1 = 5;
49840 MulAmt2 = AbsMulAmt / 5;
49841 } else if ((AbsMulAmt % 3) == 0) {
49842 MulAmt1 = 3;
49843 MulAmt2 = AbsMulAmt / 3;
49844 }
49845
49846 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
49847 if (MulAmt2 &&
49848 (isPowerOf2_64(MulAmt2) ||
49849 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
49850
49851 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
49852 N->user_begin()->getOpcode() == ISD::ADD))
49853 // If the second multiplier is pow2, issue it first. We want the multiply
49854 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
49855 // use is an add. Only do this for positive multiply amounts since the
49856 // negate would prevent it from being used as an address mode anyway.
49857 std::swap(MulAmt1, MulAmt2);
49858
49859 if (isPowerOf2_64(MulAmt1))
49860 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49861 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
49862 else
49863 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
49864 DAG.getConstant(MulAmt1, DL, VT));
49865
49866 if (isPowerOf2_64(MulAmt2))
49867 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
49868 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
49869 else
49870 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
49871 DAG.getConstant(MulAmt2, DL, VT));
49872
49873 // Negate the result.
49874 if (SignMulAmt < 0)
49875 NewMul = DAG.getNegative(NewMul, DL, VT);
49876 } else if (!Subtarget.slowLEA())
49877 NewMul = combineMulSpecial(C.getZExtValue(), N, DAG, VT, DL);
49878 }
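  // Worked example for the factor split above (added, illustrative): C = 40
  // gives MulAmt1 = 5 and MulAmt2 = 8; since 8 is a power of two (and the
  // lone-ADD-user exception does not apply) it is issued first, producing
  // (x << 3) followed by an LEA-style multiply by 5, i.e. 8*5*x = 40*x.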
49879 if (!NewMul) {
49880 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
49881 if (isPowerOf2_64(AbsMulAmt - 1)) {
49882 // (mul x, 2^N + 1) => (add (shl x, N), x)
49883 NewMul = DAG.getNode(
49884 ISD::ADD, DL, VT, N->getOperand(0),
49885 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49886 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
49887 if (SignMulAmt < 0)
49888 NewMul = DAG.getNegative(NewMul, DL, VT);
49889 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
49890 // (mul x, 2^N - 1) => (sub (shl x, N), x)
49891 NewMul =
49892 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49893 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
49894 // To negate, reverse the operands of the subtract.
49895 if (SignMulAmt < 0)
49896 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
49897 else
49898 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
49899 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
49900 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49901 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
49902 NewMul =
49903 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49904 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
49905 NewMul = DAG.getNode(
49906 ISD::ADD, DL, VT, NewMul,
49907 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49908 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
49909 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
49910 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
49911 NewMul =
49912 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49913 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
49914 NewMul = DAG.getNode(
49915 ISD::SUB, DL, VT, NewMul,
49916 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
49917 } else if (SignMulAmt >= 0 && VT.isVector() &&
49918 Subtarget.fastImmVectorShift()) {
49919 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
49920 uint64_t ShiftAmt1;
49921 std::optional<unsigned> Opc;
49922 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
49923 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
49924 Opc = ISD::ADD;
49925 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
49926 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
49927 Opc = ISD::SUB;
49928 }
49929
49930 if (Opc) {
49931 SDValue Shift1 =
49932 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49933 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
49934 SDValue Shift2 =
49935 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
49936 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
49937 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
49938 }
49939 }
49940 }
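  // Worked examples for the 2^N +/- 1 paths above (added, illustrative):
  // C = 17 becomes (x << 4) + x, while C = -15 takes the 2^N - 1 path with
  // AbsMulAmt = 15 and yields x - (x << 4) = -15*x by reversing the SUB
  // operands instead of emitting a separate negate.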
49941
49942 return NewMul;
49943}
49944
49945// Try to form a MULHU or MULHS node by looking for
49946// (srl (mul ext, ext), 16)
49947// TODO: This is X86 specific because we want to be able to handle wide types
49948// before type legalization. But we can only do it if the vector will be
49949// legalized via widening/splitting. Type legalization can't handle promotion
49950// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
49951// combiner.
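// Illustrative example (added): for
//   (srl (mul (zext v8i16 X to v8i32), (zext v8i16 Y to v8i32)), 16)
// the shift keeps only the upper 16 bits of each 32-bit product, which is
// exactly MULHU(X, Y), so the whole expression becomes (zext (mulhu X, Y)).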
49952 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
49953 const SDLoc &DL,
49954 const X86Subtarget &Subtarget) {
49955 using namespace SDPatternMatch;
49956 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
49957 "SRL or SRA node is required here!");
49958
49959 if (!Subtarget.hasSSE2())
49960 return SDValue();
49961
49962 // Input type should be at least vXi32.
49963 EVT VT = N->getValueType(0);
49964 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
49965 return SDValue();
49966
49967 // The operation must be a multiply shifted right by 16.
49968 SDValue LHS, RHS;
49969 if (!sd_match(N->getOperand(1), m_SpecificInt(16)) ||
49970 !sd_match(N->getOperand(0), m_OneUse(m_Mul(m_Value(LHS), m_Value(RHS)))))
49971 return SDValue();
49972
49973 unsigned ExtOpc = LHS.getOpcode();
49974 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
49975 RHS.getOpcode() != ExtOpc)
49976 return SDValue();
49977
49978 // Peek through the extends.
49979 LHS = LHS.getOperand(0);
49980 RHS = RHS.getOperand(0);
49981
49982 // Ensure the input types match.
49983 EVT MulVT = LHS.getValueType();
49984 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
49985 return SDValue();
49986
49987 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
49988 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
49989
49990 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
49991 return DAG.getNode(ExtOpc, DL, VT, Mulh);
49992}
49993
49994 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
49995 const X86Subtarget &Subtarget) {
49996 using namespace llvm::SDPatternMatch;
49997 SDValue N0 = N->getOperand(0);
49998 SDValue N1 = N->getOperand(1);
49999 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
50000 EVT VT = N0.getValueType();
50001 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50002 SDLoc DL(N);
50003
50004 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50005 // with out-of-bounds clamping.
50006 if (N0.getOpcode() == ISD::VSELECT &&
50007 supportedVectorVarShift(VT, Subtarget, ISD::SHL)) {
50008 SDValue Cond = N0.getOperand(0);
50009 SDValue N00 = N0.getOperand(1);
50010 SDValue N01 = N0.getOperand(2);
50011 // fold shl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psllv(x,amt)
50012 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50013 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50014 m_SpecificCondCode(ISD::SETULT)))) {
50015 return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
50016 }
50017 // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
50018 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50019 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50020 m_SpecificCondCode(ISD::SETUGE)))) {
50021 return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
50022 }
50023 }
50024
50025 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
50026 // since the result of setcc_c is all zero's or all ones.
50027 if (VT.isInteger() && !VT.isVector() &&
50028 N1C && N0.getOpcode() == ISD::AND &&
50029 N0.getOperand(1).getOpcode() == ISD::Constant) {
50030 SDValue N00 = N0.getOperand(0);
50031 APInt Mask = N0.getConstantOperandAPInt(1);
50032 Mask <<= N1C->getAPIntValue();
50033 bool MaskOK = false;
50034 // We can handle cases concerning bit-widening nodes containing setcc_c if
50035 // we carefully interrogate the mask to make sure we are semantics
50036 // preserving.
50037 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
50038 // of the underlying setcc_c operation if the setcc_c was zero extended.
50039 // Consider the following example:
50040 // zext(setcc_c) -> i32 0x0000FFFF
50041 // c1 -> i32 0x0000FFFF
50042 // c2 -> i32 0x00000001
50043 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
50044 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
50045 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
50046 MaskOK = true;
50047 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
50048 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50049 MaskOK = true;
50050 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
50051 N00.getOpcode() == ISD::ANY_EXTEND) &&
50052 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
50053 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
50054 }
50055 if (MaskOK && Mask != 0)
50056 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
50057 }
50058
50059 return SDValue();
50060}
50061
50062 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
50063 const X86Subtarget &Subtarget) {
50064 using namespace llvm::SDPatternMatch;
50065 SDValue N0 = N->getOperand(0);
50066 SDValue N1 = N->getOperand(1);
50067 EVT VT = N0.getValueType();
50068 unsigned Size = VT.getSizeInBits();
50069 SDLoc DL(N);
50070
50071 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50072 return V;
50073
50074 // fold sra(x,umin(amt,bw-1)) -> avx2 psrav(x,amt)
50075 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA)) {
50076 SDValue ShrAmtVal;
50077 if (sd_match(N1, m_UMin(m_Value(ShrAmtVal),
50078 m_SpecificInt(VT.getScalarSizeInBits() - 1))))
50079 return DAG.getNode(X86ISD::VSRAV, DL, VT, N0, ShrAmtVal);
50080 }
50081
50082 // fold (SRA (SHL X, ShlConst), SraConst)
50083 // into (SHL (sext_in_reg X), ShlConst - SraConst)
50084 // or (sext_in_reg X)
50085 // or (SRA (sext_in_reg X), SraConst - ShlConst)
50086 // depending on relation between SraConst and ShlConst.
50087 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
50088 // us to do the sext_in_reg from corresponding bit.
50089
50090 // sexts in X86 are MOVs. The MOVs have the same code size
50091 // as above SHIFTs (only SHIFT on 1 has lower code size).
50092 // However the MOVs have 2 advantages to a SHIFT:
50093 // 1. MOVs can write to a register that differs from source
50094 // 2. MOVs accept memory operands
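  // Worked example (added, illustrative): with Size = 32, ShlConst = 24 and
  // SraConst = 26, (sra (shl X, 24), 26) first sign-extends the low 8 bits of
  // X (sext_in_reg i8) and then shifts that right by SraConst - ShlConst = 2.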
50095
50096 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
50097 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
50098 N0.getOperand(1).getOpcode() != ISD::Constant)
50099 return SDValue();
50100
50101 SDValue N00 = N0.getOperand(0);
50102 SDValue N01 = N0.getOperand(1);
50103 APInt ShlConst = N01->getAsAPIntVal();
50104 APInt SraConst = N1->getAsAPIntVal();
50105 EVT CVT = N1.getValueType();
50106
50107 if (CVT != N01.getValueType())
50108 return SDValue();
50109 if (SraConst.isNegative())
50110 return SDValue();
50111
50112 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
50113 unsigned ShiftSize = SVT.getSizeInBits();
50114 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
50115 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
50116 continue;
50117 SDValue NN =
50118 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
50119 if (SraConst.eq(ShlConst))
50120 return NN;
50121 if (SraConst.ult(ShlConst))
50122 return DAG.getNode(ISD::SHL, DL, VT, NN,
50123 DAG.getConstant(ShlConst - SraConst, DL, CVT));
50124 return DAG.getNode(ISD::SRA, DL, VT, NN,
50125 DAG.getConstant(SraConst - ShlConst, DL, CVT));
50126 }
50127 return SDValue();
50128}
50129
50130 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
50131 TargetLowering::DAGCombinerInfo &DCI,
50132 const X86Subtarget &Subtarget) {
50133 using namespace llvm::SDPatternMatch;
50134 SDValue N0 = N->getOperand(0);
50135 SDValue N1 = N->getOperand(1);
50136 EVT VT = N0.getValueType();
50137 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50138 SDLoc DL(N);
50139
50140 if (SDValue V = combineShiftToPMULH(N, DAG, DL, Subtarget))
50141 return V;
50142
50143 // Exploits AVX2 VSHLV/VSRLV instructions for efficient unsigned vector shifts
50144 // with out-of-bounds clamping.
50145 if (N0.getOpcode() == ISD::VSELECT &&
50146 supportedVectorVarShift(VT, Subtarget, ISD::SRL)) {
50147 SDValue Cond = N0.getOperand(0);
50148 SDValue N00 = N0.getOperand(1);
50149 SDValue N01 = N0.getOperand(2);
50150 // fold srl(select(icmp_ult(amt,BW),x,0),amt) -> avx2 psrlv(x,amt)
50151 if (ISD::isConstantSplatVectorAllZeros(N01.getNode()) &&
50152 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50153 m_SpecificCondCode(ISD::SETULT)))) {
50154 return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
50155 }
50156 // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
50157 if (ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
50158 sd_match(Cond, m_SetCC(m_Specific(N1), m_SpecificInt(EltSizeInBits),
50159 m_SpecificCondCode(ISD::SETUGE)))) {
50160 return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
50161 }
50162 }
50163
50164 // Only do this on the last DAG combine as it can interfere with other
50165 // combines.
50166 if (!DCI.isAfterLegalizeDAG())
50167 return SDValue();
50168
50169 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
50170 // TODO: This is a generic DAG combine that became an x86-only combine to
50171 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
50172 // and-not ('andn').
50173 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
50174 return SDValue();
50175
50176 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
50177 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
50178 if (!ShiftC || !AndC)
50179 return SDValue();
50180
50181 // If we can shrink the constant mask below 8-bits or 32-bits, then this
50182 // transform should reduce code size. It may also enable secondary transforms
50183 // from improved known-bits analysis or instruction selection.
50184 APInt MaskVal = AndC->getAPIntValue();
50185
50186 // If this can be matched by a zero extend, don't optimize.
50187 if (MaskVal.isMask()) {
50188 unsigned TO = MaskVal.countr_one();
50189 if (TO >= 8 && isPowerOf2_32(TO))
50190 return SDValue();
50191 }
50192
50193 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
50194 unsigned OldMaskSize = MaskVal.getSignificantBits();
50195 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
50196 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
50197 (OldMaskSize > 32 && NewMaskSize <= 32)) {
50198 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
50199 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
50200 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
50201 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
50202 }
50203 return SDValue();
50204}
50205
50206 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
50207 const X86Subtarget &Subtarget) {
50208 unsigned Opcode = N->getOpcode();
50209 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
50210
50211 SDLoc DL(N);
50212 EVT VT = N->getValueType(0);
50213 SDValue N0 = N->getOperand(0);
50214 SDValue N1 = N->getOperand(1);
50215 EVT SrcVT = N0.getValueType();
50216
50217 SDValue BC0 =
50218 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
50219 SDValue BC1 =
50220 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
50221
50222 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
50223 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
50224 // truncation trees that help us avoid lane crossing shuffles.
50225 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
50226 // TODO: We don't handle vXf64 shuffles yet.
50227 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50228 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
50229 SmallVector<SDValue> ShuffleOps;
50230 SmallVector<int> ShuffleMask, ScaledMask;
50231 SDValue Vec = peekThroughBitcasts(BCSrc);
50232 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
50233 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
50234 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
50235 // shuffle to a v4X64 width - we can probably relax this in the future.
50236 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
50237 ShuffleOps[0].getValueType().is256BitVector() &&
50238 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
50239 SDValue Lo, Hi;
50240 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50241 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
50242 Lo = DAG.getBitcast(SrcVT, Lo);
50243 Hi = DAG.getBitcast(SrcVT, Hi);
50244 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
50245 Res = DAG.getBitcast(ShufVT, Res);
50246 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
50247 return DAG.getBitcast(VT, Res);
50248 }
50249 }
50250 }
50251 }
50252
50253 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
50254 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
50255 // If either/both ops are a shuffle that can scale to v2x64,
50256 // then see if we can perform this as a v4x32 post shuffle.
50257 SmallVector<SDValue> Ops0, Ops1;
50258 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
50259 bool IsShuf0 =
50260 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50261 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50262 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50263 bool IsShuf1 =
50264 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50265 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
50266 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
50267 if (IsShuf0 || IsShuf1) {
50268 if (!IsShuf0) {
50269 Ops0.assign({BC0});
50270 ScaledMask0.assign({0, 1});
50271 }
50272 if (!IsShuf1) {
50273 Ops1.assign({BC1});
50274 ScaledMask1.assign({0, 1});
50275 }
50276
50277 SDValue LHS, RHS;
50278 int PostShuffle[4] = {-1, -1, -1, -1};
50279 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
50280 if (M < 0)
50281 return true;
50282 Idx = M % 2;
50283 SDValue Src = Ops[M / 2];
50284 if (!LHS || LHS == Src) {
50285 LHS = Src;
50286 return true;
50287 }
50288 if (!RHS || RHS == Src) {
50289 Idx += 2;
50290 RHS = Src;
50291 return true;
50292 }
50293 return false;
50294 };
50295 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
50296 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
50297 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
50298 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
50299 LHS = DAG.getBitcast(SrcVT, LHS);
50300 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
50301 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
50302 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
50303 Res = DAG.getBitcast(ShufVT, Res);
50304 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
50305 return DAG.getBitcast(VT, Res);
50306 }
50307 }
50308 }
50309
50310 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
50311 if (VT.is256BitVector() && Subtarget.hasInt256()) {
50312 SmallVector<int> Mask0, Mask1;
50313 SmallVector<SDValue> Ops0, Ops1;
50314 SmallVector<int, 2> ScaledMask0, ScaledMask1;
50315 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
50316 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
50317 !Ops0.empty() && !Ops1.empty() &&
50318 all_of(Ops0,
50319 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50320 all_of(Ops1,
50321 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
50322 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
50323 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
50324 SDValue Op00 = peekThroughBitcasts(Ops0.front());
50325 SDValue Op10 = peekThroughBitcasts(Ops1.front());
50326 SDValue Op01 = peekThroughBitcasts(Ops0.back());
50327 SDValue Op11 = peekThroughBitcasts(Ops1.back());
50328 if ((Op00 == Op11) && (Op01 == Op10)) {
50329 std::swap(Op10, Op11);
50330 ShuffleVectorSDNode::commuteMask(ScaledMask1);
50331 }
50332 if ((Op00 == Op10) && (Op01 == Op11)) {
50333 const int Map[4] = {0, 2, 1, 3};
50334 SmallVector<int, 4> ShuffleMask(
50335 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
50336 Map[ScaledMask1[1]]});
50337 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
50338 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
50339 DAG.getBitcast(SrcVT, Op01));
50340 Res = DAG.getBitcast(ShufVT, Res);
50341 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
50342 return DAG.getBitcast(VT, Res);
50343 }
50344 }
50345 }
50346
50347 return SDValue();
50348}
50349
50350 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
50351 TargetLowering::DAGCombinerInfo &DCI,
50352 const X86Subtarget &Subtarget) {
50353 unsigned Opcode = N->getOpcode();
50354 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
50355 "Unexpected pack opcode");
50356
50357 EVT VT = N->getValueType(0);
50358 SDValue N0 = N->getOperand(0);
50359 SDValue N1 = N->getOperand(1);
50360 unsigned NumDstElts = VT.getVectorNumElements();
50361 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
50362 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
50363 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
50364 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
50365 "Unexpected PACKSS/PACKUS input type");
50366
50367 bool IsSigned = (X86ISD::PACKSS == Opcode);
50368
50369 // Constant Folding.
50370 APInt UndefElts0, UndefElts1;
50371 SmallVector<APInt, 32> EltBits0, EltBits1;
50372 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
50373 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
50374 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
50375 /*AllowWholeUndefs*/ true,
50376 /*AllowPartialUndefs*/ true) &&
50377 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
50378 /*AllowWholeUndefs*/ true,
50379 /*AllowPartialUndefs*/ true)) {
50380 unsigned NumLanes = VT.getSizeInBits() / 128;
50381 unsigned NumSrcElts = NumDstElts / 2;
50382 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
50383 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
50384
50385 APInt Undefs(NumDstElts, 0);
50386 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
50387 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
50388 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
50389 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
50390 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
50391 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
50392
50393 if (UndefElts[SrcIdx]) {
50394 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
50395 continue;
50396 }
50397
50398 APInt &Val = EltBits[SrcIdx];
50399 if (IsSigned) {
50400 // PACKSS: Truncate signed value with signed saturation.
50401 // Source values less than dst minint are saturated to minint.
50402 // Source values greater than dst maxint are saturated to maxint.
50403 Val = Val.truncSSat(DstBitsPerElt);
50404 } else {
50405 // PACKUS: Truncate signed value with unsigned saturation.
50406 // Source values less than zero are saturated to zero.
50407 // Source values greater than dst maxuint are saturated to maxuint.
50408 // NOTE: This is different from APInt::truncUSat.
50409 if (Val.isIntN(DstBitsPerElt))
50410 Val = Val.trunc(DstBitsPerElt);
50411 else if (Val.isNegative())
50412 Val = APInt::getZero(DstBitsPerElt);
50413 else
50414 Val = APInt::getAllOnes(DstBitsPerElt);
50415 }
50416 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
50417 }
50418 }
50419
50420 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
50421 }
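  // Saturation examples for the constant folding above (added, illustrative):
  // packing i16 -> i8, PACKSS maps 300 -> 127 and -300 -> -128, while PACKUS
  // maps 300 -> 255 and any negative value -> 0.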
50422
50423 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
50424 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50425 return V;
50426
50427 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
50428 // Currently limit this to allsignbits cases only.
50429 if (IsSigned &&
50430 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
50431 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
50432 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
50433 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
50434 if (Not0 && Not1) {
50435 SDLoc DL(N);
50436 MVT SrcVT = N0.getSimpleValueType();
50437 SDValue Pack =
50438 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
50439 DAG.getBitcast(SrcVT, Not1));
50440 return DAG.getNOT(DL, Pack, VT);
50441 }
50442 }
50443
50444 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
50445 // truncate to create a larger truncate.
50446 if (Subtarget.hasAVX512() &&
50447 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
50448 N0.getOperand(0).getValueType() == MVT::v8i32) {
50449 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
50450 (!IsSigned &&
50451 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
50452 if (Subtarget.hasVLX())
50453 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
50454
50455 // Widen input to v16i32 so we can truncate that.
50456 SDLoc dl(N);
50457 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
50458 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
50459 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
50460 }
50461 }
50462
50463 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
50464 if (VT.is128BitVector()) {
50465 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
50466 SDValue Src0, Src1;
50467 if (N0.getOpcode() == ExtOpc &&
50468 N0.getOperand(0).getValueType().is64BitVector() &&
50469 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50470 Src0 = N0.getOperand(0);
50471 }
50472 if (N1.getOpcode() == ExtOpc &&
50473 N1.getOperand(0).getValueType().is64BitVector() &&
50474 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
50475 Src1 = N1.getOperand(0);
50476 }
50477 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
50478 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
50479 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
50480 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
50481 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
50482 }
50483
50484 // Try again with pack(*_extend_vector_inreg, undef).
50485 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
50486 : ISD::ZERO_EXTEND_VECTOR_INREG;
50487 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
50488 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
50489 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
50490 DAG);
50491 }
50492
50493 // Attempt to combine as shuffle.
50494 SDValue Op(N, 0);
50495 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50496 return Res;
50497
50498 return SDValue();
50499}
50500
50501 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
50502 TargetLowering::DAGCombinerInfo &DCI,
50503 const X86Subtarget &Subtarget) {
50504 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
50505 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
50506 "Unexpected horizontal add/sub opcode");
50507
50508 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
50509 MVT VT = N->getSimpleValueType(0);
50510 SDValue LHS = N->getOperand(0);
50511 SDValue RHS = N->getOperand(1);
50512
50513 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
50514 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
50515 LHS.getOpcode() == RHS.getOpcode() &&
50516 LHS.getValueType() == RHS.getValueType() &&
50517 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
50518 SDValue LHS0 = LHS.getOperand(0);
50519 SDValue LHS1 = LHS.getOperand(1);
50520 SDValue RHS0 = RHS.getOperand(0);
50521 SDValue RHS1 = RHS.getOperand(1);
50522 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
50523 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
50524 SDLoc DL(N);
50525 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
50526 LHS0.isUndef() ? LHS1 : LHS0,
50527 RHS0.isUndef() ? RHS1 : RHS0);
50528 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
50529 Res = DAG.getBitcast(ShufVT, Res);
50530 SDValue NewLHS =
50531 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50532 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
50533 SDValue NewRHS =
50534 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
50535 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
50536 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
50537 DAG.getBitcast(VT, NewRHS));
50538 }
50539 }
50540 }
50541
50542 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
50543 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
50544 return V;
50545
50546 return SDValue();
50547}
50548
50549 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
50550 TargetLowering::DAGCombinerInfo &DCI,
50551 const X86Subtarget &Subtarget) {
50552 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
50553 X86ISD::VSRL == N->getOpcode()) &&
50554 "Unexpected shift opcode");
50555 EVT VT = N->getValueType(0);
50556 SDValue N0 = N->getOperand(0);
50557 SDValue N1 = N->getOperand(1);
50558
50559 // Shift zero -> zero.
50560 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50561 return DAG.getConstant(0, SDLoc(N), VT);
50562
50563 // Detect constant shift amounts.
50564 APInt UndefElts;
50565 SmallVector<APInt, 32> EltBits;
50566 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
50567 /*AllowWholeUndefs*/ true,
50568 /*AllowPartialUndefs*/ false)) {
50569 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
50570 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
50571 EltBits[0].getZExtValue(), DAG);
50572 }
50573
50574 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50575 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
50576 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
50577 return SDValue(N, 0);
50578
50579 return SDValue();
50580}
50581
50582 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
50583 TargetLowering::DAGCombinerInfo &DCI,
50584 const X86Subtarget &Subtarget) {
50585 unsigned Opcode = N->getOpcode();
50586 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
50587 X86ISD::VSRLI == Opcode) &&
50588 "Unexpected shift opcode");
50589 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
50590 EVT VT = N->getValueType(0);
50591 SDValue N0 = N->getOperand(0);
50592 SDValue N1 = N->getOperand(1);
50593 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50594 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
50595 "Unexpected value type");
50596 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
50597
50598 // (shift undef, X) -> 0
50599 if (N0.isUndef())
50600 return DAG.getConstant(0, SDLoc(N), VT);
50601
50602 // Out of range logical bit shifts are guaranteed to be zero.
50603 // Out of range arithmetic bit shifts splat the sign bit.
50604 unsigned ShiftVal = N->getConstantOperandVal(1);
50605 if (ShiftVal >= NumBitsPerElt) {
50606 if (LogicalShift)
50607 return DAG.getConstant(0, SDLoc(N), VT);
50608 ShiftVal = NumBitsPerElt - 1;
50609 }
50610
50611 // (shift X, 0) -> X
50612 if (!ShiftVal)
50613 return N0;
50614
50615 // (shift 0, C) -> 0
50616 if (ISD::isBuildVectorAllZeros(N0.getNode()))
50617 // N0 is all zeros or undef. We guarantee that the bits shifted into the
50618 // result are all zeros, not undef.
50619 return DAG.getConstant(0, SDLoc(N), VT);
50620
50621 // (VSRAI -1, C) -> -1
50622 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
50623 // N0 is all ones or undef. We guarantee that the bits shifted into the
50624 // result are all ones, not undef.
50625 return DAG.getAllOnesConstant(SDLoc(N), VT);
50626
50627 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
50628 unsigned NewShiftVal = Amt0 + Amt1;
50629 if (NewShiftVal >= NumBitsPerElt) {
50630 // Out of range logical bit shifts are guaranteed to be zero.
50631 // Out of range arithmetic bit shifts splat the sign bit.
50632 if (LogicalShift)
50633 return DAG.getConstant(0, SDLoc(N), VT);
50634 NewShiftVal = NumBitsPerElt - 1;
50635 }
50636 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
50637 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
50638 };
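// A note on the two folds below: both use MergeShifts to collapse repeated
// constant shifts into one. For example, with i16 lanes,
// (VSRLI (VSRLI X, 3), 5) becomes (VSRLI X, 8); an arithmetic
// (VSRAI (VSRAI X, 12), 9) clamps to (VSRAI X, 15), since further shifting
// only replicates the sign bit, while a logical shift whose summed amount
// reaches the lane width folds to zero.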
50639
50640 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
50641 if (Opcode == N0.getOpcode())
50642 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
50643
50644 // (shl (add X, X), C) -> (shl X, (C + 1))
50645 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
50646 N0.getOperand(0) == N0.getOperand(1))
50647 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
50648
50649 // We can decode 'whole byte' logical bit shifts as shuffles.
50650 if (LogicalShift && (ShiftVal % 8) == 0) {
50651 SDValue Op(N, 0);
50652 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50653 return Res;
50654 }
50655
50656 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
50657 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
50658 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
50659 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
50660 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
50661 N0.getOpcode() == X86ISD::PSHUFD &&
50662 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
50663 N0->hasOneUse()) {
50664 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
50665 if (BC.getOpcode() == X86ISD::VSHLI &&
50666 BC.getScalarValueSizeInBits() == 64 &&
50667 BC.getConstantOperandVal(1) == 63) {
50668 SDLoc DL(N);
50669 SDValue Src = BC.getOperand(0);
50670 Src = DAG.getBitcast(VT, Src);
50671 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
50672 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
50673 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
50674 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
50675 return Src;
50676 }
50677 }
50678
50679 auto TryConstantFold = [&](SDValue V) {
50680 APInt UndefElts;
50681 SmallVector<APInt, 32> EltBits;
50682 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
50683 /*AllowWholeUndefs*/ true,
50684 /*AllowPartialUndefs*/ true))
50685 return SDValue();
50686 assert(EltBits.size() == VT.getVectorNumElements() &&
50687 "Unexpected shift value type");
50688 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
50689 // created an undef input due to no input bits being demanded, but user
50690 // still expects 0 in other bits.
50691 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
50692 APInt &Elt = EltBits[i];
50693 if (UndefElts[i])
50694 Elt = 0;
50695 else if (X86ISD::VSHLI == Opcode)
50696 Elt <<= ShiftVal;
50697 else if (X86ISD::VSRAI == Opcode)
50698 Elt.ashrInPlace(ShiftVal);
50699 else
50700 Elt.lshrInPlace(ShiftVal);
50701 }
50702 // Reset undef elements since they were zeroed above.
50703 UndefElts = 0;
50704 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
50705 };
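// The folds below only fire when this shift is the sole user of its operand
// (and, for the logic-op case, of the logic op's constant operand), so the
// pre-shifted constant never has to coexist with the original one.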
50706
50707 // Constant Folding.
50708 if (N->isOnlyUserOf(N0.getNode())) {
50709 if (SDValue C = TryConstantFold(N0))
50710 return C;
50711
50712 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
50713 // Don't break NOT patterns.
50714 SDValue BC = peekThroughOneUseBitcasts(N0);
50715 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
50716 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
50717 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
50718 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
50719 SDLoc DL(N);
50720 SDValue LHS = DAG.getNode(Opcode, DL, VT,
50721 DAG.getBitcast(VT, BC.getOperand(0)), N1);
50722 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
50723 }
50724 }
50725 }
50726
50727 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50728 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
50729 DCI))
50730 return SDValue(N, 0);
50731
50732 return SDValue();
50733}
50734
50735 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
50736 TargetLowering::DAGCombinerInfo &DCI,
50737 const X86Subtarget &Subtarget) {
50738 EVT VT = N->getValueType(0);
50739 unsigned Opcode = N->getOpcode();
50740 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
50741 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
50742 Opcode == ISD::INSERT_VECTOR_ELT) &&
50743 "Unexpected vector insertion");
50744
50745 SDValue Vec = N->getOperand(0);
50746 SDValue Scl = N->getOperand(1);
50747 SDValue Idx = N->getOperand(2);
50748
50749 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
50750 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
50751 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
50752
50753 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
50754 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
50755 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50756 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
50757 APInt::getAllOnes(NumBitsPerElt), DCI))
50758 return SDValue(N, 0);
50759 }
50760
50761 // Attempt to combine insertion patterns to a shuffle.
50762 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
50763 SDValue Op(N, 0);
50764 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50765 return Res;
50766 }
50767
50768 return SDValue();
50769}
50770
50771/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
50772/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
50773/// OR -> CMPNEQSS.
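/// When no user needs EFLAGS, the pair of scalar SETCCs is replaced by a
/// single FSETCC(M) of the original FP operands and the resulting
/// all-ones/all-zeros value is masked down to a single bit.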
50774 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
50775 TargetLowering::DAGCombinerInfo &DCI,
50776 const X86Subtarget &Subtarget) {
50777 unsigned opcode;
50778
50779 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
50780 // we're requiring SSE2 for both.
50781 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
50782 SDValue N0 = N->getOperand(0);
50783 SDValue N1 = N->getOperand(1);
50784 SDValue CMP0 = N0.getOperand(1);
50785 SDValue CMP1 = N1.getOperand(1);
50786 SDLoc DL(N);
50787
50788 // The SETCCs should both refer to the same CMP.
50789 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
50790 return SDValue();
50791
50792 SDValue CMP00 = CMP0->getOperand(0);
50793 SDValue CMP01 = CMP0->getOperand(1);
50794 EVT VT = CMP00.getValueType();
50795
50796 if (VT == MVT::f32 || VT == MVT::f64 ||
50797 (VT == MVT::f16 && Subtarget.hasFP16())) {
50798 bool ExpectingFlags = false;
50799 // Check for any users that want flags:
50800 for (const SDNode *U : N->users()) {
50801 if (ExpectingFlags)
50802 break;
50803
50804 switch (U->getOpcode()) {
50805 default:
50806 case ISD::BR_CC:
50807 case ISD::BRCOND:
50808 case ISD::SELECT:
50809 ExpectingFlags = true;
50810 break;
50811 case ISD::CopyToReg:
50812 case ISD::SIGN_EXTEND:
50813 case ISD::ZERO_EXTEND:
50814 case ISD::ANY_EXTEND:
50815 break;
50816 }
50817 }
50818
50819 if (!ExpectingFlags) {
50820 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
50821 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
50822
50823 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
50824 X86::CondCode tmp = cc0;
50825 cc0 = cc1;
50826 cc1 = tmp;
50827 }
50828
50829 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
50830 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
50831 // FIXME: need symbolic constants for these magic numbers.
50832 // See X86ATTInstPrinter.cpp:printSSECC().
50833 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
50834 if (Subtarget.hasAVX512()) {
50835 SDValue FSetCC =
50836 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
50837 DAG.getTargetConstant(x86cc, DL, MVT::i8));
50838 // Need to fill with zeros to ensure the bitcast will produce zeroes
50839 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
50840 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
50841 DAG.getConstant(0, DL, MVT::v16i1),
50842 FSetCC, DAG.getVectorIdxConstant(0, DL));
50843 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
50844 N->getSimpleValueType(0));
50845 }
50846 SDValue OnesOrZeroesF =
50847 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
50848 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
50849
50850 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
50851 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
50852
50853 if (is64BitFP && !Subtarget.is64Bit()) {
50854 // On a 32-bit target, we cannot bitcast the 64-bit float to a
50855 // 64-bit integer, since that's not a legal type. Since
50856 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
50857 // bits, but can do this little dance to extract the lowest 32 bits
50858 // and work with those going forward.
50859 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
50860 MVT::v2f64, OnesOrZeroesF);
50861 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
50862 OnesOrZeroesF =
50863 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32,
50864 DAG.getVectorIdxConstant(0, DL));
50865 IntVT = MVT::i32;
50866 }
50867
50868 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
50869 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
50870 DAG.getConstant(1, DL, IntVT));
50871 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
50872 ANDed);
50873 return OneBitOfTruth;
50874 }
50875 }
50876 }
50877 }
50878 return SDValue();
50879}
50880
50881/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
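/// ANDNP lowers directly to PANDN/VPANDN, so this removes the explicit NOT
/// (materialized as an XOR with all-ones) from the DAG.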
50882 static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL,
50883 SelectionDAG &DAG) {
50884 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50885
50886 MVT VT = N->getSimpleValueType(0);
50887 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
50888 return SDValue();
50889
50890 SDValue X, Y;
50891 SDValue N0 = N->getOperand(0);
50892 SDValue N1 = N->getOperand(1);
50893
50894 if (SDValue Not = IsNOT(N0, DAG)) {
50895 X = Not;
50896 Y = N1;
50897 } else if (SDValue Not = IsNOT(N1, DAG)) {
50898 X = Not;
50899 Y = N0;
50900 } else
50901 return SDValue();
50902
50903 X = DAG.getBitcast(VT, X);
50904 Y = DAG.getBitcast(VT, Y);
50905 return DAG.getNode(X86ISD::ANDNP, DL, VT, X, Y);
50906}
50907
50908/// Try to fold:
50909/// and (vector_shuffle<Z,...,Z>
50910/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
50911/// ->
50912/// andnp (vector_shuffle<Z,...,Z>
50913/// (insert_vector_elt undef, X, Z), undef), Y
50914 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
50915 const X86Subtarget &Subtarget) {
50916 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
50917
50918 EVT VT = N->getValueType(0);
50919 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite the
50920 // original value and require extra moves.
50921 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50922 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
50923 return SDValue();
50924
50925 auto GetNot = [&DAG](SDValue V) {
50926 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
50927 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
50928 // end-users are ISD::AND including cases
50929 // (and(extract_vector_element(SVN), Y)).
50930 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
50931 !SVN->getOperand(1).isUndef()) {
50932 return SDValue();
50933 }
50934 SDValue IVEN = SVN->getOperand(0);
50935 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
50936 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
50937 return SDValue();
50938 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
50939 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
50940 return SDValue();
50941 SDValue Src = IVEN.getOperand(1);
50942 if (SDValue Not = IsNOT(Src, DAG)) {
50943 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
50944 SDValue NotIVEN =
50945 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
50946 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
50947 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
50948 SVN->getOperand(1), SVN->getMask());
50949 }
50950 return SDValue();
50951 };
50952
50953 SDValue X, Y;
50954 SDValue N0 = N->getOperand(0);
50955 SDValue N1 = N->getOperand(1);
50956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50957
50958 if (SDValue Not = GetNot(N0)) {
50959 X = Not;
50960 Y = N1;
50961 } else if (SDValue Not = GetNot(N1)) {
50962 X = Not;
50963 Y = N0;
50964 } else
50965 return SDValue();
50966
50967 X = DAG.getBitcast(VT, X);
50968 Y = DAG.getBitcast(VT, Y);
50969 SDLoc DL(N);
50970
50971 // We do not split for SSE at all, but we need to split vectors for AVX1 and
50972 // AVX2.
50973 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
50974 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
50975 SDValue LoX, HiX;
50976 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
50977 SDValue LoY, HiY;
50978 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
50979 EVT SplitVT = LoX.getValueType();
50980 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
50981 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
50982 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
50983 }
50984
50985 if (TLI.isTypeLegal(VT))
50986 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
50987
50988 return SDValue();
50989}
50990
50991// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
50992// logical operations, like in the example below.
50993// or (and (truncate x, truncate y)),
50994// (xor (truncate z, build_vector (constants)))
50995// Given a target type \p VT, we generate
50996// or (and x, y), (xor z, zext(build_vector (constants)))
50997 // given x, y and z are of type \p VT. We can do so if each operand is either a
50998 // truncate from VT or recursively promotable, and the second operand may also
50999 // be a (foldable) constant vector or an existing extension we can extend further.
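// Illustrative sketch (types only for exposition): for VT = v8i32,
// or (and (trunc v8i32 x to v8i16), (trunc v8i32 y to v8i16)), ...
// is rebuilt as the v8i32 node 'or (and x, y), ...'; the caller then
// re-applies the extend-in-reg it was folding through, so the narrow/wide
// casts around the logic ops disappear.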
51000 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
51001 SelectionDAG &DAG,
51002 const X86Subtarget &Subtarget,
51003 unsigned Depth) {
51004 // Limit recursion to avoid excessive compile times.
51005 if (Depth >= SelectionDAG::MaxRecursionDepth)
51006 return SDValue();
51007
51008 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
51009 return SDValue();
51010
51011 SDValue N0 = N.getOperand(0);
51012 SDValue N1 = N.getOperand(1);
51013
51014 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51015 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
51016 return SDValue();
51017
51018 if (SDValue NN0 =
51019 PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
51020 N0 = NN0;
51021 else {
51022 // The left side has to be a 'trunc'.
51023 bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
51024 N0.getOperand(0).getValueType() == VT;
51025 if (LHSTrunc)
51026 N0 = N0.getOperand(0);
51027 else
51028 return SDValue();
51029 }
51030
51031 if (SDValue NN1 =
51032 PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
51033 N1 = NN1;
51034 else {
51035 // The right side has to be a 'trunc', a (foldable) constant or an
51036 // existing extension we can extend further.
51037 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
51038 N1.getOperand(0).getValueType() == VT;
51039 if (RHSTrunc)
51040 N1 = N1.getOperand(0);
51041 else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
51042 Subtarget.hasInt256() && N1.hasOneUse())
51043 N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
51044 else if (SDValue Cst =
51045 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
51046 N1 = Cst;
51047 else
51048 return SDValue();
51049 }
51050
51051 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
51052}
51053
51054// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
51055// register. In most cases we actually compare or select YMM-sized registers
51056// and mixing the two types creates horrible code. This method optimizes
51057// some of the transition sequences.
51058// Even with AVX-512 this is still useful for removing casts around logical
51059// operations on vXi1 mask types.
51060 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
51061 SelectionDAG &DAG,
51062 const X86Subtarget &Subtarget) {
51063 EVT VT = N.getValueType();
51064 assert(VT.isVector() && "Expected vector type");
51065 assert((N.getOpcode() == ISD::ANY_EXTEND ||
51066 N.getOpcode() == ISD::ZERO_EXTEND ||
51067 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
51068
51069 SDValue Narrow = N.getOperand(0);
51070 EVT NarrowVT = Narrow.getValueType();
51071
51072 // Generate the wide operation.
51073 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
51074 if (!Op)
51075 return SDValue();
51076 switch (N.getOpcode()) {
51077 default: llvm_unreachable("Unexpected opcode");
51078 case ISD::ANY_EXTEND:
51079 return Op;
51080 case ISD::ZERO_EXTEND:
51081 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
51082 case ISD::SIGN_EXTEND:
51083 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
51084 Op, DAG.getValueType(NarrowVT));
51085 }
51086}
51087
51088static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
51089 unsigned FPOpcode;
51090 switch (Opcode) {
51091 // clang-format off
51092 default: llvm_unreachable("Unexpected input node for FP logic conversion");
51093 case ISD::AND: FPOpcode = X86ISD::FAND; break;
51094 case ISD::OR: FPOpcode = X86ISD::FOR; break;
51095 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
51096 // clang-format on
51097 }
51098 return FPOpcode;
51099}
51100
51101/// If both input operands of a logic op are being cast from floating-point
51102/// types or FP compares, try to convert this into a floating-point logic node
51103/// to avoid unnecessary moves from SSE to integer registers.
51104static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT,
51105 SDValue N0, SDValue N1,
51106 SelectionDAG &DAG,
51107 TargetLowering::DAGCombinerInfo &DCI,
51108 const X86Subtarget &Subtarget) {
51109 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51110 "Unexpected bit opcode");
51111
51112 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
51113 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
51114 return SDValue();
51115
51116 SDValue N00 = N0.getOperand(0);
51117 SDValue N10 = N1.getOperand(0);
51118 EVT N00Type = N00.getValueType();
51119 EVT N10Type = N10.getValueType();
51120
51121 // Ensure that both types are the same and are legal scalar fp types.
51122 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
51123 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
51124 (Subtarget.hasFP16() && N00Type == MVT::f16)))
51125 return SDValue();
51126
51127 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
51128 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(Opc);
51129 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
51130 return DAG.getBitcast(VT, FPLogic);
51131 }
51132
51133 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
51134 !N1.hasOneUse())
51135 return SDValue();
51136
51137 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
51138 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
51139
51140 // The vector ISA for FP predicates is incomplete before AVX, so converting
51141 // COMIS* to CMPS* may not be a win before AVX.
51142 if (!Subtarget.hasAVX() &&
51143 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
51144 return SDValue();
51145
51146 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
51147 // and vector logic:
51148 // logic (setcc N00, N01), (setcc N10, N11) -->
51149 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
51150 unsigned NumElts = 128 / N00Type.getSizeInBits();
51151 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
51152 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
51153 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
51154 SDValue N01 = N0.getOperand(1);
51155 SDValue N11 = N1.getOperand(1);
51156 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
51157 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
51158 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
51159 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
51160 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
51161 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
51162 SDValue Logic = DAG.getNode(Opc, DL, BoolVecVT, Setcc0, Setcc1);
51163 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
51164}
51165
51166// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
51167// to reduce XMM->GPR traffic.
51168static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0,
51169 SDValue N1, SelectionDAG &DAG) {
51170 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51171 "Unexpected bit opcode");
51172
51173 // Both operands must be single use MOVMSK.
51174 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
51175 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
51176 return SDValue();
51177
51178 SDValue Vec0 = N0.getOperand(0);
51179 SDValue Vec1 = N1.getOperand(0);
51180 EVT VecVT0 = Vec0.getValueType();
51181 EVT VecVT1 = Vec1.getValueType();
51182
51183 // Both MOVMSK operands must be from vectors of the same size and same element
51184 // size, but it's OK for an fp/int diff.
51185 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
51186 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
51187 return SDValue();
51188
51189 unsigned VecOpc =
51190 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
51191 SDValue Result =
51192 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
51193 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
51194}
51195
51196// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
51197// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
51198// handles in InstCombine.
51199static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT,
51200 SDValue N0, SDValue N1,
51201 SelectionDAG &DAG) {
51202 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51203 "Unexpected bit opcode");
51204
51205 // Both operands must be single use.
51206 if (!N0.hasOneUse() || !N1.hasOneUse())
51207 return SDValue();
51208
51209 // Search for matching shifts.
51210 SDValue BC0 = peekThroughOneUseBitcasts(N0);
51211 SDValue BC1 = peekThroughOneUseBitcasts(N1);
51212
51213 unsigned BCOpc = BC0.getOpcode();
51214 EVT BCVT = BC0.getValueType();
51215 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
51216 return SDValue();
51217
51218 switch (BCOpc) {
51219 case X86ISD::VSHLI:
51220 case X86ISD::VSRLI:
51221 case X86ISD::VSRAI: {
51222 if (BC0.getOperand(1) != BC1.getOperand(1))
51223 return SDValue();
51224 SDValue BitOp =
51225 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
51226 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
51227 return DAG.getBitcast(VT, Shift);
51228 }
51229 }
51230
51231 return SDValue();
51232}
51233
51234// Attempt to fold:
51235// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
51236 // TODO: Handle PACKUS.
51237static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT,
51238 SDValue N0, SDValue N1, SelectionDAG &DAG) {
51239 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
51240 "Unexpected bit opcode");
51241
51242 // Both operands must be single use.
51243 if (!N0.hasOneUse() || !N1.hasOneUse())
51244 return SDValue();
51245
51246 // Search for matching packs.
51247 N0 = peekThroughOneUseBitcasts(N0);
51248 N1 = peekThroughOneUseBitcasts(N1);
51249
51250 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
51251 return SDValue();
51252
51253 MVT DstVT = N0.getSimpleValueType();
51254 if (DstVT != N1.getSimpleValueType())
51255 return SDValue();
51256
51257 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
51258 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
51259
51260 // Limit to allsignbits packing.
51261 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
51262 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
51263 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
51264 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
51265 return SDValue();
51266
51267 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
51268 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
51269 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
51270}
51271
51272/// If this is a zero/all-bits result that is bitwise-anded with a low bits
51273 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
51274/// with a shift-right to eliminate loading the vector constant mask value.
51275 static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL,
51276 SelectionDAG &DAG,
51277 const X86Subtarget &Subtarget) {
51278 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
51279 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
51280 EVT VT = Op0.getValueType();
51281 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
51282 return SDValue();
51283
51284 // Try to convert an "is positive" signbit masking operation into arithmetic
51285 // shift and "andn". This saves a materialization of a -1 vector constant.
51286 // The "is negative" variant should be handled more generally because it only
51287 // requires "and" rather than "andn":
51288 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
51289 //
51290 // This is limited to the original type to avoid producing even more bitcasts.
51291 // If the bitcasts can't be eliminated, then it is unlikely that this fold
51292 // will be profitable.
51293 if (N->getValueType(0) == VT &&
51294 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
51295 SDValue X, Y;
51296 if (Op1.getOpcode() == X86ISD::PCMPGT &&
51297 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
51298 X = Op1.getOperand(0);
51299 Y = Op0;
51300 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
51301 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
51302 X = Op0.getOperand(0);
51303 Y = Op1;
51304 }
51305 if (X && Y) {
51306 SDValue Sra =
51307 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
51308 VT.getScalarSizeInBits() - 1, DAG);
51309 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
51310 }
51311 }
51312
51313 APInt SplatVal;
51314 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
51315 return SDValue();
51316
51317 // Don't prevent creation of ANDN.
51318 if (isBitwiseNot(Op0))
51319 return SDValue();
51320
51321 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
51322 return SDValue();
51323
51324 unsigned EltBitWidth = VT.getScalarSizeInBits();
51325 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
51326 return SDValue();
51327
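  // At this point every bit of Op0 is a copy of its sign bit and the splat is
  // a low-bits mask, so (and Op0, mask) is equivalent to a logical right shift
  // of Op0 by (EltBitWidth - number of mask bits): the result is either
  // all-zero or exactly the mask itself.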
51328 unsigned ShiftVal = SplatVal.countr_one();
51329 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
51330 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
51331 return DAG.getBitcast(N->getValueType(0), Shift);
51332}
51333
51334// Get the index node from the lowered DAG of a GEP IR instruction with one
51335// indexing dimension.
51336 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
51337 if (Ld->isIndexed())
51338 return SDValue();
51339
51340 SDValue Base = Ld->getBasePtr();
51341 if (Base.getOpcode() != ISD::ADD)
51342 return SDValue();
51343
51344 SDValue ShiftedIndex = Base.getOperand(0);
51345 if (ShiftedIndex.getOpcode() != ISD::SHL)
51346 return SDValue();
51347
51348 return ShiftedIndex.getOperand(0);
51349}
51350
51351static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
51352 return Subtarget.hasBMI2() &&
51353 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
51354}
51355
51356/// Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z))
51357/// This undoes the inverse fold performed in InstCombine
51358 static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL,
51359 SelectionDAG &DAG) {
51360 using namespace llvm::SDPatternMatch;
51361 MVT VT = N->getSimpleValueType(0);
51362 if (!DAG.getTargetLoweringInfo().hasAndNot(SDValue(N, 0)))
51363 return SDValue();
51364
51365 SDValue X, Y, Z;
51366 if (sd_match(N, m_And(m_Value(X),
51367 m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z))))))) {
51368 // Don't fold if Y or Z are constants to prevent infinite loops.
51369 if (!DAG.isConstantIntBuildVectorOrConstantInt(Y) &&
51370 !DAG.isConstantIntBuildVectorOrConstantInt(Z))
51371 return DAG.getNode(
51372 ISD::AND, DL, VT, X,
51373 DAG.getNOT(
51374 DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
51375 }
51376
51377 return SDValue();
51378}
51379
51380 // This function recognizes cases where the X86 bzhi instruction can replace an
51381// 'and-load' sequence.
51382// In case of loading integer value from an array of constants which is defined
51383// as follows:
51384//
51385// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
51386//
51387// then applying a bitwise and on the result with another input.
51388// It's equivalent to performing bzhi (zero high bits) on the input, with the
51389// same index of the load.
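// Illustrative IR for the 32-bit case (names are hypothetical):
//   %v = load i32, ptr getelementptr ([32 x i32], ptr @arr, i64 0, i64 %idx)
//   %r = and i32 %v, %inp
// Since arr[idx] == (1 << idx) - 1, %r keeps only the low idx bits of %inp,
// which is exactly what BZHI computes from %inp and %idx.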
51390 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
51391 const X86Subtarget &Subtarget) {
51392 MVT VT = Node->getSimpleValueType(0);
51393 SDLoc dl(Node);
51394
51395 // Check if subtarget has BZHI instruction for the node's type
51396 if (!hasBZHI(Subtarget, VT))
51397 return SDValue();
51398
51399 // Try matching the pattern for both operands.
51400 for (unsigned i = 0; i < 2; i++) {
51401 // continue if the operand is not a load instruction
51402 auto *Ld = dyn_cast<LoadSDNode>(Node->getOperand(i));
51403 if (!Ld)
51404 continue;
51405 const Value *MemOp = Ld->getMemOperand()->getValue();
51406 if (!MemOp)
51407 continue;
51408 // Get the Node which indexes into the array.
51409 SDValue Index = getIndexFromUnindexedLoad(Ld);
51410 if (!Index)
51411 continue;
51412
51413 if (auto *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
51414 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
51415 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
51416 Constant *Init = GV->getInitializer();
51417 Type *Ty = Init->getType();
51418 if (!Ty->isArrayTy() ||
51419 !Ty->getArrayElementType()->isIntegerTy() ||
51420 Ty->getArrayElementType()->getScalarSizeInBits() !=
51421 VT.getSizeInBits() ||
51422 Ty->getArrayNumElements() >
51423 Ty->getArrayElementType()->getScalarSizeInBits())
51424 continue;
51425
51426 // Check if the array's constant elements are suitable to our case.
51427 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
51428 bool ConstantsMatch = true;
51429 for (uint64_t j = 0; j < ArrayElementCount; j++) {
51430 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
51431 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
51432 ConstantsMatch = false;
51433 break;
51434 }
51435 }
51436 if (!ConstantsMatch)
51437 continue;
51438
51439 // Do the transformation (For 32-bit type):
51440 // -> (and (load arr[idx]), inp)
51441 // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
51442 // that will be replaced with one bzhi instruction.
51443 SDValue Inp = Node->getOperand(i == 0 ? 1 : 0);
51444 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
51445
51446 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
51447 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
51448 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
51449
51450 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
51451 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
51452 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
51453 }
51454 }
51455 }
51456 }
51457 return SDValue();
51458}
51459
51460// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
51461// Where C is a mask containing the same number of bits as the setcc and
51462 // where the setcc will freely zero the upper bits of the k-register. We can replace the
51463// undef in the concat with 0s and remove the AND. This mainly helps with
51464// v2i1/v4i1 setcc being casted to scalar.
51465 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
51466 const X86Subtarget &Subtarget) {
51467 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
51468
51469 EVT VT = N->getValueType(0);
51470
51471 // Make sure this is an AND with constant. We will check the value of the
51472 // constant later.
51473 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
51474 if (!C1)
51475 return SDValue();
51476
51477 // This is implied by the ConstantSDNode.
51478 assert(!VT.isVector() && "Expected scalar VT!");
51479
51480 SDValue Src = N->getOperand(0);
51481 if (!Src.hasOneUse())
51482 return SDValue();
51483
51484 // (Optionally) peek through any_extend().
51485 if (Src.getOpcode() == ISD::ANY_EXTEND) {
51486 if (!Src.getOperand(0).hasOneUse())
51487 return SDValue();
51488 Src = Src.getOperand(0);
51489 }
51490
51491 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
51492 return SDValue();
51493
51494 Src = Src.getOperand(0);
51495 EVT SrcVT = Src.getValueType();
51496
51497 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51498 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
51499 !TLI.isTypeLegal(SrcVT))
51500 return SDValue();
51501
51502 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
51503 return SDValue();
51504
51505 // We only care about the first subvector of the concat, we expect the
51506 // other subvectors to be ignored due to the AND if we make the change.
51507 SDValue SubVec = Src.getOperand(0);
51508 EVT SubVecVT = SubVec.getValueType();
51509
51510 // The RHS of the AND should be a mask with as many bits as SubVec.
51511 if (!TLI.isTypeLegal(SubVecVT) ||
51512 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
51513 return SDValue();
51514
51515 // First subvector should be a setcc with a legal result type or a
51516 // AND containing at least one setcc with a legal result type.
51517 auto IsLegalSetCC = [&](SDValue V) {
51518 if (V.getOpcode() != ISD::SETCC)
51519 return false;
51520 EVT SetccVT = V.getOperand(0).getValueType();
51521 if (!TLI.isTypeLegal(SetccVT) ||
51522 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
51523 return false;
51524 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
51525 return false;
51526 return true;
51527 };
51528 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
51529 (IsLegalSetCC(SubVec.getOperand(0)) ||
51530 IsLegalSetCC(SubVec.getOperand(1))))))
51531 return SDValue();
51532
51533 // We passed all the checks. Rebuild the concat_vectors with zeroes
51534 // and cast it back to VT.
51535 SDLoc dl(N);
51536 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
51537 DAG.getConstant(0, dl, SubVecVT));
51538 Ops[0] = SubVec;
51539 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
51540 Ops);
51541 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
51542 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
51543}
51544
51545 static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
51546 SDValue OpMustEq, SDValue Op, unsigned Depth) {
51547 // We don't want to go crazy with the recursion here. This isn't a super
51548 // important optimization.
51549 static constexpr unsigned kMaxDepth = 2;
51550
51551 // Only do this re-ordering if op has one use.
51552 if (!Op.hasOneUse())
51553 return SDValue();
51554
51555 SDLoc DL(Op);
51556 // If we hit another associative op, recurse further.
51557 if (Op.getOpcode() == Opc) {
51558 // Done recursing.
51559 if (Depth++ >= kMaxDepth)
51560 return SDValue();
51561
51562 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51563 if (SDValue R =
51564 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
51565 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
51566 Op.getOperand(1 - OpIdx));
51567
51568 } else if (Op.getOpcode() == ISD::SUB) {
51569 if (Opc == ISD::AND) {
51570 // BLSI: (and x, (sub 0, x))
51571 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
51572 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51573 }
51574 // Opc must be ISD::AND or ISD::XOR
51575 // BLSR: (and x, (sub x, 1))
51576 // BLSMSK: (xor x, (sub x, 1))
51577 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51578 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51579
51580 } else if (Op.getOpcode() == ISD::ADD) {
51581 // Opc must be ISD::AND or ISD::XOR
51582 // BLSR: (and x, (add x, -1))
51583 // BLSMSK: (xor x, (add x, -1))
51584 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
51585 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
51586 }
51587 return SDValue();
51588}
51589
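// Try to reassociate AND/XOR so that one of the BMI1 bit-manipulation
// patterns (BLSI: x & -x, BLSR: x & (x - 1), BLSMSK: x ^ (x - 1)) becomes
// visible and can be selected.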
51590 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
51591 const X86Subtarget &Subtarget) {
51592 EVT VT = N->getValueType(0);
51593 // Make sure this node is a candidate for BMI instructions.
51594 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
51595 (VT != MVT::i32 && VT != MVT::i64))
51596 return SDValue();
51597
51598 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
51599
51600 // Try and match LHS and RHS.
51601 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
51602 if (SDValue OpMatch =
51603 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
51604 N->getOperand(1 - OpIdx), 0))
51605 return OpMatch;
51606 return SDValue();
51607}
51608
51609/// Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
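/// This relies on X ^ -X being the bitwise NOT of BLSMSK(X) = X ^ (X - 1),
/// so the AND with Y can be rewritten as an ANDN against BLSMSK(X).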
51610 static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL,
51611 SelectionDAG &DAG,
51612 const X86Subtarget &Subtarget) {
51613 using namespace llvm::SDPatternMatch;
51614
51615 EVT VT = And->getValueType(0);
51616 // Make sure this node is a candidate for BMI instructions.
51617 if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
51618 return SDValue();
51619
51620 SDValue X;
51621 SDValue Y;
51622 if (!sd_match(And, m_And(m_OneUse(m_Xor(m_Value(X),
51623 m_Neg(m_Deferred(X)))),
51624 m_Value(Y))))
51625 return SDValue();
51626
51627 SDValue BLSMSK =
51628 DAG.getNode(ISD::XOR, DL, VT, X,
51629 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getConstant(1, DL, VT)));
51630 SDValue AndN = DAG.getNode(ISD::AND, DL, VT, Y, DAG.getNOT(DL, BLSMSK, VT));
51631 return AndN;
51632}
51633
51634 static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag,
51635 SelectionDAG &DAG,
51636 TargetLowering::DAGCombinerInfo &DCI,
51637 const X86Subtarget &ST) {
51638 // cmp(setcc(cc, X), 0)
51639 // brcond ne
51640 // ->
51641 // X
51642 // brcond cc
51643
51644 // sub(setcc(cc, X), 1)
51645 // brcond ne
51646 // ->
51647 // X
51648 // brcond ~cc
51649 //
51650 // if only flag has users
51651
51652 SDValue SetCC = N->getOperand(0);
51653
51654 if (SetCC.getOpcode() != X86ISD::SETCC || !Flag.hasOneUse())
51655 return SDValue();
51656
51657 // Check the only user of flag is `brcond ne`.
51658 SDNode *BrCond = *Flag->user_begin();
51659 if (BrCond->getOpcode() != X86ISD::BRCOND)
51660 return SDValue();
51661 unsigned CondNo = 2;
51662 if (static_cast<X86::CondCode>(BrCond->getConstantOperandVal(CondNo)) !=
51663 X86::COND_NE)
51664 return SDValue();
51665
51666 SDValue X = SetCC.getOperand(1);
51667 // sub has two results while X only has one. DAG combine assumes the value
51668 // type matches.
51669 if (N->getOpcode() == X86ISD::SUB)
51670 X = DAG.getMergeValues({N->getOperand(0), X}, SDLoc(N));
51671
51672 SDValue CCN = SetCC.getOperand(0);
51673 X86::CondCode CC =
51674 static_cast<X86::CondCode>(CCN->getAsAPIntVal().getSExtValue());
51675 X86::CondCode OppositeCC = X86::GetOppositeBranchCondition(CC);
51676 // Update CC for the consumer of the flag.
51677 // The old CC is `ne`. Hence, when comparing the result with 0, we are
51678 // checking if the second condition evaluates to true. When comparing the
51679 // result with 1, we are checking if the second condition evaluates to false.
51680 SmallVector<SDValue> Ops(BrCond->op_values());
51681 if (isNullConstant(N->getOperand(1)))
51682 Ops[CondNo] = CCN;
51683 else if (isOneConstant(N->getOperand(1)))
51684 Ops[CondNo] = DAG.getTargetConstant(OppositeCC, SDLoc(BrCond), MVT::i8);
51685 else
51686 llvm_unreachable("expect constant 0 or 1");
51687
51688 SDValue NewBrCond =
51689 DAG.getNode(X86ISD::BRCOND, SDLoc(BrCond), BrCond->getValueType(0), Ops);
51690 // Avoid self-assign error b/c CC1 can be `e/ne`.
51691 if (BrCond != NewBrCond.getNode())
51692 DCI.CombineTo(BrCond, NewBrCond);
51693 return X;
51694}
51695
51696 static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG,
51697 TargetLowering::DAGCombinerInfo &DCI,
51698 const X86Subtarget &ST) {
51699 // and/or(setcc(cc0, flag0), setcc(cc1, sub (X, Y)))
51700 // ->
51701 // setcc(cc1, ccmp(X, Y, ~cflags/cflags, cc0/~cc0, flag0))
51702
51703 // and/or(setcc(cc0, flag0), setcc(cc1, cmp (X, 0)))
51704 // ->
51705 // setcc(cc1, ctest(X, X, ~cflags/cflags, cc0/~cc0, flag0))
51706 //
51707 // where cflags is determined by cc1.
51708
51709 if (!ST.hasCCMP())
51710 return SDValue();
51711
51712 SDValue SetCC0 = N->getOperand(0);
51713 SDValue SetCC1 = N->getOperand(1);
51714 if (SetCC0.getOpcode() != X86ISD::SETCC ||
51715 SetCC1.getOpcode() != X86ISD::SETCC)
51716 return SDValue();
51717
51718 auto GetCombineToOpc = [&](SDValue V) -> unsigned {
51719 SDValue Op = V.getOperand(1);
51720 unsigned Opc = Op.getOpcode();
51721 if (Opc == X86ISD::SUB)
51722 return X86ISD::CCMP;
51723 if (Opc == X86ISD::CMP && isNullConstant(Op.getOperand(1)))
51724 return X86ISD::CTEST;
51725 return 0U;
51726 };
51727
51728 unsigned NewOpc = 0;
51729
51730 // AND/OR is commutable. Canonicalize the operands to make SETCC with SUB/CMP
51731 // appear on the right.
51732 if (!(NewOpc = GetCombineToOpc(SetCC1))) {
51733 std::swap(SetCC0, SetCC1);
51734 if (!(NewOpc = GetCombineToOpc(SetCC1)))
51735 return SDValue();
51736 }
51737
51738 X86::CondCode CC0 =
51739 static_cast<X86::CondCode>(SetCC0.getConstantOperandVal(0));
51740 // CCMP/CTEST is not conditional when the source condition is COND_P/COND_NP.
51741 if (CC0 == X86::COND_P || CC0 == X86::COND_NP)
51742 return SDValue();
51743
51744 bool IsOR = N->getOpcode() == ISD::OR;
51745
51746 // CMP/TEST is executed and updates the EFLAGS normally only when SrcCC
51747 // evaluates to true. So we need to inverse CC0 as SrcCC when the logic
51748 // operator is OR. Similar for CC1.
51749 SDValue SrcCC =
51750 IsOR ? DAG.getTargetConstant(X86::GetOppositeBranchCondition(CC0),
51751 SDLoc(SetCC0.getOperand(0)), MVT::i8)
51752 : SetCC0.getOperand(0);
51753 SDValue CC1N = SetCC1.getOperand(0);
51754 X86::CondCode CC1 =
51755 static_cast<X86::CondCode>(CC1N->getAsAPIntVal().getSExtValue());
51756 X86::CondCode OppositeCC1 = X86::GetOppositeBranchCondition(CC1);
51757 X86::CondCode CFlagsCC = IsOR ? CC1 : OppositeCC1;
51758 SDLoc DL(N);
51759 SDValue CFlags = DAG.getTargetConstant(
51760 X86::getCCMPCondFlagsFromCondCode(CFlagsCC), DL, MVT::i8);
51761 SDValue Sub = SetCC1.getOperand(1);
51762
51763 // Replace any uses of the old flag produced by SUB/CMP with the new one
51764 // produced by CCMP/CTEST.
51765 SDValue CCMP = (NewOpc == X86ISD::CCMP)
51766 ? DAG.getNode(X86ISD::CCMP, DL, MVT::i32,
51767 {Sub.getOperand(0), Sub.getOperand(1),
51768 CFlags, SrcCC, SetCC0.getOperand(1)})
51769 : DAG.getNode(X86ISD::CTEST, DL, MVT::i32,
51770 {Sub.getOperand(0), Sub.getOperand(0),
51771 CFlags, SrcCC, SetCC0.getOperand(1)});
51772
51773 return DAG.getNode(X86ISD::SETCC, DL, MVT::i8, {CC1N, CCMP});
51774}
51775
51776 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
51777 TargetLowering::DAGCombinerInfo &DCI,
51778 const X86Subtarget &Subtarget) {
51779 using namespace SDPatternMatch;
51780
51781 SDValue N0 = N->getOperand(0);
51782 SDValue N1 = N->getOperand(1);
51783 EVT VT = N->getValueType(0);
51784 SDLoc dl(N);
51785 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51786
51787 // If this is SSE1 only convert to FAND to avoid scalarization.
51788 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
51789 return DAG.getBitcast(MVT::v4i32,
51790 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
51791 DAG.getBitcast(MVT::v4f32, N0),
51792 DAG.getBitcast(MVT::v4f32, N1)));
51793 }
51794
51795 // Use a 32-bit and+zext if upper bits known zero.
51796 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
51797 APInt HiMask = APInt::getHighBitsSet(64, 32);
51798 if (DAG.MaskedValueIsZero(N1, HiMask) ||
51799 DAG.MaskedValueIsZero(N0, HiMask)) {
51800 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
51801 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
51802 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
51803 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
51804 }
51805 }
51806
51807 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
51808 // TODO: Support multiple SrcOps.
51809 if (VT == MVT::i1) {
51810 SmallVector<SDValue, 8> SrcOps;
51811 SmallVector<APInt, 2> SrcPartials;
51812 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
51813 SrcOps.size() == 1) {
51814 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
51815 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51816 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
51817 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
51818 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
51819 if (Mask) {
51820 assert(SrcPartials[0].getBitWidth() == NumElts &&
51821 "Unexpected partial reduction mask");
51822 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
51823 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
51824 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
51825 }
51826 }
51827 }
51828
51829 // InstCombine converts:
51830 // `(-x << C0) & C1`
51831 // to
51832 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
51833 // This saves an IR instruction but on x86 the neg/shift version is preferable
51834 // so undo the transform.
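  // Worked example: with C0 = 1 and C1 = 12, InstCombine produces
  // (x * 14) & 12. Here MulC = 14, its isolated low bit is 2, and
  // 14 + 2 = 16 is a power of two, so the code below rebuilds
  // ((0 - x) << 1) & 12.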
51835
51836 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
51837 // TODO: We don't actually need a splat for this, we just need the checks to
51838 // hold for each element.
51839 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
51840 /*AllowTruncation*/ false);
51841 ConstantSDNode *N01C =
51842 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
51843 /*AllowTruncation*/ false);
51844 if (N1C && N01C) {
51845 const APInt &MulC = N01C->getAPIntValue();
51846 const APInt &AndC = N1C->getAPIntValue();
51847 APInt MulCLowBit = MulC & (-MulC);
51848 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
51849 (MulCLowBit + MulC).isPowerOf2()) {
51850 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
51851 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
51852 assert(MulCLowBitLog != -1 &&
51853 "Isolated lowbit is somehow not a power of 2!");
51854 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
51855 DAG.getConstant(MulCLowBitLog, dl, VT));
51856 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
51857 }
51858 }
51859 }
51860
51861 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
51862 return SetCC;
51863
51864 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
51865 return V;
51866
51867 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
51868 return R;
51869
51870 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
51871 return R;
51872
51873 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
51874 return R;
51875
51876 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
51877 DAG, DCI, Subtarget))
51878 return FPLogic;
51879
51880 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
51881 return R;
51882
51883 if (DCI.isBeforeLegalizeOps())
51884 return SDValue();
51885
51886 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
51887 return R;
51888
51890 if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
51890 return R;
51891
51892 if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
51893 return ShiftRight;
51894
51895 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
51896 return R;
51897
51898 if (SDValue R = combineAndNotOrIntoAndNotAnd(N, dl, DAG))
51899 return R;
51900
51901 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
51902 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
51903 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
51904 if (VT.isVector() && getTargetConstantFromNode(N1)) {
51905 unsigned Opc0 = N0.getOpcode();
51906 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
51907 getTargetConstantFromNode(N0.getOperand(1)) &&
51908 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
51909 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
51910 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
51911 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
51912 }
51913 }
51914
51915 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
51916 // to make use of predicated selects.
51917 // AND(X,SEXT(SETCC())) -> SELECT(SETCC(),X,0)
51918 if (DCI.isAfterLegalizeDAG() && VT.isVector()) {
51919 SDValue X, Y;
51920 EVT CondVT = VT.changeVectorElementType(MVT::i1);
51921 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
51922 (VT.is512BitVector() || Subtarget.hasVLX()) &&
51923 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
51924 sd_match(N, m_And(m_Value(X),
51925 m_OneUse(m_SExt(m_AllOf(
51926 m_Value(Y), m_SpecificVT(CondVT),
51927 m_SetCC(m_Value(), m_Value(), m_Value()))))))) {
51928 return DAG.getSelect(dl, VT, Y, X,
51929 getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl));
51930 }
51931 }
51932
51933 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
51934 // avoids slow variable shift (moving shift amount to ECX etc.)
51935 if (isOneConstant(N1) && N0->hasOneUse()) {
51936 SDValue Src = N0;
51937 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
51938 Src.getOpcode() == ISD::TRUNCATE) &&
51939 Src.getOperand(0)->hasOneUse())
51940 Src = Src.getOperand(0);
51941 bool ContainsNOT = false;
51942 X86::CondCode X86CC = X86::COND_B;
51943 // Peek through AND(NOT(SRL(X,Y)),1).
51944 if (isBitwiseNot(Src)) {
51945 Src = Src.getOperand(0);
51946 X86CC = X86::COND_AE;
51947 ContainsNOT = true;
51948 }
51949 if (Src.getOpcode() == ISD::SRL &&
51950 !isa<ConstantSDNode>(Src.getOperand(1))) {
51951 SDValue BitNo = Src.getOperand(1);
51952 Src = Src.getOperand(0);
51953 // Peek through AND(SRL(NOT(X),Y),1).
51954 if (isBitwiseNot(Src)) {
51955 Src = Src.getOperand(0);
51956 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
51957 ContainsNOT = true;
51958 }
51959 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
51960 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
51961 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
51962 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
51963 }
51964 }
51965
51966 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
51967 // Attempt to recursively combine a bitmask AND with shuffles.
51968 SDValue Op(N, 0);
51969 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
51970 return Res;
51971
51972 // If either operand is a constant mask, then only the elements that aren't
51973 // zero are actually demanded by the other operand.
51974 auto GetDemandedMasks = [&](SDValue Op) {
51975 APInt UndefElts;
51976 SmallVector<APInt> EltBits;
51977 int NumElts = VT.getVectorNumElements();
51978 int EltSizeInBits = VT.getScalarSizeInBits();
51979 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
51980 APInt DemandedElts = APInt::getAllOnes(NumElts);
51981 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
51982 EltBits)) {
51983 DemandedBits.clearAllBits();
51984 DemandedElts.clearAllBits();
51985 for (int I = 0; I != NumElts; ++I) {
51986 if (UndefElts[I]) {
51987 // We can't assume an undef src element gives an undef dst - the
51988 // other src might be zero.
51989 DemandedBits.setAllBits();
51990 DemandedElts.setBit(I);
51991 } else if (!EltBits[I].isZero()) {
51992 DemandedBits |= EltBits[I];
51993 DemandedElts.setBit(I);
51994 }
51995 }
51996 }
51997 return std::make_pair(DemandedBits, DemandedElts);
51998 };
51999 APInt Bits0, Elts0;
52000 APInt Bits1, Elts1;
52001 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52002 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
52003
52004 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52005 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52006 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52007 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52008 if (N->getOpcode() != ISD::DELETED_NODE)
52009 DCI.AddToWorklist(N);
52010 return SDValue(N, 0);
52011 }
52012
52013 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
52014 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
52015 if (NewN0 || NewN1)
52016 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
52017 NewN1 ? NewN1 : N1);
52018 }
52019
52020 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
52021 if ((VT.getScalarSizeInBits() % 8) == 0 &&
52022 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
52023 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
52024 SDValue BitMask = N1;
52025 SDValue SrcVec = N0.getOperand(0);
52026 EVT SrcVecVT = SrcVec.getValueType();
52027
52028 // Check that the constant bitmask masks whole bytes.
52029 APInt UndefElts;
52030 SmallVector<APInt, 64> EltBits;
52031 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
52032 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
52033 llvm::all_of(EltBits, [](const APInt &M) {
52034 return M.isZero() || M.isAllOnes();
52035 })) {
52036 unsigned NumElts = SrcVecVT.getVectorNumElements();
52037 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
52038 unsigned Idx = N0.getConstantOperandVal(1);
52039
52040 // Create a root shuffle mask from the byte mask and the extracted index.
52041 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
52042 for (unsigned i = 0; i != Scale; ++i) {
52043 if (UndefElts[i])
52044 continue;
52045 int VecIdx = Scale * Idx + i;
52046 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
52047 }
52048
52049 if (SDValue Shuffle = combineX86ShufflesRecursively(
52050 {SrcVec}, 0, SrcVec.getOpcode(), SrcVec.getSimpleValueType(),
52051 ShuffleMask, {}, /*Depth=*/1, X86::MaxShuffleCombineDepth,
52052 /*AllowVariableCrossLaneMask=*/true,
52053 /*AllowVariablePerLaneMask=*/true,
52054 /*IsMaskedShuffle=*/false, DAG, SDLoc(SrcVec), Subtarget))
52055 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
52056 N0.getOperand(1));
52057 }
52058 }
52059
52060 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52061 return R;
52062
52063 if (SDValue R = combineAndXorSubWithBMI(N, dl, DAG, Subtarget))
52064 return R;
52065
52066 return SDValue();
52067}
52068
52069// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
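// Rewriting the second AND as ANDNP lets a single mask constant serve both
// halves of the blend and exposes the select-like structure to the
// PCMOV/VPTERNLOG lowering below.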
52070 static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
52071 SelectionDAG &DAG,
52072 const X86Subtarget &Subtarget) {
52073 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52074
52075 MVT VT = N->getSimpleValueType(0);
52076 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52077 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
52078 return SDValue();
52079
52080 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
52081 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
52082 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
52083 return SDValue();
52084
52085 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
52086 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
52087 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
52088 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
52089 return SDValue();
52090
52091 // Attempt to extract constant byte masks.
52092 APInt UndefElts0, UndefElts1;
52093 SmallVector<APInt, 32> EltBits0, EltBits1;
52094 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
52095 /*AllowWholeUndefs*/ false,
52096 /*AllowPartialUndefs*/ false))
52097 return SDValue();
52098 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
52099 /*AllowWholeUndefs*/ false,
52100 /*AllowPartialUndefs*/ false))
52101 return SDValue();
52102
52103 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
52104 // TODO - add UNDEF elts support.
52105 if (UndefElts0[i] || UndefElts1[i])
52106 return SDValue();
52107 if (EltBits0[i] != ~EltBits1[i])
52108 return SDValue();
52109 }
52110
52111 if (useVPTERNLOG(Subtarget, VT)) {
52112 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
52113 // VPTERNLOG is only available as vXi32/64-bit types.
52114 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
52115 MVT OpVT =
52116 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
52117 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
52118 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
52119 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
52120 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
52121 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
52122 DAG, Subtarget);
52123 return DAG.getBitcast(VT, Res);
52124 }
52125
52126 SDValue X = N->getOperand(0);
52127 SDValue Y =
52128 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
52129 DAG.getBitcast(VT, N1.getOperand(0)));
52130 return DAG.getNode(ISD::OR, DL, VT, X, Y);
52131}
52132
52133// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
52134// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
52135// Waiting for ANDNP combine allows other combines to happen that prevent
52136// matching.
52137static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
52138 using namespace SDPatternMatch;
52139 return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
52140 m_And(m_Deferred(Mask), m_Value(Y))));
52141}
52142
52143// Try to fold:
52144// (or (and (m, y), (pandn m, x)))
52145// into:
52146// (vselect m, x, y)
52147// As a special case, try to fold:
52148// (or (and (m, (sub 0, x)), (pandn m, x)))
52149// into:
52150// (sub (xor X, M), M)
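// The conditional-negate special case works because M is all-ones or zero per
// element: (X ^ M) - M yields -X when M is all-ones and X when M is zero.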
52151 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL,
52152 SelectionDAG &DAG,
52153 const X86Subtarget &Subtarget) {
52154 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
52155
52156 EVT VT = N->getValueType(0);
52157 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
52158 (VT.is256BitVector() && Subtarget.hasInt256())))
52159 return SDValue();
52160
52161 SDValue X, Y, Mask;
52162 if (!matchLogicBlend(N, X, Y, Mask))
52163 return SDValue();
52164
52165 // Validate that X, Y, and Mask are bitcasts, and see through them.
52166 Mask = peekThroughBitcasts(Mask);
52167 X = peekThroughBitcasts(X);
52168 Y = peekThroughBitcasts(Y);
52169
52170 EVT MaskVT = Mask.getValueType();
52171 unsigned EltBits = MaskVT.getScalarSizeInBits();
52172
52173 // TODO: Attempt to handle floating point cases as well?
52174 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
52175 return SDValue();
52176
52177 // Attempt to combine to conditional negate: (sub (xor X, M), M)
52178 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
52179 DAG, Subtarget))
52180 return Res;
52181
52182 // PBLENDVB is only available on SSE 4.1.
52183 if (!Subtarget.hasSSE41())
52184 return SDValue();
52185
52186 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
52187 if (Subtarget.hasVLX())
52188 return SDValue();
52189
52190 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
52191
52192 X = DAG.getBitcast(BlendVT, X);
52193 Y = DAG.getBitcast(BlendVT, Y);
52194 Mask = DAG.getBitcast(BlendVT, Mask);
52195 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
52196 return DAG.getBitcast(VT, Mask);
52197}
52198
52199// Helper function for combineOrCmpEqZeroToCtlzSrl
52200// Transforms:
52201// seteq(cmp x, 0)
52202// into:
52203// srl(ctlz x), log2(bitsize(x))
52204// Input pattern is checked by caller.
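// For example (illustrative): for an i32 x, ctlz(x) == 32 only when x == 0,
// so (ctlz x) >> 5 yields 1 for x == 0 and 0 otherwise, matching seteq(x, 0).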
52205 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
52206   SDValue Cmp = Op.getOperand(1);
52207 EVT VT = Cmp.getOperand(0).getValueType();
52208 unsigned Log2b = Log2_32(VT.getSizeInBits());
52209 SDLoc dl(Op);
52210 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
52211 // The result of the shift is true or false, and on X86, the 32-bit
52212 // encoding of shr and lzcnt is more desirable.
52213 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
52214 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
52215 DAG.getConstant(Log2b, dl, MVT::i8));
52216 return Scc;
52217}
52218
52219// Try to transform:
52220// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
52221// into:
52222// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
52223// Will also attempt to match more generic cases, eg:
52224// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
52225// Only applies if the target supports the FastLZCNT feature.
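// For example (illustrative): zext((x == 0) | (y == 0)) on i32 values becomes
// ((ctlz x) | (ctlz y)) >> 5, since bit 5 of a 32-bit ctlz result is set only
// for a zero input.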
52226 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
52227                                            TargetLowering::DAGCombinerInfo &DCI,
52228                                            const X86Subtarget &Subtarget) {
52229 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
52230 return SDValue();
52231
52232 auto isORCandidate = [](SDValue N) {
52233 return (N->getOpcode() == ISD::OR && N->hasOneUse());
52234 };
52235
52236   // Check that the zero extend is extending to 32 bits or more. The code generated by
52237 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
52238 // instructions to clear the upper bits.
52239 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
52240 !isORCandidate(N->getOperand(0)))
52241 return SDValue();
52242
52243 // Check the node matches: setcc(eq, cmp 0)
52244 auto isSetCCCandidate = [](SDValue N) {
52245 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
52246 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
52247 N->getOperand(1).getOpcode() == X86ISD::CMP &&
52248 isNullConstant(N->getOperand(1).getOperand(1)) &&
52249 N->getOperand(1).getValueType().bitsGE(MVT::i32);
52250 };
52251
52252 SDNode *OR = N->getOperand(0).getNode();
52253 SDValue LHS = OR->getOperand(0);
52254 SDValue RHS = OR->getOperand(1);
52255
52256 // Save nodes matching or(or, setcc(eq, cmp 0)).
52257   SmallVector<SDNode *, 2> ORNodes;
52258   while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
52259 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
52260 ORNodes.push_back(OR);
52261 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
52262 LHS = OR->getOperand(0);
52263 RHS = OR->getOperand(1);
52264 }
52265
52266 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
52267 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
52268 !isORCandidate(SDValue(OR, 0)))
52269 return SDValue();
52270
52271   // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
52272 // to
52273 // or(srl(ctlz),srl(ctlz)).
52274 // The dag combiner can then fold it into:
52275 // srl(or(ctlz, ctlz)).
52276 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
52277 SDValue Ret, NewRHS;
52278 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
52279 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
52280
52281 if (!Ret)
52282 return SDValue();
52283
52284 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
52285 while (!ORNodes.empty()) {
52286 OR = ORNodes.pop_back_val();
52287 LHS = OR->getOperand(0);
52288 RHS = OR->getOperand(1);
52289 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
52290 if (RHS->getOpcode() == ISD::OR)
52291 std::swap(LHS, RHS);
52292 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
52293 if (!NewRHS)
52294 return SDValue();
52295 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
52296 }
52297
52298 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
52299}
52300
52301/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52302/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52303/// with CMP+{ADC, SBB}.
52304/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
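// For example (illustrative): "x + (a < b)" with unsigned a and b can lower to
//   cmp a, b ; adc x, 0
// reusing the carry flag instead of materializing the setcc in a register.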
52305static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
52306 SDValue X, SDValue Y,
52307 SelectionDAG &DAG,
52308 bool ZeroSecondOpOnly = false) {
52309 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
52310 return SDValue();
52311
52312 // Look through a one-use zext.
52313 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
52314 Y = Y.getOperand(0);
52315
52316 X86::CondCode CC;
52317 SDValue EFLAGS;
52318 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
52319 CC = (X86::CondCode)Y.getConstantOperandVal(0);
52320 EFLAGS = Y.getOperand(1);
52321 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
52322 Y.hasOneUse()) {
52323 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
52324 }
52325
52326 if (!EFLAGS)
52327 return SDValue();
52328
52329 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52330 // the general case below.
52331 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
52332 if (ConstantX && !ZeroSecondOpOnly) {
52333 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
52334 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
52335 // This is a complicated way to get -1 or 0 from the carry flag:
52336 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52337 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
52338 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52339 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52340 EFLAGS);
52341 }
52342
52343 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
52344 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
52345 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
52346 EFLAGS.getValueType().isInteger() &&
52347 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52348 // Swap the operands of a SUB, and we have the same pattern as above.
52349 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
52350 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
52351 SDValue NewSub = DAG.getNode(
52352 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52353 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52354 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
52355 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52356 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52357 NewEFLAGS);
52358 }
52359 }
52360 }
52361
52362 if (CC == X86::COND_B) {
52363 // X + SETB Z --> adc X, 0
52364 // X - SETB Z --> sbb X, 0
52365 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52366 DAG.getVTList(VT, MVT::i32), X,
52367 DAG.getConstant(0, DL, VT), EFLAGS);
52368 }
52369
52370 if (ZeroSecondOpOnly)
52371 return SDValue();
52372
52373 if (CC == X86::COND_A) {
52374 // Try to convert COND_A into COND_B in an attempt to facilitate
52375 // materializing "setb reg".
52376 //
52377   // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
52378 // cannot take an immediate as its first operand.
52379 //
52380 // If EFLAGS is from a CMP that compares the same operands as the earlier
52381 // SUB producing X (i.e. CMP X, Y), we can directly use the carry flag with
52382 // SBB/ADC without creating a flipped SUB.
52383 if (EFLAGS.getOpcode() == X86ISD::CMP &&
52384 EFLAGS.getValueType().isInteger() && X == EFLAGS.getOperand(0)) {
52385 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52386 DAG.getVTList(VT, MVT::i32), X,
52387 DAG.getConstant(0, DL, VT), EFLAGS);
52388 }
52389
52390 if (EFLAGS.getOpcode() == X86ISD::SUB &&
52391 EFLAGS.getValueType().isInteger() &&
52392 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52393 // Only create NewSub if we know one of the folds will succeed to avoid
52394 // introducing a temporary node that may persist and affect one-use checks
52395 // below.
52396 if (EFLAGS.getNode()->hasOneUse()) {
52397 SDValue NewSub = DAG.getNode(
52398 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52399 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52400 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52401 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
52402 DAG.getVTList(VT, MVT::i32), X,
52403 DAG.getConstant(0, DL, VT), NewEFLAGS);
52404 }
52405
52406 if (IsSub && X == EFLAGS.getValue(0)) {
52407 SDValue NewSub = DAG.getNode(
52408 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52409 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52410 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52411 return DAG.getNode(X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32),
52412 EFLAGS.getOperand(0), EFLAGS.getOperand(1),
52413 NewEFLAGS);
52414 }
52415 }
52416 }
52417
52418 if (CC == X86::COND_AE) {
52419 // X + SETAE --> sbb X, -1
52420 // X - SETAE --> adc X, -1
52421 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52422 DAG.getVTList(VT, MVT::i32), X,
52423 DAG.getAllOnesConstant(DL, VT), EFLAGS);
52424 }
52425
52426 if (CC == X86::COND_BE) {
52427 // X + SETBE --> sbb X, -1
52428 // X - SETBE --> adc X, -1
52429 // Try to convert COND_BE into COND_AE in an attempt to facilitate
52430 // materializing "setae reg".
52431 //
52432   // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
52433 // cannot take an immediate as its first operand.
52434 //
52435 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
52436 EFLAGS.getValueType().isInteger() &&
52437 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
52438 SDValue NewSub =
52439 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
52440 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
52441 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
52442 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
52443 DAG.getVTList(VT, MVT::i32), X,
52444 DAG.getAllOnesConstant(DL, VT), NewEFLAGS);
52445 }
52446 }
52447
52448 if (CC != X86::COND_E && CC != X86::COND_NE)
52449 return SDValue();
52450
52451 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
52452 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
52453 !EFLAGS.getOperand(0).getValueType().isInteger())
52454 return SDValue();
52455
52456 SDValue Z = EFLAGS.getOperand(0);
52457 EVT ZVT = Z.getValueType();
52458
52459 // If X is -1 or 0, then we have an opportunity to avoid constants required in
52460 // the general case below.
52461 if (ConstantX) {
52462 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
52463 // fake operands:
52464 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
52465 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
52466 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
52467 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
52468 SDValue Zero = DAG.getConstant(0, DL, ZVT);
52469 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52470 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
52471 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52472 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52473 SDValue(Neg.getNode(), 1));
52474 }
52475
52476 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
52477 // with fake operands:
52478 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
52479 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
52480 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
52481 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
52482 SDValue One = DAG.getConstant(1, DL, ZVT);
52483 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52484 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52485 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
52486 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
52487 Cmp1.getValue(1));
52488 }
52489 }
52490
52491 // (cmp Z, 1) sets the carry flag if Z is 0.
52492 SDValue One = DAG.getConstant(1, DL, ZVT);
52493 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
52494 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
52495
52496 // Add the flags type for ADC/SBB nodes.
52497 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
52498
52499 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
52500 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
52501 if (CC == X86::COND_NE)
52502 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
52503 DAG.getAllOnesConstant(DL, VT), Cmp1.getValue(1));
52504
52505 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
52506 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
52507 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
52508 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
52509}
52510
52511/// If this is an add or subtract where one operand is produced by a cmp+setcc,
52512/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
52513/// with CMP+{ADC, SBB}.
52514 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
52515                                          SelectionDAG &DAG) {
52516 bool IsSub = N->getOpcode() == ISD::SUB;
52517 SDValue X = N->getOperand(0);
52518 SDValue Y = N->getOperand(1);
52519 EVT VT = N->getValueType(0);
52520
52521 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
52522 return ADCOrSBB;
52523
52524 // Commute and try again (negate the result for subtracts).
52525 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
52526 if (IsSub)
52527 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
52528 return ADCOrSBB;
52529 }
52530
52531 return SDValue();
52532}
52533
52534static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT,
52535 SDValue N0, SDValue N1,
52536 SelectionDAG &DAG) {
52537 assert((Opc == ISD::XOR || Opc == ISD::OR) && "Unexpected opcode");
52538
52539 // Delegate to combineAddOrSubToADCOrSBB if we have:
52540 //
52541 // (xor/or (zero_extend (setcc)) imm)
52542 //
52543 // where imm is odd if and only if we have xor, in which case the XOR/OR are
52544 // equivalent to a SUB/ADD, respectively.
52545 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
52546 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
52547 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
52548 bool IsSub = Opc == ISD::XOR;
52549 bool N1COdd = N1C->getZExtValue() & 1;
52550 if (IsSub ? N1COdd : !N1COdd)
52551 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
52552 return R;
52553 }
52554 }
52555
52556 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
52557 if (Opc == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
52558 N0.getOperand(0).getOpcode() == ISD::AND &&
52559       ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
52560       ISD::isBuildVectorAllOnes(N1.getNode()) &&
52561       isConstantPowerOf2(N0.getOperand(0).getOperand(1),
52562                          VT.getScalarSizeInBits(), /*AllowUndefs=*/true)) {
52563 return DAG.getNode(X86ISD::PCMPEQ, DL, VT, N0.getOperand(0),
52564 N0.getOperand(0).getOperand(1));
52565 }
52566
52567 return SDValue();
52568}
52569
52570 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
52571                          TargetLowering::DAGCombinerInfo &DCI,
52572                          const X86Subtarget &Subtarget) {
52573 SDValue N0 = N->getOperand(0);
52574 SDValue N1 = N->getOperand(1);
52575 EVT VT = N->getValueType(0);
52576 SDLoc dl(N);
52577 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52578
52579 // If this is SSE1 only convert to FOR to avoid scalarization.
52580 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52581 return DAG.getBitcast(MVT::v4i32,
52582 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
52583 DAG.getBitcast(MVT::v4f32, N0),
52584 DAG.getBitcast(MVT::v4f32, N1)));
52585 }
52586
52587 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
52588 // TODO: Support multiple SrcOps.
52589 if (VT == MVT::i1) {
52590     SmallVector<SDValue, 2> SrcOps;
52591     SmallVector<APInt, 2> SrcPartials;
52592 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
52593 SrcOps.size() == 1) {
52594 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
52595 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
52596 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
52597 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
52598 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
52599 if (Mask) {
52600 assert(SrcPartials[0].getBitWidth() == NumElts &&
52601 "Unexpected partial reduction mask");
52602 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
52603 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
52604 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
52605 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
52606 }
52607 }
52608 }
52609
52610 if (SDValue SetCC = combineAndOrForCcmpCtest(N, DAG, DCI, Subtarget))
52611 return SetCC;
52612
52613 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), dl, N0, N1, DAG))
52614 return R;
52615
52616 if (SDValue R = combineBitOpWithShift(N->getOpcode(), dl, VT, N0, N1, DAG))
52617 return R;
52618
52619 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), dl, VT, N0, N1, DAG))
52620 return R;
52621
52622 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), dl, VT, N0, N1,
52623 DAG, DCI, Subtarget))
52624 return FPLogic;
52625
52626 if (DCI.isBeforeLegalizeOps())
52627 return SDValue();
52628
52629 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
52630 return R;
52631
52632 if (SDValue R = canonicalizeBitSelect(N, dl, DAG, Subtarget))
52633 return R;
52634
52635 if (SDValue R = combineLogicBlendIntoPBLENDV(N, dl, DAG, Subtarget))
52636 return R;
52637
52638 // Combine `(x86isd::setcc_carry) | C` and `(0 - SetCC) | C`
52639 // into `(zext (not SetCC)) * (C + 1) - 1` if we can get a LEA out of it.
52640 if ((VT == MVT::i32 || VT == MVT::i64) && N0.hasOneUse()) {
52641 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
52642 uint64_t Val = CN->getZExtValue();
52643 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 ||
52644 Val == 8) {
52645 SDValue NotCond;
52646 if (N0.getOpcode() == X86ISD::SETCC_CARRY &&
52647 N0.getOperand(1).hasOneUse()) {
52648             X86::CondCode OldCC = (X86::CondCode)N0.getConstantOperandVal(0);
52649             X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52650             NotCond = getSETCC(NewCC, N0.getOperand(1), SDLoc(N0), DAG);
52651 } else if (N0.getOpcode() == ISD::SUB &&
52652 isNullConstant(N0.getOperand(0))) {
52653 SDValue Cond = N0.getOperand(1);
52654 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
52655 Cond = Cond.getOperand(0);
52656 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
52657 X86::CondCode OldCC = (X86::CondCode)Cond.getConstantOperandVal(0);
52658           X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
52659           NotCond = getSETCC(NewCC, Cond.getOperand(1), SDLoc(Cond), DAG);
52660 }
52661 }
52662
52663 if (NotCond) {
52664 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
52665 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
52666 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
52667 return R;
52668 }
52669 }
52670 }
52671 }
52672
52673 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
52674 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
52675 // iff the upper elements of the non-shifted arg are zero.
52676   // KUNPCK requires 16+ bool vector elements.
52677 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
52678 unsigned NumElts = VT.getVectorNumElements();
52679 unsigned HalfElts = NumElts / 2;
52680 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
52681 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
52682 N1.getConstantOperandAPInt(1) == HalfElts &&
52683 DAG.MaskedVectorIsZero(N0, UpperElts)) {
52684 return DAG.getNode(
52685 ISD::CONCAT_VECTORS, dl, VT,
52686 extractSubVector(N0, 0, DAG, dl, HalfElts),
52687 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
52688 }
52689 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
52690 N0.getConstantOperandAPInt(1) == HalfElts &&
52691 DAG.MaskedVectorIsZero(N1, UpperElts)) {
52692 return DAG.getNode(
52693 ISD::CONCAT_VECTORS, dl, VT,
52694 extractSubVector(N1, 0, DAG, dl, HalfElts),
52695 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
52696 }
52697 }
52698
52699 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52700 // Attempt to recursively combine an OR of shuffles.
52701 SDValue Op(N, 0);
52702 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52703 return Res;
52704
52705 // If either operand is a constant mask, then only the elements that aren't
52706 // allones are actually demanded by the other operand.
52707 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
52708 APInt UndefElts;
52709 SmallVector<APInt> EltBits;
52710 int NumElts = VT.getVectorNumElements();
52711 int EltSizeInBits = VT.getScalarSizeInBits();
52712 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
52713 return false;
52714
52715 APInt DemandedElts = APInt::getZero(NumElts);
52716 for (int I = 0; I != NumElts; ++I)
52717 if (!EltBits[I].isAllOnes())
52718 DemandedElts.setBit(I);
52719
52720 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
52721 };
52722 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
52723 if (N->getOpcode() != ISD::DELETED_NODE)
52724 DCI.AddToWorklist(N);
52725 return SDValue(N, 0);
52726 }
52727 }
52728
52729 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
52730 return R;
52731
52732 return SDValue();
52733}
52734
52735/// Try to turn tests against the signbit in the form of:
52736/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
52737/// into:
52738/// SETGT(X, -1)
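// For example (illustrative): for i32 X,
//   xor (trunc (srl X, 31)), 1
// computes "X has a clear sign bit", which is the same as setgt X, -1.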
52739 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL,
52740                                         SelectionDAG &DAG) {
52741 // This is only worth doing if the output type is i8 or i1.
52742 EVT ResultType = N->getValueType(0);
52743 if (ResultType != MVT::i8 && ResultType != MVT::i1)
52744 return SDValue();
52745
52746 SDValue N0 = N->getOperand(0);
52747 SDValue N1 = N->getOperand(1);
52748
52749 // We should be performing an xor against a truncated shift.
52750 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
52751 return SDValue();
52752
52753 // Make sure we are performing an xor against one.
52754 if (!isOneConstant(N1))
52755 return SDValue();
52756
52757 // SetCC on x86 zero extends so only act on this if it's a logical shift.
52758 SDValue Shift = N0.getOperand(0);
52759 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
52760 return SDValue();
52761
52762 // Make sure we are truncating from one of i16, i32 or i64.
52763 EVT ShiftTy = Shift.getValueType();
52764 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
52765 return SDValue();
52766
52767 // Make sure the shift amount extracts the sign bit.
52768 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
52769 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
52770 return SDValue();
52771
52772 // Create a greater-than comparison against -1.
52773   // N.B. Using SETGE against 0 works but we want a canonical-looking
52774   // comparison; using SETGT matches up with what TranslateX86CC expects.
52775 SDValue ShiftOp = Shift.getOperand(0);
52776 EVT ShiftOpTy = ShiftOp.getValueType();
52777 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52778 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
52779 *DAG.getContext(), ResultType);
52780 SDValue Cond =
52781 DAG.getSetCC(DL, SetCCResultType, ShiftOp,
52782 DAG.getAllOnesConstant(DL, ShiftOpTy), ISD::SETGT);
52783 if (SetCCResultType != ResultType)
52784 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
52785 return Cond;
52786}
52787
52788/// Turn vector tests of the signbit in the form of:
52789/// xor (sra X, elt_size(X)-1), -1
52790/// into:
52791/// pcmpgt X, -1
52792///
52793/// This should be called before type legalization because the pattern may not
52794/// persist after that.
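// For example (illustrative): for v4i32 X,
//   xor (sra X, 31), -1
// yields all-ones lanes exactly where X is non-negative, i.e. pcmpgtd X, -1.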
52795 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
52796                                          const X86Subtarget &Subtarget) {
52797 EVT VT = N->getValueType(0);
52798 if (!VT.isSimple())
52799 return SDValue();
52800
52801 switch (VT.getSimpleVT().SimpleTy) {
52802 // clang-format off
52803 default: return SDValue();
52804 case MVT::v16i8:
52805 case MVT::v8i16:
52806 case MVT::v4i32:
52807 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
52808 case MVT::v32i8:
52809 case MVT::v16i16:
52810 case MVT::v8i32:
52811 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
52812 // clang-format on
52813 }
52814
52815 // There must be a shift right algebraic before the xor, and the xor must be a
52816 // 'not' operation.
52817 SDValue Shift = N->getOperand(0);
52818 SDValue Ones = N->getOperand(1);
52819 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
52820       !ISD::isBuildVectorAllOnes(Ones.getNode()))
52821     return SDValue();
52822
52823 // The shift should be smearing the sign bit across each vector element.
52824 auto *ShiftAmt =
52825 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
52826 if (!ShiftAmt ||
52827 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
52828 return SDValue();
52829
52830 // Create a greater-than comparison against -1. We don't use the more obvious
52831 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
52832 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
52833}
52834
52835/// Detect patterns of truncation with unsigned saturation:
52836///
52837/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
52838/// Return the source value x to be truncated or SDValue() if the pattern was
52839/// not matched.
52840///
52841/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
52842/// where C1 >= 0 and C2 is unsigned max of destination type.
52843///
52844/// (truncate (smax (smin (x, C2), C1)) to dest_type)
52845/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
52846///
52847/// These two patterns are equivalent to:
52848/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
52849/// So return the smax(x, C1) value to be truncated or SDValue() if the
52850/// pattern was not matched.
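// For example (illustrative): for an i32 -> i8 truncation,
//   trunc (umin x, 255)
// saturates to [0, 255], so x is returned as the value to truncate with
// unsigned saturation.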
52851 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
52852                                  const SDLoc &DL) {
52853 using namespace llvm::SDPatternMatch;
52854 EVT InVT = In.getValueType();
52855
52856 // Saturation with truncation. We truncate from InVT to VT.
52857   assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
52858          "Unexpected types for truncate operation");
52859
52860 APInt C1, C2;
52861   SDValue UMin, SMin, SMax;
52862 
52863   // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
52864 // the element size of the destination type.
52865 if (sd_match(In, m_UMin(m_Value(UMin), m_ConstInt(C2))) &&
52866 C2.isMask(VT.getScalarSizeInBits()))
52867 return UMin;
52868
52869 if (sd_match(In, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52870       sd_match(SMin, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52871       C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
52872 return SMin;
52873
52874 if (sd_match(In, m_SMax(m_Value(SMax), m_ConstInt(C1))) &&
52875       sd_match(SMax, m_SMin(m_Value(SMin), m_ConstInt(C2))) &&
52876       C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && C2.uge(C1))
52877 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
52878
52879 return SDValue();
52880}
52881
52882/// Detect patterns of truncation with signed saturation:
52883/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
52884/// signed_max_of_dest_type)) to dest_type)
52885/// or:
52886/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
52887/// signed_min_of_dest_type)) to dest_type).
52888/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
52889/// Return the source value to be truncated or SDValue() if the pattern was not
52890/// matched.
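// For example (illustrative): for an i16 -> i8 truncation,
//   trunc (smin (smax x, -128), 127)
// clamps to [-128, 127], the signed i8 range, so x is returned for a
// signed-saturating truncate (e.g. PACKSSWB).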
52891static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
52892 using namespace llvm::SDPatternMatch;
52893 unsigned NumDstBits = VT.getScalarSizeInBits();
52894 unsigned NumSrcBits = In.getScalarValueSizeInBits();
52895 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
52896
52897 APInt SignedMax, SignedMin;
52898 if (MatchPackUS) {
52899 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
52900 SignedMin = APInt::getZero(NumSrcBits);
52901 } else {
52902 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
52903 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
52904 }
52905
52906 SDValue SMin, SMax;
52907 if (sd_match(In, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))) &&
52908 sd_match(SMin, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))))
52909 return SMax;
52910
52911 if (sd_match(In, m_SMax(m_Value(SMax), m_SpecificInt(SignedMin))) &&
52912 sd_match(SMax, m_SMin(m_Value(SMin), m_SpecificInt(SignedMax))))
52913 return SMin;
52914
52915 return SDValue();
52916}
52917
52918 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
52919                                       SelectionDAG &DAG,
52920 const X86Subtarget &Subtarget) {
52921 if (!Subtarget.hasSSE2() || !VT.isVector())
52922 return SDValue();
52923
52924 EVT SVT = VT.getVectorElementType();
52925 EVT InVT = In.getValueType();
52926 EVT InSVT = InVT.getVectorElementType();
52927
52928 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
52929   // split across two registers, we can use a packusdw+perm to clamp to 0-65535
52930 // and concatenate at the same time. Then we can use a final vpmovuswb to
52931 // clip to 0-255.
52932 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
52933 InVT == MVT::v16i32 && VT == MVT::v16i8) {
52934 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52935 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
52936 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
52937 DL, DAG, Subtarget);
52938 assert(Mid && "Failed to pack!");
52939 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
52940 }
52941 }
52942
52943 // vXi32 truncate instructions are available with AVX512F.
52944 // vXi16 truncate instructions are only available with AVX512BW.
52945 // For 256-bit or smaller vectors, we require VLX.
52946 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
52947   // If the result type is 256 bits or larger and we have disabled 512-bit
52948 // registers, we should go ahead and use the pack instructions if possible.
52949 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
52950 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
52951 (InVT.getSizeInBits() > 128) &&
52952 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
52953 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
52954
52955 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
52956       isPowerOf2_32(VT.getVectorNumElements()) &&
52957       (SVT == MVT::i8 || SVT == MVT::i16) &&
52958 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
52959 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
52960 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
52961 if (SVT == MVT::i8 && InSVT == MVT::i32) {
52962 EVT MidVT = VT.changeVectorElementType(MVT::i16);
52963 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
52964 DAG, Subtarget);
52965 assert(Mid && "Failed to pack!");
52966         SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
52967                                            Subtarget);
52968 assert(V && "Failed to pack!");
52969 return V;
52970 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
52971 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
52972 Subtarget);
52973 }
52974 if (SDValue SSatVal = detectSSatPattern(In, VT))
52975 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
52976 Subtarget);
52977 }
52978
52979 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52980 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
52981 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
52982 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
52983 unsigned TruncOpc = 0;
52984 SDValue SatVal;
52985 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
52986 SatVal = SSatVal;
52987 TruncOpc = X86ISD::VTRUNCS;
52988 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
52989 SatVal = USatVal;
52990 TruncOpc = X86ISD::VTRUNCUS;
52991 }
52992 if (SatVal) {
52993 unsigned ResElts = VT.getVectorNumElements();
52994 // If the input type is less than 512 bits and we don't have VLX, we need
52995 // to widen to 512 bits.
52996 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
52997 unsigned NumConcats = 512 / InVT.getSizeInBits();
52998 ResElts *= NumConcats;
52999 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
53000 ConcatOps[0] = SatVal;
53001 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
53002 NumConcats * InVT.getVectorNumElements());
53003 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
53004 }
53005       // Widen the result if it's narrower than 128 bits.
53006 if (ResElts * SVT.getSizeInBits() < 128)
53007 ResElts = 128 / SVT.getSizeInBits();
53008 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
53009 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
53010 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
53011 DAG.getVectorIdxConstant(0, DL));
53012 }
53013 }
53014
53015 return SDValue();
53016}
53017
53018 static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl,
53019                                         SelectionDAG &DAG,
53020                                         TargetLowering::DAGCombinerInfo &DCI,
53021                                         const X86Subtarget &Subtarget) {
53022 auto *Ld = cast<LoadSDNode>(N);
53023 EVT RegVT = Ld->getValueType(0);
53024 SDValue Ptr = Ld->getBasePtr();
53025 SDValue Chain = Ld->getChain();
53026 ISD::LoadExtType Ext = Ld->getExtensionType();
53027
53028 if (Ext != ISD::NON_EXTLOAD || !Subtarget.hasAVX() || !Ld->isSimple())
53029 return SDValue();
53030
53031 if (!(RegVT.is128BitVector() || RegVT.is256BitVector()))
53032 return SDValue();
53033
53034   const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
53035   if (!LdC)
53036 return SDValue();
53037
53038 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
53039 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
53040 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
53041 if (Undefs[I])
53042 continue;
53043 if (UserUndefs[I] || Bits[I] != UserBits[I])
53044 return false;
53045 }
53046 return true;
53047 };
53048
53049 // Look through all other loads/broadcasts in the chain for another constant
53050 // pool entry.
53051 for (SDNode *User : Chain->users()) {
53052 auto *UserLd = dyn_cast<MemSDNode>(User);
53053 if (User != N && UserLd &&
53054 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
53055 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
53056          ISD::isNormalLoad(User)) &&
53057         UserLd->getChain() == Chain && User->hasAnyUseOfValue(0) &&
53058 User->getValueSizeInBits(0).getFixedValue() >
53059 RegVT.getFixedSizeInBits()) {
53060 EVT UserVT = User->getValueType(0);
53061 SDValue UserPtr = UserLd->getBasePtr();
53062 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
53063
53064 // See if we are loading a constant that matches in the lower
53065 // bits of a longer constant (but from a different constant pool ptr).
53066 if (UserC && UserPtr != Ptr) {
53067 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
53068 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
53069 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
53070 APInt Undefs, UserUndefs;
53071 SmallVector<APInt> Bits, UserBits;
53072 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
53073 UserVT.getScalarSizeInBits());
53074 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
53075 Bits) &&
53076                 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
53077                                               UserUndefs, UserBits)) {
53078 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
53080 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53081 RegVT.getSizeInBits());
53082 Extract = DAG.getBitcast(RegVT, Extract);
53083 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53084 }
53085 }
53086 }
53087 }
53088 }
53089 }
53090
53091 return SDValue();
53092}
53093
53094 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
53095                            TargetLowering::DAGCombinerInfo &DCI,
53096                            const X86Subtarget &Subtarget) {
53097 auto *Ld = cast<LoadSDNode>(N);
53098 EVT RegVT = Ld->getValueType(0);
53099 EVT MemVT = Ld->getMemoryVT();
53100 SDLoc dl(Ld);
53101 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53102
53103 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
53104 // into two 16-byte operations. Also split non-temporal aligned loads on
53105 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
53106 ISD::LoadExtType Ext = Ld->getExtensionType();
53107 unsigned Fast;
53108 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
53109 Ext == ISD::NON_EXTLOAD &&
53110 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
53111 Ld->getAlign() >= Align(16)) ||
53112 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
53113 *Ld->getMemOperand(), &Fast) &&
53114 !Fast))) {
53115 unsigned NumElems = RegVT.getVectorNumElements();
53116 if (NumElems < 2)
53117 return SDValue();
53118
53119 unsigned HalfOffset = 16;
53120 SDValue Ptr1 = Ld->getBasePtr();
53121 SDValue Ptr2 =
53122 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
53123 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
53124 NumElems / 2);
53125 SDValue Load1 =
53126 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
53127 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53128 SDValue Load2 =
53129 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
53130 Ld->getPointerInfo().getWithOffset(HalfOffset),
53131 Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
53132 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
53133 Load1.getValue(1), Load2.getValue(1));
53134
53135 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
53136 return DCI.CombineTo(N, NewVec, TF, true);
53137 }
53138
53139 // Bool vector load - attempt to cast to an integer, as we have good
53140 // (vXiY *ext(vXi1 bitcast(iX))) handling.
53141 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
53142 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
53143 unsigned NumElts = RegVT.getVectorNumElements();
53144 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
53145 if (TLI.isTypeLegal(IntVT)) {
53146 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
53147 Ld->getPointerInfo(), Ld->getBaseAlign(),
53148 Ld->getMemOperand()->getFlags());
53149 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
53150 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
53151 }
53152 }
53153
53154 // If we also broadcast this vector to a wider type, then just extract the
53155 // lowest subvector.
53156 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
53157 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
53158 SDValue Ptr = Ld->getBasePtr();
53159 SDValue Chain = Ld->getChain();
53160 for (SDNode *User : Chain->users()) {
53161 auto *UserLd = dyn_cast<MemSDNode>(User);
53162 if (User != N && UserLd &&
53163 User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
53164 UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
53165 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
53166 User->hasAnyUseOfValue(0) &&
53167 User->getValueSizeInBits(0).getFixedValue() >
53168 RegVT.getFixedSizeInBits()) {
53170 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
53171 RegVT.getSizeInBits());
53172 Extract = DAG.getBitcast(RegVT, Extract);
53173 return DCI.CombineTo(N, Extract, SDValue(User, 1));
53174 }
53175 }
53176 }
53177
53178 if (SDValue V = combineConstantPoolLoads(Ld, dl, DAG, DCI, Subtarget))
53179 return V;
53180
53181 // Cast ptr32 and ptr64 pointers to the default address space before a load.
53182 unsigned AddrSpace = Ld->getAddressSpace();
53183 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53184 AddrSpace == X86AS::PTR32_UPTR) {
53185 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53186 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
53187 SDValue Cast =
53188 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
53189 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
53190 Ld->getPointerInfo(), MemVT, Ld->getBaseAlign(),
53191 Ld->getMemOperand()->getFlags());
53192 }
53193 }
53194
53195 return SDValue();
53196}
53197
53198/// If V is a build vector of boolean constants and exactly one of those
53199/// constants is true, return the operand index of that true element.
53200/// Otherwise, return -1.
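// For example (illustrative): the mask <0,0,1,0> yields index 2, while
// <0,1,1,0> (two ones) and <0,0,0,0> (no ones) both yield -1.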
53201static int getOneTrueElt(SDValue V) {
53202 // This needs to be a build vector of booleans.
53203 // TODO: Checking for the i1 type matches the IR definition for the mask,
53204 // but the mask check could be loosened to i8 or other types. That might
53205 // also require checking more than 'allOnesValue'; eg, the x86 HW
53206 // instructions only require that the MSB is set for each mask element.
53207 // The ISD::MSTORE comments/definition do not specify how the mask operand
53208 // is formatted.
53209 auto *BV = dyn_cast<BuildVectorSDNode>(V);
53210 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
53211 return -1;
53212
53213 int TrueIndex = -1;
53214 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
53215 for (unsigned i = 0; i < NumElts; ++i) {
53216 const SDValue &Op = BV->getOperand(i);
53217 if (Op.isUndef())
53218 continue;
53219 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
53220 if (!ConstNode)
53221 return -1;
53222 if (ConstNode->getAPIntValue().countr_one() >= 1) {
53223 // If we already found a one, this is too many.
53224 if (TrueIndex >= 0)
53225 return -1;
53226 TrueIndex = i;
53227 }
53228 }
53229 return TrueIndex;
53230}
53231
53232/// Given a masked memory load/store operation, return true if it has one mask
53233/// bit set. If it has one mask bit set, then also return the memory address of
53234/// the scalar element to load/store, the vector index to insert/extract that
53235/// scalar element, and the alignment for the scalar memory access.
53236 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
53237                                          SelectionDAG &DAG, SDValue &Addr,
53238 SDValue &Index, Align &Alignment,
53239 unsigned &Offset) {
53240 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
53241 if (TrueMaskElt < 0)
53242 return false;
53243
53244 // Get the address of the one scalar element that is specified by the mask
53245 // using the appropriate offset from the base pointer.
53246 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
53247 Offset = 0;
53248 Addr = MaskedOp->getBasePtr();
53249 if (TrueMaskElt != 0) {
53250 Offset = TrueMaskElt * EltVT.getStoreSize();
53251     Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
53252                                     SDLoc(MaskedOp));
53253 }
53254
53255 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
53256 Alignment = commonAlignment(MaskedOp->getBaseAlign(), EltVT.getStoreSize());
53257 return true;
53258}
53259
53260/// If exactly one element of the mask is set for a non-extending masked load,
53261/// it is a scalar load and vector insert.
53262/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53263/// mask have already been optimized in IR, so we don't bother with those here.
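// For example (illustrative): a masked load of <4 x i32> with mask <0,0,1,0>
// becomes a scalar i32 load at byte offset 8 that is inserted into the
// pass-through vector at element index 2.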
53264static SDValue
53265 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53266                              TargetLowering::DAGCombinerInfo &DCI,
53267                              const X86Subtarget &Subtarget) {
53268 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53269 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53270 // However, some target hooks may need to be added to know when the transform
53271 // is profitable. Endianness would also have to be considered.
53272
53273 SDValue Addr, VecIndex;
53274 Align Alignment;
53275 unsigned Offset;
53276 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
53277 return SDValue();
53278
53279 // Load the one scalar element that is specified by the mask using the
53280 // appropriate offset from the base pointer.
53281 SDLoc DL(ML);
53282 EVT VT = ML->getValueType(0);
53283 EVT EltVT = VT.getVectorElementType();
53284
53285 EVT CastVT = VT;
53286 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53287 EltVT = MVT::f64;
53288 CastVT = VT.changeVectorElementType(EltVT);
53289 }
53290
53291 SDValue Load =
53292 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
53293 ML->getPointerInfo().getWithOffset(Offset),
53294 Alignment, ML->getMemOperand()->getFlags());
53295
53296 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
53297
53298 // Insert the loaded element into the appropriate place in the vector.
53299 SDValue Insert =
53300 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
53301 Insert = DAG.getBitcast(VT, Insert);
53302 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
53303}
53304
53305static SDValue
53306 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
53307                               TargetLowering::DAGCombinerInfo &DCI) {
53308   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
53309 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
53310 return SDValue();
53311
53312 SDLoc DL(ML);
53313 EVT VT = ML->getValueType(0);
53314
53315 // If we are loading the first and last elements of a vector, it is safe and
53316 // always faster to load the whole vector. Replace the masked load with a
53317 // vector load and select.
53318 unsigned NumElts = VT.getVectorNumElements();
53319 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
53320 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
53321 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
53322 if (LoadFirstElt && LoadLastElt) {
53323 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
53324 ML->getMemOperand());
53325 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
53326 ML->getPassThru());
53327 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
53328 }
53329
53330 // Convert a masked load with a constant mask into a masked load and a select.
53331 // This allows the select operation to use a faster kind of select instruction
53332 // (for example, vblendvps -> vblendps).
53333
53334 // Don't try this if the pass-through operand is already undefined. That would
53335 // cause an infinite loop because that's what we're about to create.
53336 if (ML->getPassThru().isUndef())
53337 return SDValue();
53338
53339 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
53340 return SDValue();
53341
53342 // The new masked load has an undef pass-through operand. The select uses the
53343 // original pass-through operand.
53344 SDValue NewML = DAG.getMaskedLoad(
53345 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
53346 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
53347 ML->getAddressingMode(), ML->getExtensionType());
53348 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
53349 ML->getPassThru());
53350
53351 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
53352}
53353
53354 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
53355                                  TargetLowering::DAGCombinerInfo &DCI,
53356                                  const X86Subtarget &Subtarget) {
53357 auto *Mld = cast<MaskedLoadSDNode>(N);
53358
53359 // TODO: Expanding load with constant mask may be optimized as well.
53360 if (Mld->isExpandingLoad())
53361 return SDValue();
53362
53363 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
53364 if (SDValue ScalarLoad =
53365 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
53366 return ScalarLoad;
53367
53368 // TODO: Do some AVX512 subsets benefit from this transform?
53369 if (!Subtarget.hasAVX512())
53370 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
53371 return Blend;
53372 }
53373
53374 // If the mask value has been legalized to a non-boolean vector, try to
53375 // simplify ops leading up to it. We only demand the MSB of each lane.
53376 SDValue Mask = Mld->getMask();
53377 if (Mask.getScalarValueSizeInBits() != 1) {
53378 EVT VT = Mld->getValueType(0);
53379 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53380     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53381     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53382 if (N->getOpcode() != ISD::DELETED_NODE)
53383 DCI.AddToWorklist(N);
53384 return SDValue(N, 0);
53385 }
53386 if (SDValue NewMask =
53387             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53388       return DAG.getMaskedLoad(
53389 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
53390 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
53391 Mld->getAddressingMode(), Mld->getExtensionType());
53392 }
53393
53394 return SDValue();
53395}
53396
53397/// If exactly one element of the mask is set for a non-truncating masked store,
53398/// it is a vector extract and scalar store.
53399/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
53400/// mask have already been optimized in IR, so we don't bother with those here.
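// For example (illustrative): a masked store of <4 x float> with mask
// <0,1,0,0> becomes an extract of element 1 and a scalar f32 store at byte
// offset 4.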
53401 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
53402                                               SelectionDAG &DAG,
53403 const X86Subtarget &Subtarget) {
53404 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
53405 // However, some target hooks may need to be added to know when the transform
53406 // is profitable. Endianness would also have to be considered.
53407
53408 SDValue Addr, VecIndex;
53409 Align Alignment;
53410 unsigned Offset;
53411 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
53412 return SDValue();
53413
53414 // Extract the one scalar element that is actually being stored.
53415 SDLoc DL(MS);
53416 SDValue Value = MS->getValue();
53417 EVT VT = Value.getValueType();
53418 EVT EltVT = VT.getVectorElementType();
53419 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
53420 EltVT = MVT::f64;
53421 EVT CastVT = VT.changeVectorElementType(EltVT);
53422 Value = DAG.getBitcast(CastVT, Value);
53423 }
53424 SDValue Extract =
53425 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
53426
53427 // Store that element at the appropriate offset from the base pointer.
53428 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
53429                       MS->getPointerInfo().getWithOffset(Offset),
53430                       Alignment, MS->getMemOperand()->getFlags());
53431}
53432
53433 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
53434                                   TargetLowering::DAGCombinerInfo &DCI,
53435                                   const X86Subtarget &Subtarget) {
53436   auto *Mst = cast<MaskedStoreSDNode>(N);
53437   if (Mst->isCompressingStore())
53438 return SDValue();
53439
53440 EVT VT = Mst->getValue().getValueType();
53441 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53442
53443 if (Mst->isTruncatingStore())
53444 return SDValue();
53445
53446 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
53447 return ScalarStore;
53448
53449 // If the mask value has been legalized to a non-boolean vector, try to
53450 // simplify ops leading up to it. We only demand the MSB of each lane.
53451 SDValue Mask = Mst->getMask();
53452 if (Mask.getScalarValueSizeInBits() != 1) {
53453     APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
53454     if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
53455 if (N->getOpcode() != ISD::DELETED_NODE)
53456 DCI.AddToWorklist(N);
53457 return SDValue(N, 0);
53458 }
53459 if (SDValue NewMask =
53460             TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
53461       return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
53462 Mst->getBasePtr(), Mst->getOffset(), NewMask,
53463 Mst->getMemoryVT(), Mst->getMemOperand(),
53464 Mst->getAddressingMode());
53465 }
53466
53467 SDValue Value = Mst->getValue();
53468 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
53469 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
53470 Mst->getMemoryVT())) {
53471 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
53472 Mst->getBasePtr(), Mst->getOffset(), Mask,
53473 Mst->getMemoryVT(), Mst->getMemOperand(),
53474 Mst->getAddressingMode(), true);
53475 }
53476
53477 return SDValue();
53478}
53479
53480 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
53481                             TargetLowering::DAGCombinerInfo &DCI,
53482                             const X86Subtarget &Subtarget) {
53483   StoreSDNode *St = cast<StoreSDNode>(N);
53484   EVT StVT = St->getMemoryVT();
53485 SDLoc dl(St);
53486 SDValue StoredVal = St->getValue();
53487 EVT VT = StoredVal.getValueType();
53488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53489
53490 // Convert a store of vXi1 into a store of iX and a bitcast.
53491 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
53492 VT.getVectorElementType() == MVT::i1) {
53493
53494     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
53495     StoredVal = DAG.getBitcast(NewVT, StoredVal);
53496
53497 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53498 St->getPointerInfo(), St->getBaseAlign(),
53499 St->getMemOperand()->getFlags());
53500 }
53501
53502 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
53503 // This will avoid a copy to k-register.
53504 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
53505 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
53506 StoredVal.getOperand(0).getValueType() == MVT::i8) {
53507 SDValue Val = StoredVal.getOperand(0);
53508 // We must store zeros to the unused bits.
53509 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
53510 return DAG.getStore(St->getChain(), dl, Val, St->getBasePtr(),
53511 St->getPointerInfo(), St->getBaseAlign(),
53512 St->getMemOperand()->getFlags());
53513 }
53514
53515 // Widen v2i1/v4i1 stores to v8i1.
53516 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
53517 Subtarget.hasAVX512()) {
53518 unsigned NumConcats = 8 / VT.getVectorNumElements();
53519 // We must store zeros to the unused bits.
53520 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
53521 Ops[0] = StoredVal;
53522 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
53523 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53524 St->getPointerInfo(), St->getBaseAlign(),
53525 St->getMemOperand()->getFlags());
53526 }
53527
53528 // Turn vXi1 stores of constants into a scalar store.
53529 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
53530 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
53531       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
53532     // If it's a v64i1 store without 64-bit support, we need two stores.
53533 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
53534 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
53535 StoredVal->ops().slice(0, 32));
53536       Lo = combinevXi1ConstantToInteger(Lo, DAG);
53537       SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
53538 StoredVal->ops().slice(32, 32));
53539       Hi = combinevXi1ConstantToInteger(Hi, DAG);
53540 
53541 SDValue Ptr0 = St->getBasePtr();
53542 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
53543
53544 SDValue Ch0 =
53545 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
53546 St->getBaseAlign(), St->getMemOperand()->getFlags());
53547 SDValue Ch1 = DAG.getStore(
53548 St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4),
53549 St->getBaseAlign(), St->getMemOperand()->getFlags());
53550 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
53551 }
53552
53553 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
53554 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
53555 St->getPointerInfo(), St->getBaseAlign(),
53556 St->getMemOperand()->getFlags());
53557 }
53558
53559 // Convert scalar fabs/fneg load-store to integer equivalents.
53560 if ((VT == MVT::f16 || VT == MVT::bf16 || VT == MVT::f32 || VT == MVT::f64) &&
53561 (StoredVal.getOpcode() == ISD::FABS ||
53562 StoredVal.getOpcode() == ISD::FNEG) &&
53563 ISD::isNormalLoad(StoredVal.getOperand(0).getNode()) &&
53564 StoredVal.hasOneUse() && StoredVal.getOperand(0).hasOneUse()) {
53565 MVT IntVT = VT.getSimpleVT().changeTypeToInteger();
53566 if (TLI.isTypeLegal(IntVT)) {
53567       APInt SignMask = APInt::getSignMask(VT.getScalarSizeInBits());
53568       unsigned SignOp = ISD::XOR;
53569 if (StoredVal.getOpcode() == ISD::FABS) {
53570 SignMask = ~SignMask;
53571 SignOp = ISD::AND;
53572 }
53573 SDValue LogicOp = DAG.getNode(
53574 SignOp, dl, IntVT, DAG.getBitcast(IntVT, StoredVal.getOperand(0)),
53575 DAG.getConstant(SignMask, dl, IntVT));
53576 return DAG.getStore(St->getChain(), dl, LogicOp, St->getBasePtr(),
53577 St->getPointerInfo(), St->getBaseAlign(),
53578 St->getMemOperand()->getFlags());
53579 }
53580 }
53581
53582 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
53583 // Sandy Bridge, perform two 16-byte stores.
53584 unsigned Fast;
53585 if (VT.is256BitVector() && StVT == VT &&
53586 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
53587 *St->getMemOperand(), &Fast) &&
53588 !Fast) {
53589 unsigned NumElems = VT.getVectorNumElements();
53590 if (NumElems < 2)
53591 return SDValue();
53592
53593 return splitVectorStore(St, DAG);
53594 }
53595
53596 // Split under-aligned vector non-temporal stores.
53597 if (St->isNonTemporal() && StVT == VT &&
53598 St->getAlign().value() < VT.getStoreSize()) {
53599 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
53600 // vectors or the legalizer can scalarize it to use MOVNTI.
53601 if (VT.is256BitVector() || VT.is512BitVector()) {
53602 unsigned NumElems = VT.getVectorNumElements();
53603 if (NumElems < 2)
53604 return SDValue();
53605 return splitVectorStore(St, DAG);
53606 }
53607
53608 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
53609 // to use MOVNTI.
53610 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
53611 MVT NTVT = Subtarget.hasSSE4A()
53612 ? MVT::v2f64
53613 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
53614 return scalarizeVectorStore(St, NTVT, DAG);
53615 }
53616 }
53617
53618 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
53619 // supported but AVX512F is, by extending to v16i32 and truncating.
53620 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
53621 St->getValue().getOpcode() == ISD::TRUNCATE &&
53622 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
53623 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
53624 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
53625 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
53626 St->getValue().getOperand(0));
53627 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
53628 MVT::v16i8, St->getMemOperand());
53629 }
53630
53631 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
53632 if (!St->isTruncatingStore() &&
53633 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
53634 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
53635 StoredVal.hasOneUse() &&
53636 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
53637 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
53638 return EmitTruncSStore(IsSigned, St->getChain(),
53639 dl, StoredVal.getOperand(0), St->getBasePtr(),
53640 VT, St->getMemOperand(), DAG);
53641 }
53642
53643 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
53644 if (!St->isTruncatingStore()) {
53645 auto IsExtractedElement = [](SDValue V) {
53646 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
53647 V = V.getOperand(0);
53648 unsigned Opc = V.getOpcode();
53650 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
53651 V.getOperand(0).hasOneUse())
53652 return V.getOperand(0);
53653 return SDValue();
53654 };
53655 if (SDValue Extract = IsExtractedElement(StoredVal)) {
53656 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
53657 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
53658 SDValue Src = Trunc.getOperand(0);
53659 MVT DstVT = Trunc.getSimpleValueType();
53660 MVT SrcVT = Src.getSimpleValueType();
53661 unsigned NumSrcElts = SrcVT.getVectorNumElements();
53662 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
53663 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
53664 if (NumTruncBits == VT.getSizeInBits() &&
53665 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
53666 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
53667 TruncVT, St->getMemOperand());
53668 }
53669 }
53670 }
53671 }
53672
53673 // Optimize trunc store (of multiple scalars) to shuffle and store.
53674 // First, pack all of the elements in one place. Next, store to memory
53675 // in fewer chunks.
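// For example (illustrative): a vXi32->vXi8 truncating store whose value is
// already clamped to [-128,127] or [0,255] can be emitted as a saturating
// truncate-store (e.g. VPMOVSDB / VPMOVUSDB on AVX512).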
53676 if (St->isTruncatingStore() && VT.isVector()) {
53677 if (TLI.isTruncStoreLegal(VT, StVT)) {
53678 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
53679 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
53680 dl, Val, St->getBasePtr(),
53681 St->getMemoryVT(), St->getMemOperand(), DAG);
53682 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
53683 DAG, dl))
53684 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
53685 dl, Val, St->getBasePtr(),
53686 St->getMemoryVT(), St->getMemOperand(), DAG);
53687 }
53688
53689 return SDValue();
53690 }
53691
53692 // Cast ptr32 and ptr64 pointers to the default address space before a store.
53693 unsigned AddrSpace = St->getAddressSpace();
53694 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
53695 AddrSpace == X86AS::PTR32_UPTR) {
53696 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53697 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
53698 SDValue Cast =
53699 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
53700 return DAG.getTruncStore(
53701 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
53702 St->getBaseAlign(), St->getMemOperand()->getFlags(), St->getAAInfo());
53703 }
53704 }
53705
53706 // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
53707 // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
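// (X86ISD::CSTORE maps to the conditional store form of the APX CF/CFCMOV
// instructions: the value is written only when CC holds, so the
// load+cmov+store sequence collapses without needing a branch.)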
53708 if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
53709 Subtarget.hasCF() && St->isSimple()) {
53710 SDValue Cmov;
53711 if (StoredVal.getOpcode() == X86ISD::CMOV)
53712 Cmov = StoredVal;
53713 else if (StoredVal.getOpcode() == ISD::TRUNCATE &&
53714 StoredVal.getOperand(0).getOpcode() == X86ISD::CMOV)
53715 Cmov = StoredVal.getOperand(0);
53716 else
53717 return SDValue();
53718
53719 auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53720 if (!Ld || !Ld->isSimple() || Ld->getBasePtr() != St->getBasePtr())
53721 return SDValue();
53722
53723 bool InvertCC = false;
53724 SDValue V = SDValue(Ld, 0);
53725 if (V == Cmov.getOperand(1))
53726 InvertCC = true;
53727 else if (V != Cmov.getOperand(0))
53728 return SDValue();
53729
53730 SDVTList Tys = DAG.getVTList(MVT::Other);
53731 SDValue CC = Cmov.getOperand(2);
53732 SDValue Src = DAG.getAnyExtOrTrunc(Cmov.getOperand(!InvertCC), dl, VT);
53733 if (InvertCC)
53734 CC = DAG.getTargetConstant(
53737 dl, MVT::i8);
53738 SDValue Ops[] = {St->getChain(), Src, St->getBasePtr(), CC,
53739 Cmov.getOperand(3)};
53740 return DAG.getMemIntrinsicNode(X86ISD::CSTORE, dl, Tys, Ops, VT,
53741 St->getMemOperand());
53742 }
53743
53744 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
53745 // the FP state in cases where an emms may be missing.
53746 // A preferable solution to the general problem is to figure out the right
53747 // places to insert EMMS. This qualifies as a quick hack.
53748
53749 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
53750 if (VT.getSizeInBits() != 64)
53751 return SDValue();
53752
53753 const Function &F = DAG.getMachineFunction().getFunction();
53754 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
53755 bool F64IsLegal =
53756 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
53757
53758 if (!F64IsLegal || Subtarget.is64Bit())
53759 return SDValue();
53760
53761 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
53762 cast<LoadSDNode>(St->getValue())->isSimple() &&
53763 St->getChain().hasOneUse() && St->isSimple()) {
53764 auto *Ld = cast<LoadSDNode>(St->getValue());
53765
53766 if (!ISD::isNormalLoad(Ld))
53767 return SDValue();
53768
53769 // Avoid the transformation if there are multiple uses of the loaded value.
53770 if (!Ld->hasNUsesOfValue(1, 0))
53771 return SDValue();
53772
53773 SDLoc LdDL(Ld);
53774 SDLoc StDL(N);
53775
53776 // Remove any range metadata as we're converting to f64 load/store.
53777 Ld->getMemOperand()->clearRanges();
53778
53779 // Lower to a single movq load/store pair.
53780 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
53781 Ld->getBasePtr(), Ld->getMemOperand());
53782
53783 // Make sure new load is placed in same chain order.
53784 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
53785 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
53786 St->getMemOperand());
53787 }
53788
53789 // This is similar to the above case, but here we handle a scalar 64-bit
53790 // integer store that is extracted from a vector on a 32-bit target.
53791 // If we have SSE2, then we can treat it like a floating-point double
53792 // to get past legalization. The execution dependencies fixup pass will
53793 // choose the optimal machine instruction for the store if this really is
53794 // an integer or v2f32 rather than an f64.
53795 if (VT == MVT::i64 &&
53796 St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53797 SDValue OldExtract = St->getOperand(1);
53798 SDValue ExtOp0 = OldExtract.getOperand(0);
53799 unsigned VecSize = ExtOp0.getValueSizeInBits();
53800 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
53801 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
53802 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
53803 BitCast, OldExtract.getOperand(1));
53804 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
53805 St->getPointerInfo(), St->getBaseAlign(),
53806 St->getMemOperand()->getFlags());
53807 }
53808
53809 return SDValue();
53810}
53811
53812static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
53813 TargetLowering::DAGCombinerInfo &DCI,
53814 const X86Subtarget &Subtarget) {
53815 auto *St = cast<MemIntrinsicSDNode>(N);
53816
53817 SDValue StoredVal = N->getOperand(1);
53818 MVT VT = StoredVal.getSimpleValueType();
53819 EVT MemVT = St->getMemoryVT();
53820
53821 // Figure out which elements we demand.
53822 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
53823 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
53824
53825 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53826 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
53827 if (N->getOpcode() != ISD::DELETED_NODE)
53828 DCI.AddToWorklist(N);
53829 return SDValue(N, 0);
53830 }
53831
53832 return SDValue();
53833}
53834
53835/// Return 'true' if this vector operation is "horizontal"
53836/// and return the operands for the horizontal operation in LHS and RHS. A
53837/// horizontal operation performs the binary operation on successive elements
53838/// of its first operand, then on successive elements of its second operand,
53839/// returning the resulting values in a vector. For example, if
53840/// A = < float a0, float a1, float a2, float a3 >
53841/// and
53842/// B = < float b0, float b1, float b2, float b3 >
53843/// then the result of doing a horizontal operation on A and B is
53844/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
53845/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
53846/// A horizontal-op B, for some already available A and B, and if so then LHS is
53847/// set to A, RHS to B, and the routine returns 'true'.
53848static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
53849 SelectionDAG &DAG, const X86Subtarget &Subtarget,
53850 bool IsCommutative,
53851 SmallVectorImpl<int> &PostShuffleMask,
53852 bool ForceHorizOp) {
53853 // If either operand is undef, bail out. The binop should be simplified.
53854 if (LHS.isUndef() || RHS.isUndef())
53855 return false;
53856
53857 // Look for the following pattern:
53858 // A = < float a0, float a1, float a2, float a3 >
53859 // B = < float b0, float b1, float b2, float b3 >
53860 // and
53861 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
53862 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
53863 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
53864 // which is A horizontal-op B.
53865
53866 MVT VT = LHS.getSimpleValueType();
53867 assert((VT.is128BitVector() || VT.is256BitVector()) &&
53868 "Unsupported vector type for horizontal add/sub");
53869 unsigned NumElts = VT.getVectorNumElements();
53870
53871 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
53872 SmallVectorImpl<int> &ShuffleMask) {
53873 bool UseSubVector = false;
53874 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
53875 Op.getOperand(0).getValueType().is256BitVector() &&
53876 llvm::isNullConstant(Op.getOperand(1))) {
53877 Op = Op.getOperand(0);
53878 UseSubVector = true;
53879 }
53880 SmallVector<SDValue, 2> SrcOps;
53881 SmallVector<int, 16> SrcMask, ScaledMask;
53882 SDValue BC = peekThroughBitcasts(Op);
53883 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
53884 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
53885 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
53886 })) {
53887 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
53888 if (!UseSubVector && SrcOps.size() <= 2 &&
53889 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
53890 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
53891 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
53892 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
53893 }
53894 if (UseSubVector && SrcOps.size() == 1 &&
53895 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
53896 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
53897 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
53898 ShuffleMask.assign(Mask.begin(), Mask.end());
53899 }
53900 }
53901 };
53902
53903 // View LHS in the form
53904 // LHS = VECTOR_SHUFFLE A, B, LMask
53905 // If LHS is not a shuffle, then pretend it is the identity shuffle:
53906 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
53907 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
53908 SDValue A, B;
53909 SmallVector<int, 16> LMask;
53910 GetShuffle(LHS, A, B, LMask);
53911
53912 // Likewise, view RHS in the form
53913 // RHS = VECTOR_SHUFFLE C, D, RMask
53914 SDValue C, D;
53915 SmallVector<int, 16> RMask;
53916 GetShuffle(RHS, C, D, RMask);
53917
53918 // At least one of the operands should be a vector shuffle.
53919 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
53920 if (NumShuffles == 0)
53921 return false;
53922
53923 if (LMask.empty()) {
53924 A = LHS;
53925 for (unsigned i = 0; i != NumElts; ++i)
53926 LMask.push_back(i);
53927 }
53928
53929 if (RMask.empty()) {
53930 C = RHS;
53931 for (unsigned i = 0; i != NumElts; ++i)
53932 RMask.push_back(i);
53933 }
53934
53935 // If we have a unary mask, ensure the other op is set to null.
53936 if (isUndefOrInRange(LMask, 0, NumElts))
53937 B = SDValue();
53938 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
53939 A = SDValue();
53940
53941 if (isUndefOrInRange(RMask, 0, NumElts))
53942 D = SDValue();
53943 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
53944 C = SDValue();
53945
53946 // If A and B occur in reverse order in RHS, then canonicalize by commuting
53947 // RHS operands and shuffle mask.
53948 if (A != C) {
53949 std::swap(C, D);
53950 ShuffleVectorSDNode::commuteMask(RMask);
53951 }
53952 // Check that the shuffles are both shuffling the same vectors.
53953 if (!(A == C && B == D))
53954 return false;
53955
53956 PostShuffleMask.clear();
53957 PostShuffleMask.append(NumElts, SM_SentinelUndef);
53958
53959 // LHS and RHS are now:
53960 // LHS = shuffle A, B, LMask
53961 // RHS = shuffle A, B, RMask
53962 // Check that the masks correspond to performing a horizontal operation.
53963 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
53964 // so we just repeat the inner loop if this is a 256-bit op.
53965 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
53966 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
53967 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
53968 assert((NumEltsPer128BitChunk % 2 == 0) &&
53969 "Vector type should have an even number of elements in each lane");
53970 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
53971 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
53972 // Ignore undefined components.
53973 int LIdx = LMask[i + j], RIdx = RMask[i + j];
53974 if (LIdx < 0 || RIdx < 0 ||
53975 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
53976 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
53977 continue;
53978
53979 // Check that successive odd/even elements are being operated on. If not,
53980 // this is not a horizontal operation.
53981 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
53982 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
53983 return false;
53984
53985 // Compute the post-shuffle mask index based on where the element
53986 // is stored in the HOP result, and where it needs to be moved to.
53987 int Base = LIdx & ~1u;
53988 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
53989 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
53990
53991 // The low half of the 128-bit result must choose from A.
53992 // The high half of the 128-bit result must choose from B,
53993 // unless B is undef. In that case, we are always choosing from A.
53994 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
53995 Index += NumEltsPer64BitChunk;
53996 PostShuffleMask[i + j] = Index;
53997 }
53998 }
53999
54000 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
54001 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
54002
54003 bool IsIdentityPostShuffle =
54004 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
54005 if (IsIdentityPostShuffle)
54006 PostShuffleMask.clear();
54007
54008 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
54009 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
54010 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
54011 return false;
54012
54013 // If the source nodes are already used in HorizOps then always accept this.
54014 // Shuffle folding should merge these back together.
54015 auto FoundHorizUser = [&](SDNode *User) {
54016 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
54017 };
54018 ForceHorizOp =
54019 ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) &&
54020 llvm::any_of(NewRHS->users(), FoundHorizUser));
54021
54022 // Assume a SingleSource HOP if we only shuffle one input and don't need to
54023 // shuffle the result.
54024 if (!ForceHorizOp &&
54025 !shouldUseHorizontalOp(NewLHS == NewRHS &&
54026 (NumShuffles < 2 || !IsIdentityPostShuffle),
54027 DAG, Subtarget))
54028 return false;
54029
54030 LHS = DAG.getBitcast(VT, NewLHS);
54031 RHS = DAG.getBitcast(VT, NewRHS);
54032 return true;
54033}
54034
54035// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
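// For example (v4f32, illustrative):
// fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)
// --> X86ISD::FHADD A, B (i.e. a single HADDPS).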
54036static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
54037 const X86Subtarget &Subtarget) {
54038 EVT VT = N->getValueType(0);
54039 unsigned Opcode = N->getOpcode();
54040 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
54041 SmallVector<int, 8> PostShuffleMask;
54042
54043 auto MergableHorizOp = [N](unsigned HorizOpcode) {
54044 return N->hasOneUse() &&
54045 N->user_begin()->getOpcode() == ISD::VECTOR_SHUFFLE &&
54046 (N->user_begin()->getOperand(0).getOpcode() == HorizOpcode ||
54047 N->user_begin()->getOperand(1).getOpcode() == HorizOpcode);
54048 };
54049
54050 switch (Opcode) {
54051 case ISD::FADD:
54052 case ISD::FSUB:
54053 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
54054 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
54055 SDValue LHS = N->getOperand(0);
54056 SDValue RHS = N->getOperand(1);
54057 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
54058 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54059 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54060 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
54061 if (!PostShuffleMask.empty())
54062 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54063 DAG.getUNDEF(VT), PostShuffleMask);
54064 return HorizBinOp;
54065 }
54066 }
54067 break;
54068 case ISD::ADD:
54069 case ISD::SUB:
54070 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
54071 VT == MVT::v16i16 || VT == MVT::v8i32)) {
54072 SDValue LHS = N->getOperand(0);
54073 SDValue RHS = N->getOperand(1);
54074 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
54075 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
54076 PostShuffleMask, MergableHorizOp(HorizOpcode))) {
54077 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
54078 ArrayRef<SDValue> Ops) {
54079 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
54080 };
54081 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
54082 {LHS, RHS}, HOpBuilder);
54083 if (!PostShuffleMask.empty())
54084 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
54085 DAG.getUNDEF(VT), PostShuffleMask);
54086 return HorizBinOp;
54087 }
54088 }
54089 break;
54090 }
54091
54092 return SDValue();
54093}
54094
54095// Try to combine the following nodes
54096// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
54097// <i32 -2147483648[float -0.000000e+00]> 0
54098// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
54099// <(load 4 from constant-pool)> t0, t29
54100// [t30: v16i32 = bitcast t27]
54101// t6: v16i32 = xor t7, t27[t30]
54102// t11: v16f32 = bitcast t6
54103// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
54104// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
54105// t22: v16f32 = bitcast t7
54106// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
54107// t24: v32f16 = bitcast t23
54108static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
54109 const X86Subtarget &Subtarget) {
54110 EVT VT = N->getValueType(0);
54111 SDValue LHS = N->getOperand(0);
54112 SDValue RHS = N->getOperand(1);
54113 int CombineOpcode =
54114 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
54115 auto combineConjugation = [&](SDValue &r) {
54116 if (LHS->getOpcode() == ISD::BITCAST) {
54117 SDValue XOR = LHS.getOperand(0);
54118 if (XOR->getOpcode() == ISD::XOR) {
54119 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
54120 if (XORRHS.isConstant()) {
54121 APInt ConjugationInt32 = APInt(32, 0x80000000);
54122 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
54123 if ((XORRHS.getBitWidth() == 32 &&
54124 XORRHS.getConstant() == ConjugationInt32) ||
54125 (XORRHS.getBitWidth() == 64 &&
54126 XORRHS.getConstant() == ConjugationInt64)) {
54127 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
54128 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
54129 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
54130 r = DAG.getBitcast(VT, FCMulC);
54131 return true;
54132 }
54133 }
54134 }
54135 }
54136 return false;
54137 };
54138 SDValue Res;
54139 if (combineConjugation(Res))
54140 return Res;
54141 std::swap(LHS, RHS);
54142 if (combineConjugation(Res))
54143 return Res;
54144 return Res;
54145}
54146
54147// Try to combine the following nodes:
54148// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
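// (FMUL/FMA here are the AVX512-FP16 complex-multiply nodes VFMULC/VFCMULC
// and VFMADDC/VFCMADDC, so the FADD operand becomes the accumulator of a
// single complex FMA.)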
54149static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
54150 const X86Subtarget &Subtarget) {
54151 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
54152 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
54153 Flags.hasAllowContract();
54154 };
54155
54156 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
54157 return DAG.getTarget().Options.NoSignedZerosFPMath ||
54158 Flags.hasNoSignedZeros();
54159 };
54160 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
54161 APInt AI = APInt(32, 0x80008000);
54162 KnownBits Bits = DAG.computeKnownBits(Op);
54163 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
54164 Bits.getConstant() == AI;
54165 };
54166
54167 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
54168 !AllowContract(N->getFlags()))
54169 return SDValue();
54170
54171 EVT VT = N->getValueType(0);
54172 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
54173 return SDValue();
54174
54175 SDValue LHS = N->getOperand(0);
54176 SDValue RHS = N->getOperand(1);
54177 bool IsConj;
54178 SDValue FAddOp1, MulOp0, MulOp1;
54179 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
54180 &IsVectorAllNegativeZero,
54181 &HasNoSignedZero](SDValue N) -> bool {
54182 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
54183 return false;
54184 SDValue Op0 = N.getOperand(0);
54185 unsigned Opcode = Op0.getOpcode();
54186 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
54187 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
54188 MulOp0 = Op0.getOperand(0);
54189 MulOp1 = Op0.getOperand(1);
54190 IsConj = Opcode == X86ISD::VFCMULC;
54191 return true;
54192 }
54193 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
54194 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
54195 HasNoSignedZero(Op0->getFlags())) ||
54196 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
54197 MulOp0 = Op0.getOperand(0);
54198 MulOp1 = Op0.getOperand(1);
54199 IsConj = Opcode == X86ISD::VFCMADDC;
54200 return true;
54201 }
54202 }
54203 return false;
54204 };
54205
54206 if (GetCFmulFrom(LHS))
54207 FAddOp1 = RHS;
54208 else if (GetCFmulFrom(RHS))
54209 FAddOp1 = LHS;
54210 else
54211 return SDValue();
54212
54213 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
54214 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
54215 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
54216 // FIXME: How do we handle when fast math flags of FADD are different from
54217 // CFMUL's?
54218 SDValue CFmul =
54219 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
54220 return DAG.getBitcast(VT, CFmul);
54221}
54222
54223/// Do target-specific dag combines on floating-point adds/subs.
54224static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
54225 const X86Subtarget &Subtarget) {
54226 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
54227 return HOp;
54228
54229 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
54230 return COp;
54231
54232 return SDValue();
54233}
54234
54235static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
54236 const X86Subtarget &Subtarget) {
54237 EVT VT = N->getValueType(0);
54238 SDValue Src = N->getOperand(0);
54239 EVT SrcVT = Src.getValueType();
54240 SDLoc DL(N);
54241
54242 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54243
54244 // Let legalize expand this if it isn't a legal type yet.
54245 if (!TLI.isTypeLegal(VT))
54246 return SDValue();
54247
54248 if ((SrcVT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) ||
54249 (SrcVT.getScalarType() == MVT::f32 && !Subtarget.hasDQI()))
54250 return SDValue();
54251
54252 if (SrcVT == MVT::v2f16) {
54253 SrcVT = MVT::v4f16;
54254 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54255 DAG.getUNDEF(MVT::v2f16));
54256 }
54257
54258 if (SrcVT == MVT::v4f16) {
54259 SrcVT = MVT::v8f16;
54260 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54261 DAG.getUNDEF(MVT::v4f16));
54262 } else if (SrcVT == MVT::v2f32) {
54263 SrcVT = MVT::v4f32;
54264 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, Src,
54265 DAG.getUNDEF(MVT::v2f32));
54266 } else {
54267 return SDValue();
54268 }
54269
54270 return DAG.getNode(X86ISD::CVTP2SI, DL, VT, Src);
54271}
54272
54273// Attempt to fold some (truncate (srl (add/or/xor X, C1), C2)) patterns to
54274// (add/or/xor (truncate (srl X, C2)), C1'). C1' will be smaller than C1 so we
54275// are able to avoid generating code with MOVABS and large constants in certain
54276// cases.
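// For example (illustrative):
// (i32 (trunc (srl (or X:i64, 0x123400000000), 40)))
// --> (or (i32 (trunc (srl X, 40))), 0x12)
// avoiding a MOVABS to materialise the wide constant.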
54277static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
54278 const SDLoc &DL) {
54279 assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
54280 std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
54281 if (!ValidSrlConst)
54282 return SDValue();
54283 unsigned SrlConstVal = *ValidSrlConst;
54284
54285 SDValue Op = N.getOperand(0);
54286 unsigned Opcode = Op.getOpcode();
54287 assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
54288 "Illegal truncation types");
54289
54290 if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
54291 !isa<ConstantSDNode>(Op.getOperand(1)))
54292 return SDValue();
54293 const APInt &OpConst = Op.getConstantOperandAPInt(1);
54294
54295 if (SrlConstVal <= 32 ||
54296 (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
54297 return SDValue();
54298
54299 SDValue OpLhsSrl =
54300 DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
54301 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
54302
54303 APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
54304 SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
54305 SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
54306
54307 if (Opcode == ISD::ADD) {
54308 EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
54309 return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
54310 }
54311 return NewOpNode;
54312}
54313
54314/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
54315/// the codegen.
54316/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
54317/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
54318/// anything that is guaranteed to be transformed by DAGCombiner.
54319static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
54320 const X86Subtarget &Subtarget,
54321 const SDLoc &DL) {
54322 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
54323 SDValue Src = N->getOperand(0);
54324 unsigned SrcOpcode = Src.getOpcode();
54325 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54326
54327 EVT VT = N->getValueType(0);
54328 EVT SrcVT = Src.getValueType();
54329
54330 auto IsFreeTruncation = [VT](SDValue Op) {
54331 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
54332
54333 // See if this has been extended from a smaller/equal size to
54334 // the truncation size, allowing a truncation to combine with the extend.
54335 unsigned Opcode = Op.getOpcode();
54336 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
54337 Opcode == ISD::ZERO_EXTEND) &&
54338 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
54339 return true;
54340
54341 // See if this is a single use constant which can be constant folded.
54342 // NOTE: We don't peek through bitcasts here because there is currently
54343 // no support for constant folding truncate+bitcast+vector_of_constants. So
54344 // we'll just end up with a truncate on both operands which will
54345 // get turned back into (truncate (binop)) causing an infinite loop.
54346 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54347 };
54348
54349 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
54350 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
54351 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
54352 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
54353 };
54354
54355 // Don't combine if the operation has other uses.
54356 if (!Src.hasOneUse())
54357 return SDValue();
54358
54359 if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
54360 return combinei64TruncSrlConstant(Src, VT, DAG, DL);
54361
54362 if (!VT.isVector())
54363 return SDValue();
54364
54365 // In most cases it's only worth pre-truncating if we're only facing the cost
54366 // of one truncation.
54367 // i.e. if one of the inputs will constant fold or the input is repeated.
54368 switch (SrcOpcode) {
54369 case ISD::MUL:
54370 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
54371 // better to truncate if we have the chance.
54372 if (SrcVT.getScalarType() == MVT::i64 &&
54373 TLI.isOperationLegal(SrcOpcode, VT) &&
54374 !TLI.isOperationLegal(SrcOpcode, SrcVT))
54375 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
54376 [[fallthrough]];
54377 case ISD::AND:
54378 case ISD::XOR:
54379 case ISD::OR:
54380 case ISD::ADD:
54381 case ISD::SUB: {
54382 SDValue Op0 = Src.getOperand(0);
54383 SDValue Op1 = Src.getOperand(1);
54384 if (TLI.isOperationLegal(SrcOpcode, VT) &&
54385 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
54386 return TruncateArithmetic(Op0, Op1);
54387 break;
54388 }
54389 }
54390
54391 return SDValue();
54392}
54393
54394// Try to form a MULHU or MULHS node by looking for
54395// (trunc (srl (mul ext, ext), >= 16))
54396// TODO: This is X86 specific because we want to be able to handle wide types
54397// before type legalization. But we can only do it if the vector will be
54398// legalized via widening/splitting. Type legalization can't handle promotion
54399// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
54400// combiner.
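// For example (illustrative), with a and b being vXi16 values zero-extended
// to vXi32:
// (trunc (srl (mul (zext a), (zext b)), 16)) --> (mulhu a, b)
// which maps to PMULHUW (or PMULHW for the sign-extended case).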
54401static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
54402 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
54403 using namespace llvm::SDPatternMatch;
54404
54405 if (!Subtarget.hasSSE2())
54406 return SDValue();
54407
54408 // Only handle vXi16 types that are at least 128-bits unless they will be
54409 // widened.
54410 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
54411 return SDValue();
54412
54413 // Input type should be at least vXi32.
54414 EVT InVT = Src.getValueType();
54415 if (InVT.getVectorElementType().getSizeInBits() < 32)
54416 return SDValue();
54417
54418 // First instruction should be a right shift by 16 of a multiply.
54419 SDValue LHS, RHS;
54420 APInt ShiftAmt;
54421 if (!sd_match(Src,
54422 m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_ConstInt(ShiftAmt))))
54423 return SDValue();
54424
54425 if (ShiftAmt.ult(16) || ShiftAmt.uge(InVT.getScalarSizeInBits()))
54426 return SDValue();
54427
54428 uint64_t AdditionalShift = ShiftAmt.getZExtValue() - 16;
54429
54430 // Count leading sign/zero bits on both inputs - if there are enough then
54431 // truncation back to vXi16 will be cheap - either as a pack/shuffle
54432 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
54433 // truncations may actually be free by peeking through to the ext source.
54434 auto IsSext = [&DAG](SDValue V) {
54435 return DAG.ComputeMaxSignificantBits(V) <= 16;
54436 };
54437 auto IsZext = [&DAG](SDValue V) {
54438 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
54439 };
54440
54441 bool IsSigned = IsSext(LHS) && IsSext(RHS);
54442 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
54443 if (!IsSigned && !IsUnsigned)
54444 return SDValue();
54445
54446 // Check if both inputs are extensions, which will be removed by truncation.
54447 auto isOpTruncateFree = [](SDValue Op) {
54448 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
54449 Op.getOpcode() == ISD::ZERO_EXTEND)
54450 return Op.getOperand(0).getScalarValueSizeInBits() <= 16;
54451 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
54452 };
54453 bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS);
54454
54455 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
54456 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
54457 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
54458 // will have to split anyway.
54459 unsigned InSizeInBits = InVT.getSizeInBits();
54460 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
54461 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
54462 (InSizeInBits % 16) == 0) {
54463 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54464 InVT.getSizeInBits() / 16);
54465 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
54466 DAG.getBitcast(BCVT, RHS));
54467 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
54468 return DAG.getNode(ISD::SRL, DL, VT, Res,
54469 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54470 }
54471
54472 // Truncate back to source type.
54473 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
54474 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
54475
54476 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
54477 SDValue Res = DAG.getNode(Opc, DL, VT, LHS, RHS);
54478 return DAG.getNode(ISD::SRL, DL, VT, Res,
54479 DAG.getShiftAmountConstant(AdditionalShift, VT, DL));
54480}
54481
54482// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
54483// from one vector with signed bytes from another vector, adds together
54484// adjacent pairs of 16-bit products, and saturates the result before
54485// truncating to 16-bits.
54486//
54487// Which looks something like this:
54488// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
54489// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
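// In instruction terms, PMADDUBSW computes for each i16 lane:
// sat16(zext(A[2i]) * sext(B[2i]) + zext(A[2i+1]) * sext(B[2i+1]))
// where A supplies the unsigned bytes and B the signed bytes.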
54490static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
54491 const X86Subtarget &Subtarget,
54492 const SDLoc &DL) {
54493 if (!VT.isVector() || !Subtarget.hasSSSE3())
54494 return SDValue();
54495
54496 unsigned NumElems = VT.getVectorNumElements();
54497 EVT ScalarVT = VT.getVectorElementType();
54498 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
54499 return SDValue();
54500
54501 SDValue SSatVal = detectSSatPattern(In, VT);
54502 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
54503 return SDValue();
54504
54505 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
54506 // of multiplies from even/odd elements.
54507 SDValue N0 = SSatVal.getOperand(0);
54508 SDValue N1 = SSatVal.getOperand(1);
54509
54510 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54511 return SDValue();
54512
54513 SDValue N00 = N0.getOperand(0);
54514 SDValue N01 = N0.getOperand(1);
54515 SDValue N10 = N1.getOperand(0);
54516 SDValue N11 = N1.getOperand(1);
54517
54518 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
54519 // Canonicalize zero_extend to LHS.
54520 if (N01.getOpcode() == ISD::ZERO_EXTEND)
54521 std::swap(N00, N01);
54522 if (N11.getOpcode() == ISD::ZERO_EXTEND)
54523 std::swap(N10, N11);
54524
54525 // Ensure we have a zero_extend and a sign_extend.
54526 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
54527 N01.getOpcode() != ISD::SIGN_EXTEND ||
54528 N10.getOpcode() != ISD::ZERO_EXTEND ||
54529 N11.getOpcode() != ISD::SIGN_EXTEND)
54530 return SDValue();
54531
54532 // Peek through the extends.
54533 N00 = N00.getOperand(0);
54534 N01 = N01.getOperand(0);
54535 N10 = N10.getOperand(0);
54536 N11 = N11.getOperand(0);
54537
54538 // Ensure the extend is from vXi8.
54539 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
54540 N01.getValueType().getVectorElementType() != MVT::i8 ||
54541 N10.getValueType().getVectorElementType() != MVT::i8 ||
54542 N11.getValueType().getVectorElementType() != MVT::i8)
54543 return SDValue();
54544
54545 // All inputs should be build_vectors.
54546 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54547 N01.getOpcode() != ISD::BUILD_VECTOR ||
54548 N10.getOpcode() != ISD::BUILD_VECTOR ||
54549 N11.getOpcode() != ISD::BUILD_VECTOR)
54550 return SDValue();
54551
54552 // N00/N10 are zero extended. N01/N11 are sign extended.
54553
54554 // For each element, we need to ensure we have an odd element from one vector
54555 // multiplied by the odd element of another vector and the even element from
54556 // one of the same vectors being multiplied by the even element from the
54557 // other vector. So we need to make sure for each element i, this operator
54558 // is being performed:
54559 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54560 SDValue ZExtIn, SExtIn;
54561 for (unsigned i = 0; i != NumElems; ++i) {
54562 SDValue N00Elt = N00.getOperand(i);
54563 SDValue N01Elt = N01.getOperand(i);
54564 SDValue N10Elt = N10.getOperand(i);
54565 SDValue N11Elt = N11.getOperand(i);
54566 // TODO: Be more tolerant to undefs.
54567 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54568 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54569 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54570 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54571 return SDValue();
54572 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54573 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54574 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54575 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54576 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54577 return SDValue();
54578 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54579 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54580 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54581 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54582 // Add is commutative so indices can be reordered.
54583 if (IdxN00 > IdxN10) {
54584 std::swap(IdxN00, IdxN10);
54585 std::swap(IdxN01, IdxN11);
54586 }
54587 // N0 indices must be the even element. N1 indices must be the next odd element.
54588 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54589 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54590 return SDValue();
54591 SDValue N00In = N00Elt.getOperand(0);
54592 SDValue N01In = N01Elt.getOperand(0);
54593 SDValue N10In = N10Elt.getOperand(0);
54594 SDValue N11In = N11Elt.getOperand(0);
54595 // The first time we find an input, capture it.
54596 if (!ZExtIn) {
54597 ZExtIn = N00In;
54598 SExtIn = N01In;
54599 }
54600 if (ZExtIn != N00In || SExtIn != N01In ||
54601 ZExtIn != N10In || SExtIn != N11In)
54602 return SDValue();
54603 }
54604
54605 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
54606 EVT ExtVT = Ext.getValueType();
54607 if (ExtVT.getVectorNumElements() != NumElems * 2) {
54608 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
54609 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
54610 DAG.getVectorIdxConstant(0, DL));
54611 }
54612 };
54613 ExtractVec(ZExtIn);
54614 ExtractVec(SExtIn);
54615
54616 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54617 ArrayRef<SDValue> Ops) {
54618 // Shrink by adding truncate nodes and let DAGCombine fold with the
54619 // sources.
54620 EVT InVT = Ops[0].getValueType();
54621 assert(InVT.getScalarType() == MVT::i8 &&
54622 "Unexpected scalar element type");
54623 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54624 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54625 InVT.getVectorNumElements() / 2);
54626 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
54627 };
54628 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
54629 PMADDBuilder);
54630}
54631
54632static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
54633 const X86Subtarget &Subtarget) {
54634 EVT VT = N->getValueType(0);
54635 SDValue Src = N->getOperand(0);
54636 SDLoc DL(N);
54637
54638 // Attempt to pre-truncate inputs to arithmetic ops instead.
54639 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
54640 return V;
54641
54642 // Try to detect PMADD
54643 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
54644 return PMAdd;
54645
54646 // Try to combine truncation with signed/unsigned saturation.
54647 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
54648 return Val;
54649
54650 // Try to combine PMULHUW/PMULHW for vXi16.
54651 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
54652 return V;
54653
54654 // The bitcast source is a direct mmx result.
54655 // Detect bitcasts from x86mmx to i32.
54656 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
54657 SDValue BCSrc = Src.getOperand(0);
54658 if (BCSrc.getValueType() == MVT::x86mmx)
54659 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
54660 }
54661
54662 return SDValue();
54663}
54664
54665static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
54666 TargetLowering::DAGCombinerInfo &DCI) {
54667 EVT VT = N->getValueType(0);
54668 SDValue In = N->getOperand(0);
54669 SDLoc DL(N);
54670
54671 if (SDValue SSatVal = detectSSatPattern(In, VT))
54672 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
54673 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
54674 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
54675
54676 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54677 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
54678 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54679 return SDValue(N, 0);
54680
54681 return SDValue();
54682}
54683
54684/// Returns the negated value if the node \p N flips sign of FP value.
54685///
54686/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
54687/// or FSUB(0, x)
54688/// AVX512F does not have FXOR, so FNEG is lowered as
54689/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
54690 /// In this case we go through all bitcasts.
54691/// This also recognizes splat of a negated value and returns the splat of that
54692/// value.
54693static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
54694 if (N->getOpcode() == ISD::FNEG)
54695 return N->getOperand(0);
54696
54697 // Don't recurse exponentially.
54698 if (Depth > SelectionDAG::MaxRecursionDepth)
54699 return SDValue();
54700
54701 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
54702
54703 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
54704 EVT VT = Op->getValueType(0);
54705
54706 // Make sure the element size doesn't change.
54707 if (VT.getScalarSizeInBits() != ScalarSize)
54708 return SDValue();
54709
54710 unsigned Opc = Op.getOpcode();
54711 switch (Opc) {
54712 case ISD::VECTOR_SHUFFLE: {
54713 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
54714 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
54715 if (!Op.getOperand(1).isUndef())
54716 return SDValue();
54717 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
54718 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
54719 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
54720 cast<ShuffleVectorSDNode>(Op)->getMask());
54721 break;
54722 }
54723 case ISD::INSERT_VECTOR_ELT: {
54724 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
54725 // -V, INDEX).
54726 SDValue InsVector = Op.getOperand(0);
54727 SDValue InsVal = Op.getOperand(1);
54728 if (!InsVector.isUndef())
54729 return SDValue();
54730 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
54731 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
54732 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
54733 NegInsVal, Op.getOperand(2));
54734 break;
54735 }
54736 case ISD::FSUB:
54737 case ISD::XOR:
54738 case X86ISD::FXOR: {
54739 SDValue Op1 = Op.getOperand(1);
54740 SDValue Op0 = Op.getOperand(0);
54741
54742 // For XOR and FXOR, we want to check if constant
54743 // bits of Op1 are sign bit masks. For FSUB, we
54744 // have to check if constant bits of Op0 are sign
54745 // bit masks and hence we swap the operands.
54746 if (Opc == ISD::FSUB)
54747 std::swap(Op0, Op1);
54748
54749 APInt UndefElts;
54750 SmallVector<APInt, 16> EltBits;
54751 // Extract constant bits and see if they are all
54752 // sign bit masks. Ignore the undef elements.
54753 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
54754 /* AllowWholeUndefs */ true,
54755 /* AllowPartialUndefs */ false)) {
54756 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
54757 if (!UndefElts[I] && !EltBits[I].isSignMask())
54758 return SDValue();
54759
54760 // Only allow bitcast from correctly-sized constant.
54761 Op0 = peekThroughBitcasts(Op0);
54762 if (Op0.getScalarValueSizeInBits() == ScalarSize)
54763 return Op0;
54764 }
54765 break;
54766 } // case
54767 } // switch
54768
54769 return SDValue();
54770}
54771
54772static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
54773 bool NegRes) {
54774 if (NegMul) {
54775 switch (Opcode) {
54776 // clang-format off
54777 default: llvm_unreachable("Unexpected opcode");
54778 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
54779 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
54780 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
54781 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
54782 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
54783 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
54784 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
54785 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
54786 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
54787 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
54788 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
54789 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
54790 // clang-format on
54791 }
54792 }
54793
54794 if (NegAcc) {
54795 switch (Opcode) {
54796 // clang-format off
54797 default: llvm_unreachable("Unexpected opcode");
54798 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
54799 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
54800 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54801 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
54802 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
54803 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54804 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
54805 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
54806 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54807 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
54808 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
54809 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54810 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
54811 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
54812 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
54813 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
54814 // clang-format on
54815 }
54816 }
54817
54818 if (NegRes) {
54819 switch (Opcode) {
54820 // For accuracy reasons, we never combine fneg and fma under strict FP.
54821 // clang-format off
54822 default: llvm_unreachable("Unexpected opcode");
54823 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
54824 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
54825 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
54826 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
54827 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
54828 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
54829 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
54830 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
54831 // clang-format on
54832 }
54833 }
54834
54835 return Opcode;
54836}
54837
54838/// Do target-specific dag combines on floating point negations.
54839static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
54840 TargetLowering::DAGCombinerInfo &DCI,
54841 const X86Subtarget &Subtarget) {
54842 EVT OrigVT = N->getValueType(0);
54843 SDValue Arg = isFNEG(DAG, N);
54844 if (!Arg)
54845 return SDValue();
54846
54847 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54848 EVT VT = Arg.getValueType();
54849 EVT SVT = VT.getScalarType();
54850 SDLoc DL(N);
54851
54852 // Let legalize expand this if it isn't a legal type yet.
54853 if (!TLI.isTypeLegal(VT))
54854 return SDValue();
54855
54856 // If we're negating an FMUL node on a target with FMA, then we can avoid the
54857 // use of a constant by performing (-0 - A*B) instead.
54858 // FIXME: Check rounding control flags as well once it becomes available.
54859 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
54860 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
54861 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
54862 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
54863 Arg.getOperand(1), Zero);
54864 return DAG.getBitcast(OrigVT, NewNode);
54865 }
54866
54867 bool CodeSize = DAG.shouldOptForSize();
54868 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54869 if (SDValue NegArg =
54870 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
54871 return DAG.getBitcast(OrigVT, NegArg);
54872
54873 return SDValue();
54874}
54875
54876SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
54877 bool LegalOperations,
54878 bool ForCodeSize,
54879 NegatibleCost &Cost,
54880 unsigned Depth) const {
54881 // fneg patterns are removable even if they have multiple uses.
54882 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
54883 Cost = NegatibleCost::Cheaper;
54884 return DAG.getBitcast(Op.getValueType(), Arg);
54885 }
54886
54887 EVT VT = Op.getValueType();
54888 EVT SVT = VT.getScalarType();
54889 unsigned Opc = Op.getOpcode();
54890 SDNodeFlags Flags = Op.getNode()->getFlags();
54891 switch (Opc) {
54892 case ISD::FMA:
54893 case X86ISD::FMSUB:
54894 case X86ISD::FNMADD:
54895 case X86ISD::FNMSUB:
54896 case X86ISD::FMADD_RND:
54897 case X86ISD::FMSUB_RND:
54898 case X86ISD::FNMADD_RND:
54899 case X86ISD::FNMSUB_RND: {
54900 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
54901 !(SVT == MVT::f32 || SVT == MVT::f64) ||
54902 !isOperationLegal(ISD::FMA, VT))
54903 break;
54904
54905 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
54906 // if it may have signed zeros.
54907 if (!Flags.hasNoSignedZeros())
54908 break;
54909
54910 // Because getCheaperNegatedExpression can delete nodes we need a handle to
54911 // keep temporary nodes alive.
54912 std::list<HandleSDNode> Handles;
54913
54914 // This is always negatible for free but we might be able to remove some
54915 // extra operand negations as well.
54916 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
54917 for (int i = 0; i != 3; ++i) {
54918 NewOps[i] = getCheaperNegatedExpression(
54919 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
54920 if (!!NewOps[i])
54921 Handles.emplace_back(NewOps[i]);
54922 }
54923
54924 bool NegA = !!NewOps[0];
54925 bool NegB = !!NewOps[1];
54926 bool NegC = !!NewOps[2];
54927 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
54928
54929 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
54930 : NegatibleCost::Neutral;
54931
54932 // Fill in the non-negated ops with the original values.
54933 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
54934 if (!NewOps[i])
54935 NewOps[i] = Op.getOperand(i);
54936 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
54937 }
54938 case X86ISD::FRCP:
54939 if (SDValue NegOp0 =
54940 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
54941 ForCodeSize, Cost, Depth + 1))
54942 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
54943 break;
54944 }
54945
54946 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
54947 ForCodeSize, Cost, Depth);
54948}
54949
54950static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
54951 const X86Subtarget &Subtarget) {
54952 MVT VT = N->getSimpleValueType(0);
54953 // If we have integer vector types available, use the integer opcodes.
54954 if (!VT.isVector() || !Subtarget.hasSSE2())
54955 return SDValue();
54956
54957 SDLoc dl(N);
54959 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
54960 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
54961 unsigned IntOpcode;
54962 switch (N->getOpcode()) {
54963 // clang-format off
54964 default: llvm_unreachable("Unexpected FP logic op");
54965 case X86ISD::FOR: IntOpcode = ISD::OR; break;
54966 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
54967 case X86ISD::FAND: IntOpcode = ISD::AND; break;
54968 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
54969 // clang-format on
54970 }
54971 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
54972 return DAG.getBitcast(VT, IntOp);
54973}
54974
54975/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
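/// For example (illustrative): xor (setcc E, x), 1 --> setcc NE, x, since
/// XOR-ing the i8 SETCC result with 1 inverts the condition code.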
54976static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) {
54977 if (N->getOpcode() != ISD::XOR)
54978 return SDValue();
54979
54980 SDValue LHS = N->getOperand(0);
54981 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
54982 return SDValue();
54983
54984 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
54985 X86::CondCode(LHS->getConstantOperandVal(0)));
54986 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
54987}
54988
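// Fold (xor (ctlz_zero_undef X), BitWidth - 1) and
// (sub BitWidth - 1, (ctlz_zero_undef X)) into a plain BSR: for non-zero X,
// BSR returns the index of the most significant set bit, which is exactly
// (BitWidth - 1) - ctlz(X). Skipped when the target has fast LZCNT.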
54989static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
54990 const X86Subtarget &Subtarget) {
54991 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
54992 "Invalid opcode for combining with CTLZ");
54993 if (Subtarget.hasFastLZCNT())
54994 return SDValue();
54995
54996 EVT VT = N->getValueType(0);
54997 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
54998 (VT != MVT::i64 || !Subtarget.is64Bit()))
54999 return SDValue();
55000
55001 SDValue N0 = N->getOperand(0);
55002 SDValue N1 = N->getOperand(1);
55003
55004 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
55005 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
55006 return SDValue();
55007
55008 SDValue OpCTLZ;
55009 SDValue OpSizeTM1;
55010
55011 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
55012 OpCTLZ = N1;
55013 OpSizeTM1 = N0;
55014 } else if (N->getOpcode() == ISD::SUB) {
55015 return SDValue();
55016 } else {
55017 OpCTLZ = N0;
55018 OpSizeTM1 = N1;
55019 }
55020
55021 if (!OpCTLZ.hasOneUse())
55022 return SDValue();
55023 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
55024 if (!C)
55025 return SDValue();
55026
55027 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
55028 return SDValue();
55029 EVT OpVT = VT;
55030 SDValue Op = OpCTLZ.getOperand(0);
55031 if (VT == MVT::i8) {
55032 // Zero extend to i32 since there is not an i8 bsr.
55033 OpVT = MVT::i32;
55034 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
55035 }
55036
55037 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
55038 Op = DAG.getNode(X86ISD::BSR, DL, VTs, DAG.getUNDEF(OpVT), Op);
55039 if (VT == MVT::i8)
55040 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
55041
55042 return Op;
55043}
55044
55045static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
55046 TargetLowering::DAGCombinerInfo &DCI,
55047 const X86Subtarget &Subtarget) {
55048 SDValue N0 = N->getOperand(0);
55049 SDValue N1 = N->getOperand(1);
55050 EVT VT = N->getValueType(0);
55051 SDLoc DL(N);
55052
55053 // If this is SSE1 only convert to FXOR to avoid scalarization.
55054 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
55055 return DAG.getBitcast(MVT::v4i32,
55056 DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
55057 DAG.getBitcast(MVT::v4f32, N0),
55058 DAG.getBitcast(MVT::v4f32, N1)));
55059 }
55060
55061 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
55062 return Cmp;
55063
55064 if (SDValue R = combineBitOpWithMOVMSK(N->getOpcode(), DL, N0, N1, DAG))
55065 return R;
55066
55067 if (SDValue R = combineBitOpWithShift(N->getOpcode(), DL, VT, N0, N1, DAG))
55068 return R;
55069
55070 if (SDValue R = combineBitOpWithPACK(N->getOpcode(), DL, VT, N0, N1, DAG))
55071 return R;
55072
55073 if (SDValue FPLogic = convertIntLogicToFPLogic(N->getOpcode(), DL, VT, N0, N1,
55074 DAG, DCI, Subtarget))
55075 return FPLogic;
55076
55077 if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
55078 return R;
55079
55080 if (DCI.isBeforeLegalizeOps())
55081 return SDValue();
55082
55083 if (SDValue SetCC = foldXor1SetCC(N, DL, DAG))
55084 return SetCC;
55085
55086 if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), DL, VT, N0, N1, DAG))
55087 return R;
55088
55089 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DL, DAG))
55090 return RV;
55091
55092 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
55093 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55094 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
55095 N0.getOperand(0).getValueType().isVector() &&
55096 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55097 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
55098 return DAG.getBitcast(
55099 VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
55100 }
55101
55102 // Handle AVX512 mask widening.
55103 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
55104 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
55105 VT.getVectorElementType() == MVT::i1 &&
55106 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
55107 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
55108 return DAG.getNode(
55109 ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
55110 DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
55111 N0.getOperand(2));
55112 }
55113
55114 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
55115 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
55116 // TODO: Under what circumstances could this be performed in DAGCombine?
55117 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
55118 N0.getOperand(0).getOpcode() == N->getOpcode()) {
55119 SDValue TruncExtSrc = N0.getOperand(0);
55120 auto *N1C = dyn_cast<ConstantSDNode>(N1);
55121 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
55122 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
55123 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
55124 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
55125 return DAG.getNode(ISD::XOR, DL, VT, LHS,
55126 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
55127 }
55128 }
55129
55130 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
55131 return R;
55132
55133 return combineFneg(N, DAG, DCI, Subtarget);
55134}
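// Illustrative sketch, not part of X86ISelLowering.cpp: the reassociation used
// by the xor(zext(xor(x,c1)),c2) fold above. Zero extension distributes over
// XOR, so the two constants can be merged after the extension. The helper name
// is invented for the example.
#include <cassert>
#include <cstdint>

static void checkXorZextReassoc(uint8_t X, uint8_t C1, uint32_t C2) {
  uint32_t Original = uint32_t(uint8_t(X ^ C1)) ^ C2;     // xor(zext(xor(x,c1)),c2)
  uint32_t Combined = uint32_t(X) ^ (uint32_t(C1) ^ C2);  // xor(zext(x),xor(zext(c1),c2))
  assert(Original == Combined);
}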
55135
55136static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
55137 TargetLowering::DAGCombinerInfo &DCI,
55138 const X86Subtarget &Subtarget) {
55139 SDValue N0 = N->getOperand(0);
55140 EVT VT = N->getValueType(0);
55141
55142 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
55143 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
55144 SDValue Src = N0.getOperand(0);
55145 EVT SrcVT = Src.getValueType();
55146 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
55147 (DCI.isBeforeLegalize() ||
55148 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
55149 Subtarget.hasSSSE3()) {
55150 unsigned NumElts = SrcVT.getVectorNumElements();
55151 SmallVector<int, 32> ReverseMask(NumElts);
55152 for (unsigned I = 0; I != NumElts; ++I)
55153 ReverseMask[I] = (NumElts - 1) - I;
55154 SDValue Rev =
55155 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
55156 return DAG.getBitcast(VT, Rev);
55157 }
55158 }
55159
55160 return SDValue();
55161}
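// Illustrative sketch, not part of X86ISelLowering.cpp: why the lane-reversing
// shuffle above implements bitreverse on an iX bitcast of a vXi1 value. Lane I
// of the mask maps to bit I of the integer, so reversing the lanes reverses the
// bits. Helper names are invented for the example.
#include <cassert>
#include <cstdint>

static uint8_t packMask(const bool (&Lanes)[8]) {
  uint8_t Mask = 0;
  for (unsigned I = 0; I != 8; ++I)
    Mask |= uint8_t(Lanes[I]) << I;            // lane I -> bit I of the bitcast
  return Mask;
}

static void checkBitreverseAsLaneReverse(const bool (&Lanes)[8]) {
  bool Reversed[8];
  for (unsigned I = 0; I != 8; ++I)
    Reversed[I] = Lanes[7 - I];                // the ReverseMask shuffle
  uint8_t BitRev = 0;
  for (unsigned I = 0; I != 8; ++I)
    BitRev |= uint8_t((packMask(Lanes) >> I) & 1) << (7 - I);
  assert(packMask(Reversed) == BitRev);        // shuffle == bitreverse of the bitcast
}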
55162
55163// Various combines to try to convert to avgceilu.
55164static SDValue combineAVG(SDNode *N, SelectionDAG &DAG,
55165 TargetLowering::DAGCombinerInfo &DCI,
55166 const X86Subtarget &Subtarget) {
55167 unsigned Opcode = N->getOpcode();
55168 SDValue N0 = N->getOperand(0);
55169 SDValue N1 = N->getOperand(1);
55170 EVT VT = N->getValueType(0);
55171 EVT SVT = VT.getScalarType();
55172 SDLoc DL(N);
55173
55174 // avgceils(x,y) -> flipsign(avgceilu(flipsign(x),flipsign(y)))
55175 // Only useful on vXi8 which doesn't have good SRA handling.
55176 if (Opcode == ISD::AVGCEILS && VT.isVector() && SVT == MVT::i8) {
55177 APInt SignBit = APInt::getSignMask(8);
55178 SDValue SignMask = DAG.getConstant(SignBit, DL, VT);
55179 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, SignMask);
55180 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, SignMask);
55181 return DAG.getNode(ISD::XOR, DL, VT,
55182 DAG.getNode(ISD::AVGCEILU, DL, VT, N0, N1), SignMask);
55183 }
55184
55185 return SDValue();
55186}
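// Illustrative sketch, not part of X86ISelLowering.cpp: the signed-to-unsigned
// rounding-average identity used above for vXi8. XORing with the sign bit is an
// order-preserving map from i8 to u8, so the unsigned ceiling average of the
// flipped inputs, flipped back, equals the signed ceiling average. Helper names
// are invented; the narrowing casts assume the usual two's-complement wrap.
#include <cassert>
#include <cstdint>

static uint8_t avgCeilU(uint8_t A, uint8_t B) {            // ISD::AVGCEILU on i8
  return uint8_t((unsigned(A) + unsigned(B) + 1) >> 1);
}

static int8_t avgCeilS(int8_t A, int8_t B) {               // ISD::AVGCEILS on i8
  int S = int(A) + int(B);
  return int8_t(S >= 0 ? (S + 1) / 2 : -((-S) / 2));       // ceil(S / 2)
}

static void checkAvgCeilSViaSignFlip(int8_t A, int8_t B) {
  uint8_t FA = uint8_t(A) ^ 0x80, FB = uint8_t(B) ^ 0x80;  // flipsign(x) == xor 0x80
  assert(int8_t(uint8_t(avgCeilU(FA, FB) ^ 0x80)) == avgCeilS(A, B));
}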
55187
55190 const X86Subtarget &Subtarget) {
55191 EVT VT = N->getValueType(0);
55192 unsigned NumBits = VT.getSizeInBits();
55193
55194 // TODO - Constant Folding.
55195
55196 // Simplify the inputs.
55197 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55198 APInt DemandedMask(APInt::getAllOnes(NumBits));
55199 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
55200 return SDValue(N, 0);
55201
55202 return SDValue();
55203}
55204
55205static bool isNullFPScalarOrVectorConst(SDValue V) {
55206 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
55207}
55208
55209/// If a value is a scalar FP zero or a vector FP zero (potentially including
55210/// undefined elements), return a zero constant that may be used to fold away
55211/// that value. In the case of a vector, the returned constant will not contain
55212/// undefined elements even if the input parameter does. This makes it suitable
55213/// to be used as a replacement operand with operations (eg, bitwise-and) where
55214/// an undef should not propagate.
55215static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
55216 const X86Subtarget &Subtarget) {
55217 if (!isNullFPScalarOrVectorConst(V))
55218 return SDValue();
55219
55220 if (V.getValueType().isVector())
55221 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
55222
55223 return V;
55224}
55225
55226static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
55227 const X86Subtarget &Subtarget) {
55228 SDValue N0 = N->getOperand(0);
55229 SDValue N1 = N->getOperand(1);
55230 EVT VT = N->getValueType(0);
55231 SDLoc DL(N);
55232
55233 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
55234 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
55235 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
55236 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
55237 return SDValue();
55238
55239 auto isAllOnesConstantFP = [](SDValue V) {
55240 if (V.getSimpleValueType().isVector())
55241 return ISD::isBuildVectorAllOnes(V.getNode());
55242 auto *C = dyn_cast<ConstantFPSDNode>(V);
55243 return C && C->getConstantFPValue()->isAllOnesValue();
55244 };
55245
55246 // fand (fxor X, -1), Y --> fandn X, Y
55247 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
55248 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
55249
55250 // fand X, (fxor Y, -1) --> fandn Y, X
55251 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
55252 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
55253
55254 return SDValue();
55255}
55256
55257/// Do target-specific dag combines on X86ISD::FAND nodes.
55258static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
55259 const X86Subtarget &Subtarget) {
55260 // FAND(0.0, x) -> 0.0
55261 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
55262 return V;
55263
55264 // FAND(x, 0.0) -> 0.0
55265 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55266 return V;
55267
55268 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
55269 return V;
55270
55271 return lowerX86FPLogicOp(N, DAG, Subtarget);
55272}
55273
55274/// Do target-specific dag combines on X86ISD::FANDN nodes.
55275static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
55276 const X86Subtarget &Subtarget) {
55277 // FANDN(0.0, x) -> x
55278 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55279 return N->getOperand(1);
55280
55281 // FANDN(x, 0.0) -> 0.0
55282 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
55283 return V;
55284
55285 return lowerX86FPLogicOp(N, DAG, Subtarget);
55286}
55287
55288/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
55289static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
55290 TargetLowering::DAGCombinerInfo &DCI,
55291 const X86Subtarget &Subtarget) {
55292 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
55293
55294 // F[X]OR(0.0, x) -> x
55295 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
55296 return N->getOperand(1);
55297
55298 // F[X]OR(x, 0.0) -> x
55299 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
55300 return N->getOperand(0);
55301
55302 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
55303 return NewVal;
55304
55305 return lowerX86FPLogicOp(N, DAG, Subtarget);
55306}
55307
55308/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
55309static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
55310 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
55311
55312 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
55313 if (!DAG.getTarget().Options.NoNaNsFPMath ||
55314 !DAG.getTarget().Options.NoSignedZerosFPMath)
55315 return SDValue();
55316
55317 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
55318 // into FMINC and FMAXC, which are Commutative operations.
55319 unsigned NewOp = 0;
55320 switch (N->getOpcode()) {
55321 default: llvm_unreachable("unknown opcode");
55322 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
55323 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
55324 }
55325
55326 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
55327 N->getOperand(0), N->getOperand(1));
55328}
55329
55330static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
55331 const X86Subtarget &Subtarget) {
55332 EVT VT = N->getValueType(0);
55333 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
55334 return SDValue();
55335
55336 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55337
55338 auto IsMinMaxLegal = [&](EVT VT) {
55339 if (!TLI.isTypeLegal(VT))
55340 return false;
55341 return VT.getScalarType() != MVT::f16 ||
55342 (Subtarget.hasFP16() && (VT == MVT::v32f16 || Subtarget.hasVLX()));
55343 };
55344
55345 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
55346 (Subtarget.hasSSE2() && VT == MVT::f64) ||
55347 (Subtarget.hasFP16() && VT == MVT::f16) ||
55348 (VT.isVector() && IsMinMaxLegal(VT))))
55349 return SDValue();
55350
55351 SDValue Op0 = N->getOperand(0);
55352 SDValue Op1 = N->getOperand(1);
55353 SDLoc DL(N);
55354 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
55355
55356 // If we don't have to respect NaN inputs, this is a direct translation to x86
55357 // min/max instructions.
55358 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
55359 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55360
55361 // If one of the operands is known non-NaN use the native min/max instructions
55362 // with the non-NaN input as second operand.
55363 if (DAG.isKnownNeverNaN(Op1))
55364 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
55365 if (DAG.isKnownNeverNaN(Op0))
55366 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
55367
55368 // If we have to respect NaN inputs, this takes at least 3 instructions.
55369 // Favor a library call when operating on a scalar and minimizing code size.
55370 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
55371 return SDValue();
55372
55373 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
55374 VT);
55375
55376 // There are 4 possibilities involving NaN inputs, and these are the required
55377 // outputs:
55378 // Op1
55379 // Num NaN
55380 // ----------------
55381 // Num | Max | Op0 |
55382 // Op0 ----------------
55383 // NaN | Op1 | NaN |
55384 // ----------------
55385 //
55386 // The SSE FP max/min instructions were not designed for this case, but rather
55387 // to implement:
55388 // Min = Op1 < Op0 ? Op1 : Op0
55389 // Max = Op1 > Op0 ? Op1 : Op0
55390 //
55391 // So they always return Op0 if either input is a NaN. However, we can still
55392 // use those instructions for fmaxnum by selecting away a NaN input.
55393
55394 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
55395 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
55396 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
55397
55398 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
55399 // are NaN, the NaN value of Op1 is the result.
55400 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
55401}
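// Illustrative sketch, not part of X86ISelLowering.cpp: a scalar model of the
// fmaxnum lowering above. SSE MAX computes Op1 > Op0 ? Op1 : Op0, so any NaN
// input falls through to Op0; selecting Op1 whenever Op0 is a NaN then matches
// the table of required outputs. Helper names are invented for the example.
#include <cassert>
#include <cmath>

static float sseMax(float Op1, float Op0) {    // models (X86ISD::FMAX Op1, Op0)
  return Op1 > Op0 ? Op1 : Op0;                // NaN compares false -> returns Op0
}

static float fmaxnumViaSSE(float Op0, float Op1) {
  float MinOrMax = sseMax(Op1, Op0);
  bool IsOp0Nan = std::isnan(Op0);             // the SETUO compare of Op0 with itself
  return IsOp0Nan ? Op1 : MinOrMax;            // the final select
}

static void checkFMaxNum(float A, float B) {
  float Expected = std::fmax(A, B);            // fmaxnum semantics
  float Got = fmaxnumViaSSE(A, B);
  assert((std::isnan(Expected) && std::isnan(Got)) || Expected == Got);
}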
55402
55403static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
55404 TargetLowering::DAGCombinerInfo &DCI) {
55405 EVT VT = N->getValueType(0);
55406 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55407
55408 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
55409 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
55410 return SDValue(N, 0);
55411
55412 // Convert a full vector load into vzload when not all bits are needed.
55413 SDValue In = N->getOperand(0);
55414 MVT InVT = In.getSimpleValueType();
55415 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55416 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55417 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55418 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
55419 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55420 MVT MemVT = MVT::getIntegerVT(NumBits);
55421 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55422 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55423 SDLoc dl(N);
55424 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
55425 DAG.getBitcast(InVT, VZLoad));
55426 DCI.CombineTo(N, Convert);
55427 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55428 DCI.recursivelyDeleteUnusedNodes(LN);
55429 return SDValue(N, 0);
55430 }
55431 }
55432
55433 return SDValue();
55434}
55435
55436static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
55437 TargetLowering::DAGCombinerInfo &DCI) {
55438 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
55439 bool IsStrict = TSI.isTargetStrictFPOpcode(N->getOpcode());
55440 EVT VT = N->getValueType(0);
55441
55442 // Convert a full vector load into vzload when not all bits are needed.
55443 SDValue In = N->getOperand(IsStrict ? 1 : 0);
55444 MVT InVT = In.getSimpleValueType();
55445 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
55446 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
55447 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
55448 LoadSDNode *LN = cast<LoadSDNode>(In);
55449 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
55450 MVT MemVT = MVT::getFloatingPointVT(NumBits);
55451 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
55452 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
55453 SDLoc dl(N);
55454 if (IsStrict) {
55455 SDValue Convert =
55456 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
55457 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
55458 DCI.CombineTo(N, Convert, Convert.getValue(1));
55459 } else {
55460 SDValue Convert =
55461 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
55462 DCI.CombineTo(N, Convert);
55463 }
55464 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55465 DCI.recursivelyDeleteUnusedNodes(LN);
55466 return SDValue(N, 0);
55467 }
55468 }
55469
55470 return SDValue();
55471}
55472
55473/// Do target-specific dag combines on X86ISD::ANDNP nodes.
55474static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
55475 TargetLowering::DAGCombinerInfo &DCI,
55476 const X86Subtarget &Subtarget) {
55477 SDValue N0 = N->getOperand(0);
55478 SDValue N1 = N->getOperand(1);
55479 MVT VT = N->getSimpleValueType(0);
55480 int NumElts = VT.getVectorNumElements();
55481 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55482 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55483 SDLoc DL(N);
55484
55485 // ANDNP(undef, x) -> 0
55486 // ANDNP(x, undef) -> 0
55487 if (N0.isUndef() || N1.isUndef())
55488 return DAG.getConstant(0, DL, VT);
55489
55490 // ANDNP(0, x) -> x
55491 if (ISD::isBuildVectorAllZeros(N0.getNode()))
55492 return N1;
55493
55494 // ANDNP(x, 0) -> 0
55495 if (ISD::isBuildVectorAllZeros(N1.getNode()))
55496 return DAG.getConstant(0, DL, VT);
55497
55498 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
55499 if (ISD::isBuildVectorAllOnes(N1.getNode()))
55500 return DAG.getNOT(DL, N0, VT);
55501
55502 // Turn ANDNP back to AND if input is inverted.
55503 if (SDValue Not = IsNOT(N0, DAG))
55504 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
55505
55506 // On AVX512 targets, attempt to reverse foldVSelectToSignBitSplatMask.
55507 // to make use of predicated selects.
55508 // ANDN(SEXT(SETCC()),X) -> SELECT(NOT(SETCC()),X,0)
55509 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::SIGN_EXTEND) {
55510 SDValue Src = N0.getOperand(0);
55511 EVT SrcVT = Src.getValueType();
55512 if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
55513 (VT.is512BitVector() || Subtarget.hasVLX()) &&
55514 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
55515 TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
55516 return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
55517 getZeroVector(VT, Subtarget, DAG, DL));
55518 }
55519
55520 // Constant Folding
55521 APInt Undefs0, Undefs1;
55522 SmallVector<APInt> EltBits0, EltBits1;
55523 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
55524 /*AllowWholeUndefs*/ true,
55525 /*AllowPartialUndefs*/ true)) {
55526 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
55527 /*AllowWholeUndefs*/ true,
55528 /*AllowPartialUndefs*/ true)) {
55529 SmallVector<APInt> ResultBits;
55530 for (int I = 0; I != NumElts; ++I)
55531 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
55532 return getConstVector(ResultBits, VT, DAG, DL);
55533 }
55534
55535 // Constant fold NOT(N0) to allow us to use AND.
55536 // Ensure this is only performed if we can confirm that the bitcasted source
55537 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
55538 if (N0->hasOneUse()) {
55539 SDValue BC0 = peekThroughOneUseBitcasts(N0);
55540 if (BC0.getOpcode() != ISD::BITCAST) {
55541 for (APInt &Elt : EltBits0)
55542 Elt = ~Elt;
55543 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
55544 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
55545 }
55546 }
55547 }
55548
55549 // Attempt to recursively combine a bitmask ANDNP with shuffles.
55550 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
55551 SDValue Op(N, 0);
55552 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
55553 return Res;
55554
55555 // If either operand is a constant mask, then only the elements that aren't
55556 // zero are actually demanded by the other operand.
55557 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
55558 APInt UndefElts;
55559 SmallVector<APInt> EltBits;
55560 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
55561 APInt DemandedElts = APInt::getAllOnes(NumElts);
55562 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
55563 EltBits)) {
55564 DemandedBits.clearAllBits();
55565 DemandedElts.clearAllBits();
55566 for (int I = 0; I != NumElts; ++I) {
55567 if (UndefElts[I]) {
55568 // We can't assume an undef src element gives an undef dst - the
55569 // other src might be zero.
55570 DemandedBits.setAllBits();
55571 DemandedElts.setBit(I);
55572 } else if ((Invert && !EltBits[I].isAllOnes()) ||
55573 (!Invert && !EltBits[I].isZero())) {
55574 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
55575 DemandedElts.setBit(I);
55576 }
55577 }
55578 }
55579 return std::make_pair(DemandedBits, DemandedElts);
55580 };
55581 APInt Bits0, Elts0;
55582 APInt Bits1, Elts1;
55583 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
55584 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
55585
55586 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
55587 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
55588 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
55589 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
55590 if (N->getOpcode() != ISD::DELETED_NODE)
55591 DCI.AddToWorklist(N);
55592 return SDValue(N, 0);
55593 }
55594 }
55595
55596 // Folds for better commutativity:
55597 if (N1->hasOneUse()) {
55598 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
55599 if (SDValue Not = IsNOT(N1, DAG))
55600 return DAG.getNOT(
55601 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
55602
55603 // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
55604 // Zero out elements by setting the PSHUFB mask value to 0xFF.
55605 if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
55606 SDValue BC1 = peekThroughOneUseBitcasts(N1);
55607 if (BC1.getOpcode() == X86ISD::PSHUFB) {
55608 EVT ShufVT = BC1.getValueType();
55609 SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
55610 DAG.getBitcast(ShufVT, N0));
55611 SDValue NewShuf =
55612 DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
55613 return DAG.getBitcast(VT, NewShuf);
55614 }
55615 }
55616 }
55617
55618 return SDValue();
55619}
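// Illustrative sketch, not part of X86ISelLowering.cpp: ANDNP's per-element
// semantics used by the constant folding and NOT folds above, i.e.
// ANDNP(a, b) = ~a & b. The helper name is invented for the example.
#include <cassert>
#include <cstdint>

static void checkAndnpFolds(uint32_t A, uint32_t B) {
  uint32_t Andnp = ~A & B;                     // the per-element constant fold
  assert(Andnp == (B & ~A));                   // ANDNP as AND with an inverted mask
  assert((~A & ~B) == ~(A | B));               // ANDNP(a, NOT(b)) -> NOT(OR(a, b))
}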
55620
55621static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
55622 TargetLowering::DAGCombinerInfo &DCI) {
55623 SDValue N1 = N->getOperand(1);
55624
55625 // BT ignores high bits in the bit index operand.
55626 unsigned BitWidth = N1.getValueSizeInBits();
55627 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
55628 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
55629 if (N->getOpcode() != ISD::DELETED_NODE)
55630 DCI.AddToWorklist(N);
55631 return SDValue(N, 0);
55632 }
55633
55634 return SDValue();
55635}
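// Illustrative sketch, not part of X86ISelLowering.cpp: why only the low
// log2(BitWidth) bits of the BT index are demanded above. The register form of
// BT reduces the index modulo the operand width, so two indices that agree in
// those bits test the same bit. Helper names are invented for the example.
#include <cassert>
#include <cstdint>

static bool btReg(uint32_t Val, uint32_t Index) {          // models BT r32, r32
  return (Val >> (Index & 31)) & 1;
}

static void checkBTDemandedIndexBits(uint32_t Val, uint32_t IdxA, uint32_t IdxB) {
  if ((IdxA & 31) == (IdxB & 31))              // agree on the 5 demanded bits
    assert(btReg(Val, IdxA) == btReg(Val, IdxB));
}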
55636
55637static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
55638 TargetLowering::DAGCombinerInfo &DCI) {
55639 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
55640 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
55641
55642 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
55643 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55644 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
55645 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
55646 if (N->getOpcode() != ISD::DELETED_NODE)
55647 DCI.AddToWorklist(N);
55648 return SDValue(N, 0);
55649 }
55650
55651 // Convert a full vector load into vzload when not all bits are needed.
55652 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
55653 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
55654 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
55655 SDLoc dl(N);
55656 if (IsStrict) {
55657 SDValue Convert = DAG.getNode(
55658 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
55659 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
55660 DCI.CombineTo(N, Convert, Convert.getValue(1));
55661 } else {
55662 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
55663 DAG.getBitcast(MVT::v8i16, VZLoad));
55664 DCI.CombineTo(N, Convert);
55665 }
55666
55667 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
55668 DCI.recursivelyDeleteUnusedNodes(LN);
55669 return SDValue(N, 0);
55670 }
55671 }
55672 }
55673
55674 return SDValue();
55675}
55676
55677// Try to combine sext_in_reg of a cmov of constants by extending the constants.
55678static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
55679 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55680
55681 EVT DstVT = N->getValueType(0);
55682
55683 SDValue N0 = N->getOperand(0);
55684 SDValue N1 = N->getOperand(1);
55685 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55686
55687 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
55688 return SDValue();
55689
55690 // Look through single use any_extends / truncs.
55691 SDValue IntermediateBitwidthOp;
55692 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
55693 N0.hasOneUse()) {
55694 IntermediateBitwidthOp = N0;
55695 N0 = N0.getOperand(0);
55696 }
55697
55698 // See if we have a single use cmov.
55699 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
55700 return SDValue();
55701
55702 SDValue CMovOp0 = N0.getOperand(0);
55703 SDValue CMovOp1 = N0.getOperand(1);
55704
55705 // Make sure both operands are constants.
55706 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55707 !isa<ConstantSDNode>(CMovOp1.getNode()))
55708 return SDValue();
55709
55710 SDLoc DL(N);
55711
55712 // If we looked through an any_extend/trunc above, add one to the constants.
55713 if (IntermediateBitwidthOp) {
55714 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
55715 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
55716 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
55717 }
55718
55719 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
55720 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
55721
55722 EVT CMovVT = DstVT;
55723 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
55724 if (DstVT == MVT::i16) {
55725 CMovVT = MVT::i32;
55726 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
55727 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
55728 }
55729
55730 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
55731 N0.getOperand(2), N0.getOperand(3));
55732
55733 if (CMovVT != DstVT)
55734 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
55735
55736 return CMov;
55737}
55738
55739static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
55740 const X86Subtarget &Subtarget) {
55741 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
55742
55743 if (SDValue V = combineSextInRegCmov(N, DAG))
55744 return V;
55745
55746 EVT VT = N->getValueType(0);
55747 SDValue N0 = N->getOperand(0);
55748 SDValue N1 = N->getOperand(1);
55749 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
55750 SDLoc dl(N);
55751
55752 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
55753 // SSE and AVX2 since there is no sign-extended shift right
55754 // operation on a vector with 64-bit elements.
55755 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
55756 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
55757 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
55758 N0.getOpcode() == ISD::SIGN_EXTEND)) {
55759 SDValue N00 = N0.getOperand(0);
55760
55761 // EXTLOAD has a better solution on AVX2,
55762 // it may be replaced with X86ISD::VSEXT node.
55763 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
55764 if (!ISD::isNormalLoad(N00.getNode()))
55765 return SDValue();
55766
55767 // Attempt to promote any comparison mask ops before moving the
55768 // SIGN_EXTEND_INREG in the way.
55769 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
55770 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
55771
55772 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
55773 SDValue Tmp =
55774 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
55775 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
55776 }
55777 }
55778 return SDValue();
55779}
55780
55781/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
55782/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
55783/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
55784/// opportunities to combine math ops, use an LEA, or use a complex addressing
55785/// mode. This can eliminate extend, add, and shift instructions.
55786static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
55787 const X86Subtarget &Subtarget) {
55788 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
55789 Ext->getOpcode() != ISD::ZERO_EXTEND)
55790 return SDValue();
55791
55792 // TODO: This should be valid for other integer types.
55793 EVT VT = Ext->getValueType(0);
55794 if (VT != MVT::i64)
55795 return SDValue();
55796
55797 SDValue Add = Ext->getOperand(0);
55798 if (Add.getOpcode() != ISD::ADD)
55799 return SDValue();
55800
55801 SDValue AddOp0 = Add.getOperand(0);
55802 SDValue AddOp1 = Add.getOperand(1);
55803 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
55804 bool NSW = Add->getFlags().hasNoSignedWrap();
55805 bool NUW = Add->getFlags().hasNoUnsignedWrap();
55806 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
55807 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
55808
55809 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
55810 // into the 'zext'
55811 if ((Sext && !NSW) || (!Sext && !NUW))
55812 return SDValue();
55813
55814 // Having a constant operand to the 'add' ensures that we are not increasing
55815 // the instruction count because the constant is extended for free below.
55816 // A constant operand can also become the displacement field of an LEA.
55817 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
55818 if (!AddOp1C)
55819 return SDValue();
55820
55821 // Don't make the 'add' bigger if there's no hope of combining it with some
55822 // other 'add' or 'shl' instruction.
55823 // TODO: It may be profitable to generate simpler LEA instructions in place
55824 // of single 'add' instructions, but the cost model for selecting an LEA
55825 // currently has a high threshold.
55826 bool HasLEAPotential = false;
55827 for (auto *User : Ext->users()) {
55828 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
55829 HasLEAPotential = true;
55830 break;
55831 }
55832 }
55833 if (!HasLEAPotential)
55834 return SDValue();
55835
55836 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
55837 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
55838 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
55839 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
55840
55841 // The wider add is guaranteed to not wrap because both operands are
55842 // sign-extended.
55843 SDNodeFlags Flags;
55844 Flags.setNoSignedWrap(NSW);
55845 Flags.setNoUnsignedWrap(NUW);
55846 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
55847}
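// Illustrative sketch, not part of X86ISelLowering.cpp: the scalar fact behind
// promoteExtBeforeAdd. When the narrow add has no signed wrap, the sign
// extension can be hoisted above it and the constant is extended for free
// (the zext/nuw case is analogous). The helper name is invented for the example.
#include <cassert>
#include <cstdint>

static void checkSextOfNswAdd(int32_t X, int32_t C) {
  int64_t Wide = int64_t(X) + int64_t(C);                  // add(sext(x), C_sext)
  bool NSW = Wide >= INT32_MIN && Wide <= INT32_MAX;       // the 'add nsw' condition
  if (NSW)
    assert(int64_t(int32_t(X + C)) == Wide);               // sext(add nsw x, C)
}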
55848
55849// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
55850// operands and the result of CMOV is not used anywhere else - promote CMOV
55851// itself instead of promoting its result. This could be beneficial, because:
55852// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
55853// (or more) pseudo-CMOVs only when they go one-after-another and
55854// getting rid of result extension code after CMOV will help that.
55855// 2) Promotion of constant CMOV arguments is free, hence the
55856// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
55857// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
55858// promotion is also good in terms of code-size.
55859// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
55860// promotion).
55861static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
55862 SDValue CMovN = Extend->getOperand(0);
55863 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
55864 return SDValue();
55865
55866 EVT TargetVT = Extend->getValueType(0);
55867 unsigned ExtendOpcode = Extend->getOpcode();
55868 SDLoc DL(Extend);
55869
55870 EVT VT = CMovN.getValueType();
55871 SDValue CMovOp0 = CMovN.getOperand(0);
55872 SDValue CMovOp1 = CMovN.getOperand(1);
55873
55874 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
55875 !isa<ConstantSDNode>(CMovOp1.getNode()))
55876 return SDValue();
55877
55878 // Only extend to i32 or i64.
55879 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
55880 return SDValue();
55881
55882 // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32
55883 // are free.
55884 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
55885 return SDValue();
55886
55887 // If this a zero extend to i64, we should only extend to i32 and use a free
55888 // zero extend to finish.
55889 EVT ExtendVT = TargetVT;
55890 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
55891 ExtendVT = MVT::i32;
55892
55893 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
55894 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
55895
55896 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
55897 CMovN.getOperand(2), CMovN.getOperand(3));
55898
55899 // Finish extending if needed.
55900 if (ExtendVT != TargetVT)
55901 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
55902
55903 return Res;
55904}
55905
55906// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
55907// result type.
55908static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
55909 const X86Subtarget &Subtarget) {
55910 SDValue N0 = N->getOperand(0);
55911 EVT VT = N->getValueType(0);
55912 SDLoc dl(N);
55913
55914 // Only do this combine with AVX512 for vector extends.
55915 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
55916 return SDValue();
55917
55918 // Only combine legal element types.
55919 EVT SVT = VT.getVectorElementType();
55920 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
55921 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
55922 return SDValue();
55923
55924 // We don't have a CMPP instruction for vXf16.
55925 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
55926 return SDValue();
55927 // We can only do this if the vector size is 256 bits or less.
55928 unsigned Size = VT.getSizeInBits();
55929 if (Size > 256 && Subtarget.useAVX512Regs())
55930 return SDValue();
55931
55932 EVT N00VT = N0.getOperand(0).getValueType();
55933
55934 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
55935 // those are the only integer compares we have.
55936 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
55937 if (N00VT.isInteger() && ISD::isUnsignedIntSetCC(CC))
55938 return SDValue();
55939
55940 // Only do this combine if the extension will be fully consumed by the setcc.
55941 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
55942 if (Size != MatchingVecType.getSizeInBits())
55943 return SDValue();
55944
55945 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
55946
55947 if (N->getOpcode() == ISD::ZERO_EXTEND)
55948 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
55949
55950 return Res;
55951}
55952
55953static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
55954 TargetLowering::DAGCombinerInfo &DCI,
55955 const X86Subtarget &Subtarget) {
55956 SDValue N0 = N->getOperand(0);
55957 EVT VT = N->getValueType(0);
55958 SDLoc DL(N);
55959
55960 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
55961 if (!DCI.isBeforeLegalizeOps() &&
55962 N0.getOpcode() == X86ISD::SETCC_CARRY) {
55963 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
55964 N0->getOperand(1));
55965 bool ReplaceOtherUses = !N0.hasOneUse();
55966 DCI.CombineTo(N, Setcc);
55967 // Replace other uses with a truncate of the widened setcc_carry.
55968 if (ReplaceOtherUses) {
55969 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
55970 N0.getValueType(), Setcc);
55971 DCI.CombineTo(N0.getNode(), Trunc);
55972 }
55973
55974 return SDValue(N, 0);
55975 }
55976
55977 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
55978 return NewCMov;
55979
55980 if (!DCI.isBeforeLegalizeOps())
55981 return SDValue();
55982
55983 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
55984 return V;
55985
55986 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
55987 DAG, DCI, Subtarget))
55988 return V;
55989
55990 if (VT.isVector()) {
55991 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
55992 return R;
55993
55994 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
55995 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
55996 }
55997
55998 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
55999 return NewAdd;
56000
56001 return SDValue();
56002}
56003
56004// Inverting a constant vector is profitable if it can be eliminated and the
56005// inverted vector is already present in DAG. Otherwise, it will be loaded
56006// anyway.
56007//
56008// We determine which of the values can be completely eliminated and invert it.
56009// If both are eliminable, select a vector with the first negative element.
56012 "ConstantFP build vector expected");
56013 // Check if we can eliminate V. We assume if a value is only used in FMAs, we
56014 // can eliminate it. Since this function is invoked for each FMA with this
56015 // vector.
56016 auto IsNotFMA = [](SDNode *User) {
56017 return User->getOpcode() != ISD::FMA &&
56018 User->getOpcode() != ISD::STRICT_FMA;
56019 };
56020 if (llvm::any_of(V->users(), IsNotFMA))
56021 return SDValue();
56022
56023 SmallVector<SDValue, 8> Ops;
56024 EVT VT = V.getValueType();
56025 EVT EltVT = VT.getVectorElementType();
56026 for (const SDValue &Op : V->op_values()) {
56027 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56028 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
56029 } else {
56030 assert(Op.isUndef());
56031 Ops.push_back(DAG.getUNDEF(EltVT));
56032 }
56033 }
56034
56035 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
56036 if (!NV)
56037 return SDValue();
56038
56039 // If an inverted version cannot be eliminated, choose it instead of the
56040 // original version.
56041 if (llvm::any_of(NV->users(), IsNotFMA))
56042 return SDValue(NV, 0);
56043
56044 // If the inverted version also can be eliminated, we have to consistently
56045 // prefer one of the values. We prefer a constant with a negative value on
56046 // the first place.
56047 // N.B. We need to skip undefs that may precede a value.
56048 for (const SDValue &Op : V->op_values()) {
56049 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
56050 if (Cst->isNegative())
56051 return SDValue();
56052 break;
56053 }
56054 }
56055 return SDValue(NV, 0);
56056}
56057
56058static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
56059 TargetLowering::DAGCombinerInfo &DCI,
56060 const X86Subtarget &Subtarget) {
56061 SDLoc dl(N);
56062 EVT VT = N->getValueType(0);
56063 const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
56064 bool IsStrict = N->isTargetOpcode()
56065 ? TSI.isTargetStrictFPOpcode(N->getOpcode())
56066 : N->isStrictFPOpcode();
56067
56068 // Let legalize expand this if it isn't a legal type yet.
56069 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56070 if (!TLI.isTypeLegal(VT))
56071 return SDValue();
56072
56073 SDValue A = N->getOperand(IsStrict ? 1 : 0);
56074 SDValue B = N->getOperand(IsStrict ? 2 : 1);
56075 SDValue C = N->getOperand(IsStrict ? 3 : 2);
56076
56077 // If the operation allows fast-math and the target does not support FMA,
56078 // split this into mul+add to avoid libcall(s).
56079 SDNodeFlags Flags = N->getFlags();
56080 if (!IsStrict && Flags.hasAllowReassociation() &&
56081 TLI.isOperationExpand(ISD::FMA, VT)) {
56082 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
56083 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
56084 }
56085
56086 EVT ScalarVT = VT.getScalarType();
56087 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
56088 !Subtarget.hasAnyFMA()) &&
56089 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()) &&
56090 !(ScalarVT == MVT::bf16 && Subtarget.hasAVX10_2()))
56091 return SDValue();
56092
56093 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
56094 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56095 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56096 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
56097 CodeSize)) {
56098 V = NegV;
56099 return true;
56100 }
56101 // Look through extract_vector_elts. If it comes from an FNEG, create a
56102 // new extract from the FNEG input.
56103 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56104 isNullConstant(V.getOperand(1))) {
56105 SDValue Vec = V.getOperand(0);
56106 if (SDValue NegV = TLI.getCheaperNegatedExpression(
56107 Vec, DAG, LegalOperations, CodeSize)) {
56108 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
56109 NegV, V.getOperand(1));
56110 return true;
56111 }
56112 }
56113 // Lookup if there is an inverted version of constant vector V in DAG.
56114 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
56115 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
56116 V = NegV;
56117 return true;
56118 }
56119 }
56120 return false;
56121 };
56122
56123 // Do not convert the passthru input of scalar intrinsics.
56124 // FIXME: We could allow negations of the lower element only.
56125 bool NegA = invertIfNegative(A);
56126 // Create a dummy use for A so that in the process of negating B or C
56127 // recursively, it is not deleted.
56128 HandleSDNode NegAHandle(A);
56129 bool NegB = invertIfNegative(B);
56130 // Similar to A, get a handle on B.
56131 HandleSDNode NegBHandle(B);
56132 bool NegC = invertIfNegative(C);
56133
56134 if (!NegA && !NegB && !NegC)
56135 return SDValue();
56136
56137 unsigned NewOpcode =
56138 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
56139
56140 // Propagate fast-math-flags to new FMA node.
56141 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
56142 if (IsStrict) {
56143 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
56144 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
56145 {N->getOperand(0), A, B, C});
56146 } else {
56147 if (N->getNumOperands() == 4)
56148 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
56149 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
56150 }
56151}
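// Illustrative sketch, not part of X86ISelLowering.cpp: the sign algebra that
// the invertIfNegative/negateFMAOpcode logic above depends on. Only whether an
// odd number of multiplicands is negated matters, and negating the addend
// selects the subtracting form. The helper name is invented for the example;
// NaN inputs are skipped because NaN never compares equal.
#include <cassert>
#include <cmath>

static void checkFMANegations(double A, double B, double C) {
  if (std::isnan(A) || std::isnan(B) || std::isnan(C))
    return;
  assert(std::fma(-A, B, C) == std::fma(A, -B, C));        // NegA != NegB is what counts
  assert(std::fma(-A, -B, C) == std::fma(A, B, C));        // two negations cancel
  assert(std::fma(A, B, -C) == -std::fma(-A, B, C));       // fmsub vs. fnmadd
}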
56152
56153// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
56154// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
56155static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
56156 TargetLowering::DAGCombinerInfo &DCI) {
56157 SDLoc dl(N);
56158 EVT VT = N->getValueType(0);
56159 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56160 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
56161 bool LegalOperations = !DCI.isBeforeLegalizeOps();
56162
56163 SDValue N2 = N->getOperand(2);
56164
56165 SDValue NegN2 =
56166 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
56167 if (!NegN2)
56168 return SDValue();
56169 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
56170
56171 if (N->getNumOperands() == 4)
56172 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56173 NegN2, N->getOperand(3));
56174 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
56175 NegN2);
56176}
56177
56178// Try to widen the build vector and bitcast it to the type of the zext.
56179// This is a special case for the 128-bit vector types. The intention is to
56180// remove the zext and replace it with a bitcast to the wider type. While
56181// lowering, the bitcast is removed and the extra computation due to the zext is avoided.
56182// For example:
56183// zext v4i16 (v4i8 build_vector (x, y, z, w)) -> bitcast v4i16 (v8i8
56184// build_vector (x, 0, y, 0, z, 0, w, 0))
56185static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG) {
56186
56187 if (Extend->getOpcode() != ISD::ZERO_EXTEND)
56188 return SDValue();
56189
56190 EVT ExtendVT = Extend->getValueType(0);
56191
56192 SDValue BV = Extend->getOperand(0);
56193 if (BV.getOpcode() != ISD::BUILD_VECTOR || !BV.hasOneUse())
56194 return SDValue();
56195
56196 if (any_of(BV->op_values(), [](SDValue Op) { return Op.isUndef(); })) {
56197 // If the build vector has undef elements, we cannot widen it.
56198 // The widening would create a vector with more undef elements, which
56199 // is not valid.
56200 return SDValue();
56201 }
56202
56203 if (!all_of(BV->op_values(),
56204 [](SDValue Op) { return Op.getOpcode() == ISD::LOAD; })) {
56205 // If the build vector has any element other than an ISD::LOAD, we cannot
56206 // widen it.
56207 return SDValue();
56208 }
56209
56210 SDLoc dl(BV);
56211 EVT VT = BV.getValueType();
56212 EVT EltVT = BV.getOperand(0).getValueType();
56213 unsigned NumElts = VT.getVectorNumElements();
56214
56215 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56216
56217 if (TLI.getTypeAction(*DAG.getContext(), VT) !=
56218 TargetLowering::TypeWidenVector)
56219 return SDValue();
56220
56221 EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
56222 unsigned WidenNumElts = WidenVT.getVectorNumElements();
56223
56224 SmallVector<SDValue, 16> NewOps(BV->op_begin(), BV->op_end());
56225 assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
56226 // Fill the new elements with Zero.
56227 NewOps.append(WidenNumElts - NumElts, DAG.getConstant(0, dl, EltVT));
56228 // Compute the step to place the elements in the right place and control the
56229 // iteration.
56230 unsigned step = WidenNumElts / NumElts;
56231 if (WidenVT.is128BitVector()) {
56232 if (step > 1 && Extend->getValueSizeInBits(0) == WidenVT.getSizeInBits()) {
56233 for (int i = NumElts - 1, j = WidenNumElts - step; i > 0;
56234 i--, j -= step) {
56235 SDValue temp = NewOps[i];
56236 NewOps[i] = NewOps[j];
56237 NewOps[j] = temp;
56238 }
56239 // Create new build vector with WidenVT and NewOps
56240 SDValue NewBV = DAG.getBuildVector(WidenVT, dl, NewOps);
56241 // Replace the old build vector with the new one. Bitcast the
56242 // new build vector to the type of the zext.
56243 SDValue NewBVBitcast = DAG.getBitcast(ExtendVT, NewBV);
56244 DAG.ReplaceAllUsesOfValueWith(SDValue(Extend, 0), NewBVBitcast);
56245 return NewBV;
56246 }
56247 }
56248 return SDValue();
56249}
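// Illustrative sketch, not part of X86ISelLowering.cpp: why the widened
// build_vector above can simply be bitcast to the zext result type on a
// little-endian target such as x86. Zero-extending v4i8 lanes to v4i16 leaves
// the same bytes in memory as interleaving the lanes with zero bytes. The
// helper name is invented for the example.
#include <cassert>
#include <cstdint>
#include <cstring>

static void checkZextAsInterleavedZeros(const uint8_t (&Src)[4]) {
  uint16_t Zext[4];
  uint8_t Widened[8];
  for (unsigned I = 0; I != 4; ++I) {
    Zext[I] = Src[I];                          // zext v4i8 -> v4i16
    Widened[2 * I] = Src[I];                   // v8i8 build_vector (x0,0,x1,0,...)
    Widened[2 * I + 1] = 0;
  }
  assert(std::memcmp(Zext, Widened, sizeof(Widened)) == 0);  // same bit pattern
}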
56250
56251static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
56252 TargetLowering::DAGCombinerInfo &DCI,
56253 const X86Subtarget &Subtarget) {
56254 SDLoc dl(N);
56255 SDValue N0 = N->getOperand(0);
56256 EVT VT = N->getValueType(0);
56257
56258 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
56259 // FIXME: Is this needed? We don't seem to have any tests for it.
56260 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
56261 N0.getOpcode() == X86ISD::SETCC_CARRY) {
56262 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
56263 N0->getOperand(1));
56264 bool ReplaceOtherUses = !N0.hasOneUse();
56265 DCI.CombineTo(N, Setcc);
56266 // Replace other uses with a truncate of the widened setcc_carry.
56267 if (ReplaceOtherUses) {
56268 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
56269 N0.getValueType(), Setcc);
56270 DCI.CombineTo(N0.getNode(), Trunc);
56271 }
56272
56273 return SDValue(N, 0);
56274 }
56275
56276 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
56277 return NewCMov;
56278
56279 if (DCI.isBeforeLegalizeOps())
56280 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
56281 return V;
56282
56283 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
56284 DAG, DCI, Subtarget))
56285 return V;
56286
56287 if (VT.isVector())
56288 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
56289 return R;
56290
56291 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
56292 return NewAdd;
56293
56294 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
56295 return R;
56296
56297 // TODO: Combine with any target/faux shuffle.
56298 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
56299 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
56300 SDValue N00 = N0.getOperand(0);
56301 SDValue N01 = N0.getOperand(1);
56302 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
56303 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
56304 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
56305 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
56306 return concatSubVectors(N00, N01, DAG, dl);
56307 }
56308 }
56309
56310 if (SDValue V = widenBuildVec(N, DAG))
56311 return V;
56312
56313 return SDValue();
56314}
56315
56316/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
56317/// pre-promote its result type since vXi1 vectors don't get promoted
56318/// during type legalization.
56321 const SDLoc &DL, SelectionDAG &DAG,
56322 const X86Subtarget &Subtarget) {
56323 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
56324 VT.getVectorElementType() == MVT::i1 &&
56325 (OpVT.getVectorElementType() == MVT::i8 ||
56326 OpVT.getVectorElementType() == MVT::i16)) {
56327 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
56328 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
56329 }
56330 return SDValue();
56331}
56332
56333// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
56334// eq/ne) is generated when using an integer as a mask. Instead of generating a
56335// broadcast + vptest, we can directly move the integer to a mask register.
56336static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
56337 const SDLoc &DL, SelectionDAG &DAG,
56338 const X86Subtarget &Subtarget) {
56339 if (CC != ISD::SETNE && CC != ISD::SETEQ)
56340 return SDValue();
56341
56342 if (!Subtarget.hasAVX512())
56343 return SDValue();
56344
56345 if (Op0.getOpcode() != ISD::AND)
56346 return SDValue();
56347
56348 SDValue Broadcast = Op0.getOperand(0);
56349 if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
56350 Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
56351 return SDValue();
56352
56353 SDValue Load = Op0.getOperand(1);
56354 EVT LoadVT = Load.getSimpleValueType();
56355
56356 APInt UndefElts;
56357 SmallVector<APInt, 32> EltBits;
56358 if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
56359 UndefElts, EltBits,
56360 /*AllowWholeUndefs*/ true,
56361 /*AllowPartialUndefs*/ false) ||
56362 UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
56363 return SDValue();
56364
56365 // Check if the constant pool contains only powers of 2 starting from some
56366 // 2^N. The table may also contain undefs because of widening of vector
56367 // operands.
56368 unsigned N = EltBits[0].logBase2();
56369 unsigned Len = UndefElts.getBitWidth();
56370 for (unsigned I = 1; I != Len; ++I) {
56371 if (UndefElts[I]) {
56372 if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
56373 return SDValue();
56374 break;
56375 }
56376
56377 if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
56378 return SDValue();
56379 }
56380
56381 MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
56382 SDValue BroadcastOp;
56383 if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
56384 BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
56385 Broadcast, DAG.getVectorIdxConstant(0, DL));
56386 } else {
56387 BroadcastOp = Broadcast.getOperand(0);
56388 if (BroadcastOp.getValueType().isVector())
56389 return SDValue();
56390 }
56391
56392 SDValue Masked = BroadcastOp;
56393 if (N != 0) {
56394 unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
56395 unsigned NumDefinedElts = UndefElts.countTrailingZeros();
56396
56397 if (NumDefinedElts > BroadcastOpBitWidth)
56398 return SDValue();
56399
56400 APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
56401 SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
56402 DAG.getConstant(N, DL, BroadcastOpVT));
56403 Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
56404 DAG.getConstant(Mask, DL, BroadcastOpVT));
56405 }
56406 // We can't extract more than 16 bits using this pattern, because 2^{17} will
56407 // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
56408 SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
56409 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
56410
56411 if (CC == ISD::SETEQ)
56412 Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
56413
56414 if (VT != MVT::v16i1)
56415 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
56416 DAG.getVectorIdxConstant(0, DL));
56417
56418 return Bitcast;
56419}
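// Illustrative sketch, not part of X86ISelLowering.cpp: the integer-as-mask
// pattern handled above. Testing lane I of and(broadcast(x), (1<<N, 1<<(N+1),
// ...)) for non-zero reads bit N+I of x, so the whole vector compare is just
// the shifted (and masked) integer moved into a mask register. The helper name
// is invented for the example.
#include <cassert>
#include <cstdint>

static void checkBroadcastTestAsBitExtract(uint32_t X, unsigned N,
                                           unsigned NumElts) {
  assert(N + NumElts <= 32 && "all constants must be in-range powers of two");
  for (unsigned I = 0; I != NumElts; ++I) {
    bool LaneSetNE = (X & (1u << (N + I))) != 0;   // setne lane of the vector form
    bool MaskBit = ((X >> N) >> I) & 1;            // bit I of the SRL'd scalar
    assert(LaneSetNE == MaskBit);
  }
}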
56420
56421static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
56422 TargetLowering::DAGCombinerInfo &DCI,
56423 const X86Subtarget &Subtarget) {
56424 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
56425 const SDValue LHS = N->getOperand(0);
56426 const SDValue RHS = N->getOperand(1);
56427 EVT VT = N->getValueType(0);
56428 EVT OpVT = LHS.getValueType();
56429 SDLoc DL(N);
56430
56431 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56432 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
56433 Subtarget))
56434 return V;
56435 }
56436
56437 if (VT == MVT::i1) {
56438 X86::CondCode X86CC;
56439 if (SDValue V =
56440 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
56441 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
56442 }
56443
56444 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
56445 if (OpVT.isScalarInteger()) {
56446 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
56447 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
56448 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
56449 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
56450 if (N0.getOperand(0) == N1)
56451 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56452 N0.getOperand(1));
56453 if (N0.getOperand(1) == N1)
56454 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
56455 N0.getOperand(0));
56456 }
56457 return SDValue();
56458 };
56459 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
56460 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56461 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
56462 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56463
56464 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
56465 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
56466 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
56467 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
56468 if (N0.getOperand(0) == N1)
56469 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56470 DAG.getNOT(DL, N0.getOperand(1), OpVT));
56471 if (N0.getOperand(1) == N1)
56472 return DAG.getNode(ISD::AND, DL, OpVT, N1,
56473 DAG.getNOT(DL, N0.getOperand(0), OpVT));
56474 }
56475 return SDValue();
56476 };
56477 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
56478 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56479 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
56480 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
56481
56482 // cmpeq(trunc(x),C) --> cmpeq(x,C)
56483 // cmpne(trunc(x),C) --> cmpne(x,C)
56484 // iff x upper bits are zero.
56485 if (LHS.getOpcode() == ISD::TRUNCATE &&
56486 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
56487 isa<ConstantSDNode>(RHS) && !isNullConstant(RHS)) {
56488 EVT SrcVT = LHS.getOperand(0).getValueType();
56489 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
56490 OpVT.getScalarSizeInBits());
56491 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56492 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
56493 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
56494 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
56495 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
56496 }
56497
56498 // With C as a power of 2 and C != 0 and C != INT_MIN:
56499 // icmp eq Abs(X) C ->
56500 // (icmp eq A, C) | (icmp eq A, -C)
56501 // icmp ne Abs(X) C ->
56502 // (icmp ne A, C) & (icmp ne A, -C)
56503 // Both of these patterns can be better optimized in
56504 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
56505 // integers which is checked above.
56506 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
56507 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
56508 const APInt &CInt = C->getAPIntValue();
56509 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
56510 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
56511 SDValue BaseOp = LHS.getOperand(0);
56512 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
56513 SDValue SETCC1 = DAG.getSetCC(
56514 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
56515 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
56516 SETCC0, SETCC1);
56517 }
56518 }
56519 }
56520 }
56521 }
56522
56523 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
56524 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
56525 // Using temporaries to avoid messing up operand ordering for later
56526 // transformations if this doesn't work.
56527 SDValue Op0 = LHS;
56528 SDValue Op1 = RHS;
56529 ISD::CondCode TmpCC = CC;
56530 // Put build_vector on the right.
56531 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
56532 std::swap(Op0, Op1);
56533 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
56534 }
56535
56536 bool IsSEXT0 =
56537 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
56538 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
56539 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
56540
56541 if (IsSEXT0 && IsVZero1) {
56542 assert(VT == Op0.getOperand(0).getValueType() &&
56543 "Unexpected operand type");
56544 if (TmpCC == ISD::SETGT)
56545 return DAG.getConstant(0, DL, VT);
56546 if (TmpCC == ISD::SETLE)
56547 return DAG.getConstant(1, DL, VT);
56548 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
56549 return DAG.getNOT(DL, Op0.getOperand(0), VT);
56550
56551 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
56552 "Unexpected condition code!");
56553 return Op0.getOperand(0);
56554 }
56555
56556 if (IsVZero1)
56557 if (SDValue V =
56558 combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
56559 return V;
56560 }
56561
56562 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
56563 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
56564 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
56565 // it's going to a mask, there are signed AVX512 comparisons).
56566 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
56567 bool CanMakeSigned = false;
56568 if (ISD::isUnsignedIntSetCC(CC)) {
56569 KnownBits CmpKnown =
56570 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
56571 // If we know LHS/RHS share the same sign bit at each element we can
56572 // make this signed.
56573 // NOTE: `computeKnownBits` on a vector type aggregates common bits
56574 // across all lanes. So a pattern where the sign varies from lane to
56575 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
56576 // missed. We could get around this by demanding each lane
56577 // independently, but this isn't the most important optimization and
56578 // that may eat into compile time.
56579 CanMakeSigned =
56580 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
56581 }
56582 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
56583 SDValue LHSOut = LHS;
56584 SDValue RHSOut = RHS;
56585 ISD::CondCode NewCC = CC;
56586 switch (CC) {
56587 case ISD::SETGE:
56588 case ISD::SETUGE:
56589 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
56590 /*NSW*/ true))
56591 LHSOut = NewLHS;
56592 else if (SDValue NewRHS = incDecVectorConstant(
56593 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
56594 RHSOut = NewRHS;
56595 else
56596 break;
56597
56598 [[fallthrough]];
56599 case ISD::SETUGT:
56600 NewCC = ISD::SETGT;
56601 break;
56602
56603 case ISD::SETLE:
56604 case ISD::SETULE:
56605 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
56606 /*NSW*/ true))
56607 LHSOut = NewLHS;
56608 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
56609 /*NSW*/ true))
56610 RHSOut = NewRHS;
56611 else
56612 break;
56613
56614 [[fallthrough]];
56615 case ISD::SETULT:
56616 // Will be swapped to SETGT in LowerVSETCC*.
56617 NewCC = ISD::SETLT;
56618 break;
56619 default:
56620 break;
56621 }
56622 if (NewCC != CC) {
56623 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
56624 NewCC, DL, DAG, Subtarget))
56625 return R;
56626 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
56627 }
56628 }
56629 }
56630
56631 if (SDValue R =
56632 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
56633 return R;
56634
56635 // In the middle end transforms:
56636 // `(or (icmp eq X, C), (icmp eq X, C+1))`
56637 // -> `(icmp ult (add x, -C), 2)`
56638 // Likewise inverted cases with `ugt`.
56639 //
56640 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
56641 // in worse codegen. So, undo the middle-end transform and go back to `(or
56642 // (icmp eq), (icmp eq))` form.
56643 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
56644 // the xmm approach.
56645 //
56646 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
56647 // ne))` as it doesn't end up being a win in instruction count.
56648 // TODO: We might want to do this for avx512 as well if we `sext` the result.
56649 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
56650 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
56651 !Subtarget.hasAVX512() &&
56652 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
56653 Subtarget.hasAVX2()) &&
56654 LHS.hasOneUse()) {
56655
56656 APInt CmpC;
56657 SDValue AddC = LHS.getOperand(1);
56658 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
56660 // See which form we have depending on the constant/condition.
56661 SDValue C0 = SDValue();
56662 SDValue C1 = SDValue();
56663
56664 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
56665 // we will end up generating an additional constant. Keeping it in the
56666 // current form has a slight latency cost, but it's probably worth saving a
56667 // constant.
56670 // Pass
56671 }
56672 // Normal Cases
56673 else if ((CC == ISD::SETULT && CmpC == 2) ||
56674 (CC == ISD::SETULE && CmpC == 1)) {
56675 // These will constant fold.
56676 C0 = DAG.getNegative(AddC, DL, OpVT);
56677 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
56678 DAG.getAllOnesConstant(DL, OpVT));
56679 }
56680 // Inverted Cases
56681 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
56682 (CC == ISD::SETUGE && (-CmpC) == 2)) {
56683 // These will constant fold.
56684 C0 = DAG.getNOT(DL, AddC, OpVT);
56685 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
56686 DAG.getAllOnesConstant(DL, OpVT));
56687 }
56688 if (C0 && C1) {
56689 SDValue NewLHS =
56690 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
56691 SDValue NewRHS =
56692 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
56693 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
56694 }
56695 }
56696 }
56697
56698 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
56699 // to avoid scalarization via legalization because v4i32 is not a legal type.
56700 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
56701 LHS.getValueType() == MVT::v4f32)
56702 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
56703
56704 // X pred 0.0 --> X pred -X
56705 // If the negation of X already exists, use it in the comparison. This removes
56706 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
56707 // instructions in patterns with a 'select' node.
56709 SDVTList FNegVT = DAG.getVTList(OpVT);
56710 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
56711 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
56712 }
56713
56714 return SDValue();
56715}
56716
56719 const X86Subtarget &Subtarget) {
56720 SDValue Src = N->getOperand(0);
56721 MVT SrcVT = Src.getSimpleValueType();
56722 MVT VT = N->getSimpleValueType(0);
56723 unsigned NumBits = VT.getScalarSizeInBits();
56724 unsigned NumElts = SrcVT.getVectorNumElements();
56725 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
56726 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
56727
56728 // Perform constant folding.
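// e.g. movmsk(v4i32 <-1, 0, -1, 0>) folds to 0b0101: bit i of the result is
// the sign bit of element i.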
56729 APInt UndefElts;
56730 SmallVector<APInt, 32> EltBits;
56731 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
56732 /*AllowWholeUndefs*/ true,
56733 /*AllowPartialUndefs*/ true)) {
56734 APInt Imm(32, 0);
56735 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
56736 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56737 Imm.setBit(Idx);
56738
56739 return DAG.getConstant(Imm, SDLoc(N), VT);
56740 }
56741
56742 // Look through int->fp bitcasts that don't change the element width.
56743 unsigned EltWidth = SrcVT.getScalarSizeInBits();
56744 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
56745 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
56746 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
56747
56748 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
56749 // with scalar comparisons.
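// e.g. for v4i32: movmsk(not(X)) == xor(movmsk(X), 0b1111).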
56750 if (SDValue NotSrc = IsNOT(Src, DAG)) {
56751 SDLoc DL(N);
56752 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56753 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
56754 return DAG.getNode(ISD::XOR, DL, VT,
56755 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
56756 DAG.getConstant(NotMask, DL, VT));
56757 }
56758
56759 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
56760 // results with scalar comparisons.
56761 if (Src.getOpcode() == X86ISD::PCMPGT &&
56762 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
56763 SDLoc DL(N);
56764 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
56765 return DAG.getNode(ISD::XOR, DL, VT,
56766 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
56767 DAG.getConstant(NotMask, DL, VT));
56768 }
56769
56770 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
56771 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
56772 // iff pow2splat(c1).
56773 // Use KnownBits to determine if only a single bit is non-zero
56774 // in each element (pow2 or zero), and shift that bit to the msb.
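// e.g. for v4i32 with C1 == splat(0x8): each element's bit 3 is shifted up by
// 28 into the sign bit, so each MOVMSK result bit reflects bit 3 of the
// corresponding source element.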
56775 if (Src.getOpcode() == X86ISD::PCMPEQ) {
56776 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
56777 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
56778 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
56779 if (KnownLHS.countMaxPopulation() == 1 &&
56780 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
56781 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
56782 SDLoc DL(N);
56783 MVT ShiftVT = SrcVT;
56784 SDValue ShiftLHS = Src.getOperand(0);
56785 SDValue ShiftRHS = Src.getOperand(1);
56786 if (ShiftVT.getScalarType() == MVT::i8) {
56787 // vXi8 shifts - we only care about the sign bit, so we can use PSLLW.
56788 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
56789 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
56790 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
56791 }
56792 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56793 ShiftLHS, ShiftAmt, DAG);
56794 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
56795 ShiftRHS, ShiftAmt, DAG);
56796 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
56797 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
56798 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
56799 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
56800 }
56801 }
56802
56803 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
56804 if (N->isOnlyUserOf(Src.getNode())) {
56806 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
56807 APInt UndefElts;
56808 SmallVector<APInt, 32> EltBits;
56809 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
56810 UndefElts, EltBits)) {
56811 APInt Mask = APInt::getZero(NumBits);
56812 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
56813 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
56814 Mask.setBit(Idx);
56815 }
56816 SDLoc DL(N);
56817 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
56818 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
56819 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
56820 DAG.getConstant(Mask, DL, VT));
56821 }
56822 }
56823 }
56824
56825 // Simplify the inputs.
56826 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56827 APInt DemandedMask(APInt::getAllOnes(NumBits));
56828 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56829 return SDValue(N, 0);
56830
56831 return SDValue();
56832}
56833
56836 const X86Subtarget &Subtarget) {
56837 MVT VT = N->getSimpleValueType(0);
56838 unsigned NumBits = VT.getScalarSizeInBits();
56839
56840 // Simplify the inputs.
56841 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56842 APInt DemandedMask(APInt::getAllOnes(NumBits));
56843 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
56844 return SDValue(N, 0);
56845
56846 return SDValue();
56847}
56848
56852 SDValue Mask = MemOp->getMask();
56853
56854 // With vector masks we only demand the upper bit of the mask.
56855 if (Mask.getScalarValueSizeInBits() != 1) {
56856 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56857 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
56858 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
56859 if (N->getOpcode() != ISD::DELETED_NODE)
56860 DCI.AddToWorklist(N);
56861 return SDValue(N, 0);
56862 }
56863 }
56864
56865 return SDValue();
56866}
56867
56869 SDValue Index, SDValue Base, SDValue Scale,
56870 SelectionDAG &DAG) {
56871 SDLoc DL(GorS);
56872
56873 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
56874 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
56875 Gather->getMask(), Base, Index, Scale } ;
56876 return DAG.getMaskedGather(Gather->getVTList(),
56877 Gather->getMemoryVT(), DL, Ops,
56878 Gather->getMemOperand(),
56879 Gather->getIndexType(),
56880 Gather->getExtensionType());
56881 }
56882 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
56883 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
56884 Scatter->getMask(), Base, Index, Scale };
56885 return DAG.getMaskedScatter(Scatter->getVTList(),
56886 Scatter->getMemoryVT(), DL,
56887 Ops, Scatter->getMemOperand(),
56888 Scatter->getIndexType(),
56889 Scatter->isTruncatingStore());
56890}
56891
56894 SDLoc DL(N);
56895 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
56896 SDValue Index = GorS->getIndex();
56897 SDValue Base = GorS->getBasePtr();
56898 SDValue Scale = GorS->getScale();
56899 EVT IndexVT = Index.getValueType();
56900 EVT IndexSVT = IndexVT.getVectorElementType();
56901 unsigned IndexWidth = Index.getScalarValueSizeInBits();
56902 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56903 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
56904
56905 if (DCI.isBeforeLegalize()) {
56906 // Attempt to move shifted index into the address scale, allows further
56907 // index truncation below.
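// e.g. when the shift input has more than one sign bit,
//   (gather base, (shl index, 1), scale 4) --> (gather base, index, scale 8)
// once the resulting shl-by-0 folds away.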
56908 if (Index.getOpcode() == ISD::SHL && IndexSVT == PtrVT &&
56909 isa<ConstantSDNode>(Scale)) {
56910 unsigned ScaleAmt = Scale->getAsZExtVal();
56911 assert(isPowerOf2_32(ScaleAmt) && "Scale must be a power of 2");
56912 unsigned Log2ScaleAmt = Log2_32(ScaleAmt);
56913 unsigned MaskBits = IndexWidth - Log2ScaleAmt;
56914 APInt DemandedBits = APInt::getLowBitsSet(IndexWidth, MaskBits);
56915 if (TLI.SimplifyDemandedBits(Index, DemandedBits, DCI)) {
56916 if (N->getOpcode() != ISD::DELETED_NODE)
56917 DCI.AddToWorklist(N);
56918 return SDValue(N, 0);
56919 }
56920 if (auto MinShAmt = DAG.getValidMinimumShiftAmount(Index)) {
56921 if (*MinShAmt >= 1 && Log2ScaleAmt < 3 &&
56922 DAG.ComputeNumSignBits(Index.getOperand(0)) > 1) {
56923 SDValue ShAmt = Index.getOperand(1);
56924 SDValue NewShAmt =
56925 DAG.getNode(ISD::SUB, DL, ShAmt.getValueType(), ShAmt,
56926 DAG.getConstant(1, DL, ShAmt.getValueType()));
56927 SDValue NewIndex = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
56928 Index.getOperand(0), NewShAmt);
56929 SDValue NewScale =
56930 DAG.getConstant(ScaleAmt * 2, DL, Scale.getValueType());
56931 return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG);
56932 }
56933 }
56934 }
56935
56936 // Shrink indices if they are wider than 32 bits.
56937 // Only do this before legalize types since v2i64 could become v2i32.
56938 // FIXME: We could check that the type is legal if we're after legalize
56939 // types, but then we would need to construct test cases where that happens.
56940 if (IndexWidth > 32 && DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
56941 EVT NewVT = IndexVT.changeVectorElementType(MVT::i32);
56942
56943 // FIXME: We could support more than just constant fold, but we need to be
56944 // careful with costing. A truncate that can be optimized out would be
56945 // fine. Otherwise we might only want to create a truncate if it avoids
56946 // a split.
56947 if (SDValue TruncIndex =
56948 DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index))
56949 return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG);
56950
56951 // Shrink any sign/zero extend from a type of 32 bits or smaller to one wider
56952 // than 32 bits if there are sufficient sign bits. Only do this before
56953 // legalize types to avoid creating illegal types in truncate.
56954 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
56955 Index.getOpcode() == ISD::ZERO_EXTEND) &&
56956 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
56957 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56958 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56959 }
56960
56961 // Shrink if we remove an illegal type.
56962 if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) {
56963 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
56964 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
56965 }
56966 }
56967 }
56968
56969 // Try to move splat adders from the index operand to the base
56970 // pointer operand, taking care to multiply by the scale. We can only do
56971 // this when the index element type is the same as the pointer type.
56972 // Otherwise we would need to prove the math doesn't wrap before the scaling.
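// e.g. (gather base, (add index, splat(C)), scale)
//        --> (gather (add base, C * scale), index, scale)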
56973 if (Index.getOpcode() == ISD::ADD && IndexSVT == PtrVT &&
56974 isa<ConstantSDNode>(Scale)) {
56975 uint64_t ScaleAmt = Scale->getAsZExtVal();
56976
56977 for (unsigned I = 0; I != 2; ++I)
56978 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(I))) {
56979 BitVector UndefElts;
56980 if (SDValue Splat = BV->getSplatValue(&UndefElts)) {
56981 if (UndefElts.none()) {
56982 // If the splat value is constant we can add the scaled splat value
56983 // to the existing base.
56984 if (auto *C = dyn_cast<ConstantSDNode>(Splat)) {
56985 APInt Adder = C->getAPIntValue() * ScaleAmt;
56986 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
56987 DAG.getConstant(Adder, DL, PtrVT));
56988 SDValue NewIndex = Index.getOperand(1 - I);
56989 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56990 }
56991 // For non-constant cases, limit this to non-scaled cases.
56992 if (ScaleAmt == 1) {
56993 SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat);
56994 SDValue NewIndex = Index.getOperand(1 - I);
56995 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
56996 }
56997 }
56998 }
56999 // It's also possible the base is just a constant. In that case, just
57000 // replace it with 0 and move the displacement into the index.
57001 if (ScaleAmt == 1 && BV->isConstant() && isa<ConstantSDNode>(Base)) {
57002 SDValue Splat = DAG.getSplatBuildVector(IndexVT, DL, Base);
57003 // Combine the constant build_vector and the constant base.
57004 Splat =
57005 DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(I), Splat);
57006 // Add to the other half of the original Index add.
57007 SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT,
57008 Index.getOperand(1 - I), Splat);
57009 SDValue NewBase = DAG.getConstant(0, DL, PtrVT);
57010 return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG);
57011 }
57012 }
57013 }
57014
57015 if (DCI.isBeforeLegalizeOps()) {
57016 // Make sure the index is either i32 or i64
57017 if (IndexWidth != 32 && IndexWidth != 64) {
57018 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
57019 IndexVT = IndexVT.changeVectorElementType(EltVT);
57020 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
57021 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
57022 }
57023 }
57024
57025 // With vector masks we only demand the upper bit of the mask.
57026 SDValue Mask = GorS->getMask();
57027 if (Mask.getScalarValueSizeInBits() != 1) {
57028 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
57029 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
57030 if (N->getOpcode() != ISD::DELETED_NODE)
57031 DCI.AddToWorklist(N);
57032 return SDValue(N, 0);
57033 }
57034 }
57035
57036 return SDValue();
57037}
57038
57039// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
57041 const X86Subtarget &Subtarget) {
57042 SDLoc DL(N);
57043 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
57044 SDValue EFLAGS = N->getOperand(1);
57045
57046 // Try to simplify the EFLAGS and condition code operands.
57047 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
57048 return getSETCC(CC, Flags, DL, DAG);
57049
57050 return SDValue();
57051}
57052
57053/// Optimize branch condition evaluation.
57055 const X86Subtarget &Subtarget) {
57056 SDLoc DL(N);
57057 SDValue EFLAGS = N->getOperand(3);
57058 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
57059
57060 // Try to simplify the EFLAGS and condition code operands.
57061 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
57062 // RAUW them under us.
57063 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
57064 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
57065 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
57066 N->getOperand(1), Cond, Flags);
57067 }
57068
57069 return SDValue();
57070}
57071
57072// TODO: Could we move this to DAGCombine?
57074 SelectionDAG &DAG) {
57075 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
57076 // to optimize away operation when it's from a constant.
57077 //
57078 // The general transformation is:
57079 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
57080 // AND(VECTOR_CMP(x,y), constant2)
57081 // constant2 = UNARYOP(constant)
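// e.g. (sint_to_fp (and (vector_cmp x, y), <1,1,1,1>))
//        --> (and (vector_cmp x, y), bitcast(<1.0,1.0,1.0,1.0>))
// because each lane of the compare is all-zeros or all-ones.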
57082
57083 // Early exit if this isn't a vector operation, the operand of the
57084 // unary operation isn't a bitwise AND, or if the sizes of the operations
57085 // aren't the same.
57086 EVT VT = N->getValueType(0);
57087 bool IsStrict = N->isStrictFPOpcode();
57088 unsigned NumEltBits = VT.getScalarSizeInBits();
57089 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57090 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
57091 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
57092 VT.getSizeInBits() != Op0.getValueSizeInBits())
57093 return SDValue();
57094
57095 // Now check that the other operand of the AND is a constant. We could
57096 // make the transformation for non-constant splats as well, but it's unclear
57097 // that would be a benefit as it would not eliminate any operations, just
57098 // perform one more step in scalar code before moving to the vector unit.
57099 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
57100 // Bail out if the vector isn't a constant.
57101 if (!BV->isConstant())
57102 return SDValue();
57103
57104 // Everything checks out. Build up the new and improved node.
57105 SDLoc DL(N);
57106 EVT IntVT = BV->getValueType(0);
57107 // Create a new constant of the appropriate type for the transformed
57108 // DAG.
57109 SDValue SourceConst;
57110 if (IsStrict)
57111 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
57112 {N->getOperand(0), SDValue(BV, 0)});
57113 else
57114 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
57115 // The AND node needs bitcasts to/from an integer vector type around it.
57116 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
57117 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
57118 MaskConst);
57119 SDValue Res = DAG.getBitcast(VT, NewAnd);
57120 if (IsStrict)
57121 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
57122 return Res;
57123 }
57124
57125 return SDValue();
57126}
57127
57128/// If we are converting a value to floating-point, try to replace scalar
57129/// truncate of an extracted vector element with a bitcast. This tries to keep
57130 /// the sequence on XMM registers rather than moving between vector registers and GPRs.
57132 // TODO: This is currently only used by combineSIntToFP, but it is generalized
57133 // to allow being called by any similar cast opcode.
57134 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
57135 SDValue Trunc = N->getOperand(0);
57136 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
57137 return SDValue();
57138
57139 SDValue ExtElt = Trunc.getOperand(0);
57140 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
57141 !isNullConstant(ExtElt.getOperand(1)))
57142 return SDValue();
57143
57144 EVT TruncVT = Trunc.getValueType();
57145 EVT SrcVT = ExtElt.getValueType();
57146 unsigned DestWidth = TruncVT.getSizeInBits();
57147 unsigned SrcWidth = SrcVT.getSizeInBits();
57148 if (SrcWidth % DestWidth != 0)
57149 return SDValue();
57150
57151 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
57152 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
57153 unsigned VecWidth = SrcVecVT.getSizeInBits();
57154 unsigned NumElts = VecWidth / DestWidth;
57155 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
57156 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
57157 SDLoc DL(N);
57158 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
57159 BitcastVec, ExtElt.getOperand(1));
57160 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
57161}
57162
57164 const X86Subtarget &Subtarget) {
57165 bool IsStrict = N->isStrictFPOpcode();
57166 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57167 EVT VT = N->getValueType(0);
57168 EVT InVT = Op0.getValueType();
57169
57170 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57171 // for it. Therefore for type sizes equal to or smaller than 32 just go with i32.
57172 // if hasFP16 support:
57173 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
57174 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
57175 // else
57176 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
57177 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
57178 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57179 unsigned ScalarSize = InVT.getScalarSizeInBits();
57180 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57181 ScalarSize >= 64)
57182 return SDValue();
57183 SDLoc dl(N);
57184 EVT DstVT =
57186 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57187 : ScalarSize < 32 ? MVT::i32
57188 : MVT::i64,
57189 InVT.getVectorNumElements());
57190 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57191 if (IsStrict)
57192 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57193 {N->getOperand(0), P});
57194 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57195 }
57196
57197 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
57198 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
57199 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
57200 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57201 VT.getScalarType() != MVT::f16) {
57202 SDLoc dl(N);
57203 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57204 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
57205
57206 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
57207 if (IsStrict)
57208 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57209 {N->getOperand(0), P});
57210 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57211 }
57212
57213 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
57214 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
57215 // the optimization here.
57216 SDNodeFlags Flags = N->getFlags();
57217 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
57218 if (IsStrict)
57219 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
57220 {N->getOperand(0), Op0});
57221 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
57222 }
57223
57224 return SDValue();
57225}
57226
57229 const X86Subtarget &Subtarget) {
57230 // First try to optimize away the conversion entirely when it's
57231 // conditionally from a constant. Vectors only.
57232 bool IsStrict = N->isStrictFPOpcode();
57234 return Res;
57235
57236 // Now move on to more general possibilities.
57237 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
57238 EVT VT = N->getValueType(0);
57239 EVT InVT = Op0.getValueType();
57240
57241 // Using i16 as an intermediate type is a bad idea, unless we have HW support
57242 // for it. Therefore for type sizes equal to or smaller than 32 just go with i32.
57243 // if hasFP16 support:
57244 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
57245 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
57246 // else
57247 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
57248 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
57249 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
57250 unsigned ScalarSize = InVT.getScalarSizeInBits();
57251 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
57252 ScalarSize >= 64)
57253 return SDValue();
57254 SDLoc dl(N);
57255 EVT DstVT =
57257 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
57258 : ScalarSize < 32 ? MVT::i32
57259 : MVT::i64,
57260 InVT.getVectorNumElements());
57261 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57262 if (IsStrict)
57263 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57264 {N->getOperand(0), P});
57265 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57266 }
57267
57268 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
57269 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
57270 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
57271 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
57272 VT.getScalarType() != MVT::f16) {
57273 SDLoc dl(N);
57274 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
57275 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
57276 if (IsStrict)
57277 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57278 {N->getOperand(0), P});
57279 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
57280 }
57281
57282 // Without AVX512DQ we only support i64 to float scalar conversion. For both
57283 // vectors and scalars, see if we know that the upper bits are all the sign
57284 // bit, in which case we can truncate the input to i32 and convert from that.
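// e.g. (sint_to_fp (i64 (sext X:i32))) --> (sint_to_fp X:i32), so the
// conversion only needs the 32-bit form.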
57285 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
57286 unsigned BitWidth = InVT.getScalarSizeInBits();
57287 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
57288 if (NumSignBits >= (BitWidth - 31)) {
57289 EVT TruncVT = MVT::i32;
57290 if (InVT.isVector())
57291 TruncVT = InVT.changeVectorElementType(TruncVT);
57292 SDLoc dl(N);
57293 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
57294 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
57295 if (IsStrict)
57296 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
57297 {N->getOperand(0), Trunc});
57298 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
57299 }
57300 // If we're after legalize and the type is v2i32 we need to shuffle and
57301 // use CVTSI2P.
57302 assert(InVT == MVT::v2i64 && "Unexpected VT!");
57303 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
57304 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
57305 { 0, 2, -1, -1 });
57306 if (IsStrict)
57307 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
57308 {N->getOperand(0), Shuf});
57309 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
57310 }
57311 }
57312
57313 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
57314 // a 32-bit target where SSE doesn't support i64->FP operations.
57315 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
57316 Op0.getOpcode() == ISD::LOAD) {
57317 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
57318
57319 // This transformation is not supported if the result type is f16 or f128.
57320 if (VT == MVT::f16 || VT == MVT::f128)
57321 return SDValue();
57322
57323 // If we have AVX512DQ we can use packed conversion instructions unless
57324 // the VT is f80.
57325 if (Subtarget.hasDQI() && VT != MVT::f80)
57326 return SDValue();
57327
57328 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
57329 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
57330 std::pair<SDValue, SDValue> Tmp =
57331 Subtarget.getTargetLowering()->BuildFILD(
57332 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
57333 Ld->getPointerInfo(), Ld->getBaseAlign(), DAG);
57334 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
57335 return Tmp.first;
57336 }
57337 }
57338
57339 if (IsStrict)
57340 return SDValue();
57341
57342 if (SDValue V = combineToFPTruncExtElt(N, DAG))
57343 return V;
57344
57345 return SDValue();
57346}
57347
57349 const X86Subtarget &Subtarget) {
57350 EVT VT = N->getValueType(0);
57351 SDValue Src = N->getOperand(0);
57352 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::FRINT &&
57353 VT.getScalarType() == MVT::i32 && Src.hasOneUse())
57354 return DAG.getNode(ISD::LRINT, SDLoc(N), VT, Src.getOperand(0));
57355
57356 return SDValue();
57357}
57358
57359// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
57361 const X86Subtarget &Subtarget) {
57362 if (!Subtarget.hasAVX10_2())
57363 return SDValue();
57364
57365 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
57366 EVT SrcVT = N->getOperand(0).getValueType();
57367 EVT DstVT = N->getValueType(0);
57368 SDLoc dl(N);
57369
57370 if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
57371 SDValue V2F32Value = DAG.getUNDEF(SrcVT);
57372
57373 // Concatenate the original v2f32 input and V2F32Value to create v4f32
57374 SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
57375 N->getOperand(0), V2F32Value);
57376
57377 // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
57378 if (IsSigned)
57379 return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
57380
57381 return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
57382 }
57383 return SDValue();
57384}
57385
57387 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57388
57389 for (const SDNode *User : Flags->users()) {
57390 X86::CondCode CC;
57391 switch (User->getOpcode()) {
57392 default:
57393 // Be conservative.
57394 return true;
57395 case X86ISD::SETCC:
57397 CC = (X86::CondCode)User->getConstantOperandVal(0);
57398 break;
57399 case X86ISD::BRCOND:
57400 case X86ISD::CMOV:
57401 CC = (X86::CondCode)User->getConstantOperandVal(2);
57402 break;
57403 }
57404
57405 switch (CC) {
57406 // clang-format off
57407 default: break;
57408 case X86::COND_A: case X86::COND_AE:
57409 case X86::COND_B: case X86::COND_BE:
57410 case X86::COND_O: case X86::COND_NO:
57411 case X86::COND_G: case X86::COND_GE:
57412 case X86::COND_L: case X86::COND_LE:
57413 return true;
57414 // clang-format on
57415 }
57416 }
57417
57418 return false;
57419}
57420
57421static bool onlyZeroFlagUsed(SDValue Flags) {
57422 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
57423
57424 for (const SDNode *User : Flags->users()) {
57425 unsigned CCOpNo;
57426 switch (User->getOpcode()) {
57427 default:
57428 // Be conservative.
57429 return false;
57430 case X86ISD::SETCC:
57432 CCOpNo = 0;
57433 break;
57434 case X86ISD::BRCOND:
57435 case X86ISD::CMOV:
57436 CCOpNo = 2;
57437 break;
57438 }
57439
57440 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
57441 if (CC != X86::COND_E && CC != X86::COND_NE)
57442 return false;
57443 }
57444
57445 return true;
57446}
57447
57450 const X86Subtarget &Subtarget) {
57451 // Only handle test patterns.
57452 if (!isNullConstant(N->getOperand(1)))
57453 return SDValue();
57454
57455 // If we have a CMP of a truncated binop, see if we can make a smaller binop
57456 // and use its flags directly.
57457 // TODO: Maybe we should try promoting compares that only use the zero flag
57458 // first if we can prove the upper bits with computeKnownBits?
57459 SDLoc dl(N);
57460 SDValue Op = N->getOperand(0);
57461 EVT VT = Op.getValueType();
57462 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57463
57464 if (SDValue CMP =
57465 combineX86SubCmpForFlags(N, SDValue(N, 0), DAG, DCI, Subtarget))
57466 return CMP;
57467
57468 // If we have a constant logical shift that's only used in a comparison
57469 // against zero turn it into an equivalent AND. This allows turning it into
57470 // a TEST instruction later.
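// e.g. for i32: (cmp (srl X, 3), 0) --> (cmp (and X, 0xFFFFFFF8), 0), which
// isel can then match as TEST.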
57471 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
57472 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
57473 onlyZeroFlagUsed(SDValue(N, 0))) {
57474 unsigned BitWidth = VT.getSizeInBits();
57475 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
57476 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
57477 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
57478 APInt Mask = Op.getOpcode() == ISD::SRL
57479 ? APInt::getHighBitsSet(BitWidth, MaskBits)
57480 : APInt::getLowBitsSet(BitWidth, MaskBits);
57481 if (Mask.isSignedIntN(32)) {
57482 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
57483 DAG.getConstant(Mask, dl, VT));
57484 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57485 DAG.getConstant(0, dl, VT));
57486 }
57487 }
57488 }
57489
57490 // If we're extracting from an AVX512 bool vector and comparing against zero,
57491 // then try to just bitcast the vector to an integer to use TEST/BT directly.
57492 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
57493 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
57494 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
57495 SDValue Src = Op.getOperand(0);
57496 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
57497 isNullConstant(Src.getOperand(1)) &&
57498 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
57499 SDValue BoolVec = Src.getOperand(0);
57500 unsigned ShAmt = 0;
57501 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
57502 ShAmt = BoolVec.getConstantOperandVal(1);
57503 BoolVec = BoolVec.getOperand(0);
57504 }
57505 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
57506 EVT VecVT = BoolVec.getValueType();
57507 unsigned BitWidth = VecVT.getVectorNumElements();
57508 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
57509 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
57510 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
57511 Op = DAG.getBitcast(BCVT, BoolVec);
57512 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
57513 DAG.getConstant(Mask, dl, BCVT));
57514 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57515 DAG.getConstant(0, dl, BCVT));
57516 }
57517 }
57518 }
57519
57520 // Peek through any zero-extend if we're only testing for a zero result.
57521 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
57522 SDValue Src = Op.getOperand(0);
57523 EVT SrcVT = Src.getValueType();
57524 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
57525 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
57526 DAG.getConstant(0, dl, SrcVT));
57527 }
57528
57529 // Look for a truncate.
57530 if (Op.getOpcode() != ISD::TRUNCATE)
57531 return SDValue();
57532
57533 SDValue Trunc = Op;
57534 Op = Op.getOperand(0);
57535
57536 // See if we can compare with zero against the truncation source,
57537 // which should help using the Z flag from many ops. Only do this for
57538 // i32 truncated op to prevent partial-reg compares of promoted ops.
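// e.g. (cmp (i8 (trunc X:i32)), 0) --> (cmp X:i32, 0) when the upper 24 bits
// of X are known to be zero.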
57539 EVT OpVT = Op.getValueType();
57540 APInt UpperBits =
57542 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
57543 onlyZeroFlagUsed(SDValue(N, 0))) {
57544 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57545 DAG.getConstant(0, dl, OpVT));
57546 }
57547
57548 // After this the truncate and arithmetic op must have a single use.
57549 if (!Trunc.hasOneUse() || !Op.hasOneUse())
57550 return SDValue();
57551
57552 unsigned NewOpc;
57553 switch (Op.getOpcode()) {
57554 default: return SDValue();
57555 case ISD::AND:
57556 // Skip AND with a constant. We have special handling for AND with an
57557 // immediate during isel to generate TEST instructions.
57558 if (isa<ConstantSDNode>(Op.getOperand(1)))
57559 return SDValue();
57560 NewOpc = X86ISD::AND;
57561 break;
57562 case ISD::OR: NewOpc = X86ISD::OR; break;
57563 case ISD::XOR: NewOpc = X86ISD::XOR; break;
57564 case ISD::ADD:
57565 // If the carry or overflow flag is used, we can't truncate.
57567 return SDValue();
57568 NewOpc = X86ISD::ADD;
57569 break;
57570 case ISD::SUB:
57571 // If the carry or overflow flag is used, we can't truncate.
57573 return SDValue();
57574 NewOpc = X86ISD::SUB;
57575 break;
57576 }
57577
57578 // We found an op we can narrow. Truncate its inputs.
57579 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
57580 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
57581
57582 // Use an X86-specific opcode to avoid DAG combine messing with it.
57583 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57584 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
57585
57586 // For AND, keep a CMP so that we can match the test pattern.
57587 if (NewOpc == X86ISD::AND)
57588 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
57589 DAG.getConstant(0, dl, VT));
57590
57591 // Return the flags.
57592 return Op.getValue(1);
57593}
57594
57597 const X86Subtarget &ST) {
57598 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
57599 "Expected X86ISD::ADD or X86ISD::SUB");
57600
57601 SDLoc DL(N);
57602 SDValue LHS = N->getOperand(0);
57603 SDValue RHS = N->getOperand(1);
57604 MVT VT = LHS.getSimpleValueType();
57605 bool IsSub = X86ISD::SUB == N->getOpcode();
57606 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
57607
57608 if (IsSub && isOneConstant(RHS) && !N->hasAnyUseOfValue(0))
57609 if (SDValue CMP = combineX86SubCmpForFlags(N, SDValue(N, 1), DAG, DCI, ST))
57610 return CMP;
57611
57612 // If we don't use the flag result, simplify back to a generic ADD/SUB.
57613 if (!N->hasAnyUseOfValue(1)) {
57614 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
57615 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
57616 }
57617
57618 // Fold any similar generic ADD/SUB opcodes to reuse this node.
57619 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
57620 SDValue Ops[] = {N0, N1};
57621 SDVTList VTs = DAG.getVTList(N->getValueType(0));
57622 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
57623 SDValue Op(N, 0);
57624 if (Negate) {
57625 // Bail if this is only used by a user of the x86 add/sub.
57626 if (GenericAddSub->hasOneUse() &&
57627 GenericAddSub->user_begin()->isOnlyUserOf(N))
57628 return;
57629 Op = DAG.getNegative(Op, DL, VT);
57630 }
57631 DCI.CombineTo(GenericAddSub, Op);
57632 }
57633 };
57634 MatchGeneric(LHS, RHS, false);
57635 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
57636
57637 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
57638 // EFLAGS result doesn't change.
57639 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
57640 /*ZeroSecondOpOnly*/ true);
57641}
57642
57644 SDValue LHS = N->getOperand(0);
57645 SDValue RHS = N->getOperand(1);
57646 SDValue BorrowIn = N->getOperand(2);
57647
57648 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
57649 MVT VT = N->getSimpleValueType(0);
57650 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57651 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
57652 }
57653
57654 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
57655 // iff the flag result is dead.
57656 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
57657 !N->hasAnyUseOfValue(1))
57658 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57659 LHS.getOperand(1), BorrowIn);
57660
57661 return SDValue();
57662}
57663
57664// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
57667 SDValue LHS = N->getOperand(0);
57668 SDValue RHS = N->getOperand(1);
57669 SDValue CarryIn = N->getOperand(2);
57670 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
57671 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
57672
57673 // Canonicalize constant to RHS.
57674 if (LHSC && !RHSC)
57675 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
57676 CarryIn);
57677
57678 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
57679 // the result is either zero or one (depending on the input carry bit).
57680 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
57681 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
57682 // We don't have a good way to replace an EFLAGS use, so only do this when
57683 // dead right now.
57684 SDValue(N, 1).use_empty()) {
57685 SDLoc DL(N);
57686 EVT VT = N->getValueType(0);
57687 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
57688 SDValue Res1 = DAG.getNode(
57689 ISD::AND, DL, VT,
57691 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
57692 DAG.getConstant(1, DL, VT));
57693 return DCI.CombineTo(N, Res1, CarryOut);
57694 }
57695
57696 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
57697 // iff the flag result is dead.
57698 // TODO: Allow flag result if C1+C2 doesn't overflow (signed or unsigned).
57699 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
57700 SDLoc DL(N);
57701 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
57702 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
57703 DAG.getConstant(0, DL, LHS.getValueType()),
57704 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
57705 }
57706
57707 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
57708 MVT VT = N->getSimpleValueType(0);
57709 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
57710 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
57711 }
57712
57713 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
57714 // iff the flag result is dead.
57715 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
57716 !N->hasAnyUseOfValue(1))
57717 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
57718 LHS.getOperand(1), CarryIn);
57719
57720 return SDValue();
57721}
57722
57724 const SDLoc &DL, EVT VT,
57725 const X86Subtarget &Subtarget) {
57726 using namespace SDPatternMatch;
57727
57728 // Example of pattern we try to detect:
57729 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
57730 //(add (build_vector (extract_elt t, 0),
57731 // (extract_elt t, 2),
57732 // (extract_elt t, 4),
57733 // (extract_elt t, 6)),
57734 // (build_vector (extract_elt t, 1),
57735 // (extract_elt t, 3),
57736 // (extract_elt t, 5),
57737 // (extract_elt t, 7)))
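// This is exactly what VPMADDWD computes: for each i32 lane i the result is
//   sext(x0[2*i]) * sext(x1[2*i]) + sext(x0[2*i+1]) * sext(x1[2*i+1]).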
57738
57739 if (!Subtarget.hasSSE2())
57740 return SDValue();
57741
57742 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57743 VT.getVectorNumElements() < 4 ||
57745 return SDValue();
57746
57747 SDValue Op0, Op1, Accum;
57752 m_Value(Op1))))))
57753 return SDValue();
57754
57755 // Check if one of Op0,Op1 is of the form:
57756 // (build_vector (extract_elt Mul, 0),
57757 // (extract_elt Mul, 2),
57758 // (extract_elt Mul, 4),
57759 // ...
57760 // the other is of the form:
57761 // (build_vector (extract_elt Mul, 1),
57762 // (extract_elt Mul, 3),
57763 // (extract_elt Mul, 5),
57764 // ...
57765 // and identify Mul.
57766 SDValue Mul;
57767 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
57768 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
57769 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
57771 // TODO: Be more tolerant of undefs.
57771 APInt Idx0L, Idx0H, Idx1L, Idx1H;
57772 SDValue Vec0L, Vec0H, Vec1L, Vec1H;
57773 if (!sd_match(Op0L, m_ExtractElt(m_Value(Vec0L), m_ConstInt(Idx0L))) ||
57774 !sd_match(Op0H, m_ExtractElt(m_Value(Vec0H), m_ConstInt(Idx0H))) ||
57775 !sd_match(Op1L, m_ExtractElt(m_Value(Vec1L), m_ConstInt(Idx1L))) ||
57776 !sd_match(Op1H, m_ExtractElt(m_Value(Vec1H), m_ConstInt(Idx1H))))
57777 return SDValue();
57778 // Commutativity of mul allows factors of a product to reorder.
57779 if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
57780 std::swap(Idx0L, Idx1L);
57781 if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
57782 std::swap(Idx0H, Idx1H);
57783 // Commutativity of add allows pairs of factors to reorder.
57784 if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
57785 std::swap(Idx0L, Idx0H);
57786 std::swap(Idx1L, Idx1H);
57787 }
57788 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
57789 Idx1H != 2 * i + 3)
57790 return SDValue();
57791 if (!Mul) {
57792 // First time an extract_elt's source vector is visited. Must be a MUL
57793 // with 2X number of vector elements than the BUILD_VECTOR.
57794 // Both extracts must be from same MUL.
57795 Mul = Vec0L;
57796 if (Mul.getOpcode() != ISD::MUL ||
57797 Mul.getValueType().getVectorNumElements() != 2 * e)
57798 return SDValue();
57799 }
57800 // Check that the extract is from the same MUL previously seen.
57801 if (Mul != Vec0L || Mul != Vec1L || Mul != Vec0H || Mul != Vec1H)
57802 return SDValue();
57803 }
57804
57805 // Check if the Mul source can be safely shrunk.
57807 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
57809 return SDValue();
57810
57811 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57812 VT.getVectorNumElements() * 2);
57813 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
57814 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
57815
57816 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57818 EVT InVT = Ops[0].getValueType();
57819 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
57820 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57821 InVT.getVectorNumElements() / 2);
57822 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57823 };
57824 SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
57825 if (Accum)
57826 R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
57827 return R;
57828}
57829
57830// Attempt to turn this pattern into PMADDWD.
57831// (add (mul (sext (build_vector)), (sext (build_vector))),
57832// (mul (sext (build_vector)), (sext (build_vector)))
57834 const SDLoc &DL, EVT VT,
57835 const X86Subtarget &Subtarget) {
57836 using namespace SDPatternMatch;
57837
57838 if (!Subtarget.hasSSE2())
57839 return SDValue();
57840
57841 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
57842 VT.getVectorNumElements() < 4 ||
57844 return SDValue();
57845
57846 // All inputs need to be sign extends.
57847 // TODO: Support ZERO_EXTEND from known positive?
57848 SDValue N00, N01, N10, N11;
57849 if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
57850 m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
57851 return SDValue();
57852
57853 // Must be extending from vXi16.
57854 EVT InVT = N00.getValueType();
57855 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
57856 N10.getValueType() != InVT || N11.getValueType() != InVT)
57857 return SDValue();
57858
57859 // All inputs should be build_vectors.
57860 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
57861 N01.getOpcode() != ISD::BUILD_VECTOR ||
57862 N10.getOpcode() != ISD::BUILD_VECTOR ||
57864 return SDValue();
57865
57866 // For each element, we need to ensure we have an odd element from one vector
57867 // multiplied by the odd element of another vector and the even element from
57868 // one of the same vectors being multiplied by the even element from the
57869 // other vector. So for each element i, we need to make sure this operation
57870 // is performed:
57871 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
57872 SDValue In0, In1;
57873 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
57874 SDValue N00Elt = N00.getOperand(i);
57875 SDValue N01Elt = N01.getOperand(i);
57876 SDValue N10Elt = N10.getOperand(i);
57877 SDValue N11Elt = N11.getOperand(i);
57878 // TODO: Be more tolerant of undefs.
57879 SDValue N00In, N01In, N10In, N11In;
57880 APInt IdxN00, IdxN01, IdxN10, IdxN11;
57881 if (!sd_match(N00Elt, m_ExtractElt(m_Value(N00In), m_ConstInt(IdxN00))) ||
57882 !sd_match(N01Elt, m_ExtractElt(m_Value(N01In), m_ConstInt(IdxN01))) ||
57883 !sd_match(N10Elt, m_ExtractElt(m_Value(N10In), m_ConstInt(IdxN10))) ||
57884 !sd_match(N11Elt, m_ExtractElt(m_Value(N11In), m_ConstInt(IdxN11))))
57885 return SDValue();
57886 // Add is commutative so indices can be reordered.
57887 if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
57888 std::swap(IdxN00, IdxN10);
57889 std::swap(IdxN01, IdxN11);
57890 }
57891 // N0 indices must be the even element. N1 indices must be the next odd element.
57892 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
57893 IdxN11 != 2 * i + 1)
57894 return SDValue();
57895
57896 // First time we find an input capture it.
57897 if (!In0) {
57898 In0 = N00In;
57899 In1 = N01In;
57900
57901 // The input vectors must be at least as wide as the output.
57902 // If they are larger than the output, we extract a subvector below.
57903 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
57904 In1.getValueSizeInBits() < VT.getSizeInBits())
57905 return SDValue();
57906 }
57907 // Mul is commutative so the input vectors can be in any order.
57908 // Canonicalize to make the compares easier.
57909 if (In0 != N00In)
57910 std::swap(N00In, N01In);
57911 if (In0 != N10In)
57912 std::swap(N10In, N11In);
57913 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
57914 return SDValue();
57915 }
57916
57917 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
57919 EVT OpVT = Ops[0].getValueType();
57920 assert(OpVT.getScalarType() == MVT::i16 &&
57921 "Unexpected scalar element type");
57922 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
57923 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
57924 OpVT.getVectorNumElements() / 2);
57925 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
57926 };
57927
57928 // If the output is narrower than an input, extract the low part of the input
57929 // vector.
57930 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
57931 VT.getVectorNumElements() * 2);
57932 if (OutVT16.bitsLT(In0.getValueType())) {
57933 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
57934 DAG.getVectorIdxConstant(0, DL));
57935 }
57936 if (OutVT16.bitsLT(In1.getValueType())) {
57937 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
57938 DAG.getVectorIdxConstant(0, DL));
57939 }
57940 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
57941 PMADDBuilder);
57942}
57943
57944// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
57945 // If the upper element in each pair of both VPMADDWD operands is zero then we
57946 // can merge the operand elements and use the implicit add of VPMADDWD.
57947// TODO: Add support for VPMADDUBSW (which isn't commutable).
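// e.g. per i32 lane i, when the upper word of each pair is known zero:
//   VPMADDWD(X,Y)[i] + VPMADDWD(Z,W)[i] == X[2i]*Y[2i] + Z[2i]*W[2i],
// which a single VPMADDWD of the interleaved lower words computes directly.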
57949 const SDLoc &DL, EVT VT) {
57950 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
57951 return SDValue();
57952
57953 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
57954 if (VT.getSizeInBits() > 128)
57955 return SDValue();
57956
57957 unsigned NumElts = VT.getVectorNumElements();
57958 MVT OpVT = N0.getOperand(0).getSimpleValueType();
57960 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
57961
57962 bool Op0HiZero =
57963 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
57964 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
57965 bool Op1HiZero =
57966 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
57967 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
57968
57969 // TODO: Check for zero lower elements once we have actual codegen that
57970 // creates them.
57971 if (!Op0HiZero || !Op1HiZero)
57972 return SDValue();
57973
57974 // Create a shuffle mask packing the lower elements from each VPMADDWD.
57975 SmallVector<int> Mask;
57976 for (int i = 0; i != (int)NumElts; ++i) {
57977 Mask.push_back(2 * i);
57978 Mask.push_back(2 * (i + NumElts));
57979 }
57980
57981 SDValue LHS =
57982 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
57983 SDValue RHS =
57984 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
57985 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
57986}
57987
57988/// CMOV of constants requires materializing constant operands in registers.
57989/// Try to fold those constants into an 'add' instruction to reduce instruction
57990 /// count. We do this with CMOV rather than the generic 'select' because there are
57991/// earlier folds that may be used to turn select-of-constants into logic hacks.
57993 SelectionDAG &DAG,
57994 const X86Subtarget &Subtarget) {
57995 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
57996 // better because we eliminate 1-2 instructions. This transform is still
57997 // an improvement without zero operands because we trade 2 constant moves and
57998 // 1 add for 2 adds (LEAs) as long as the constants can be represented as
57999 // immediate asm operands (fit in 32 bits).
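// e.g. (add (cmov 7, 42), X) --> (cmov (add X, 7), (add X, 42)), where both
// adds can become LEAs with immediate displacements.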
58000 auto isSuitableCmov = [](SDValue V) {
58001 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
58002 return false;
58003 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
58004 !isa<ConstantSDNode>(V.getOperand(1)))
58005 return false;
58006 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
58007 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
58008 V.getConstantOperandAPInt(1).isSignedIntN(32));
58009 };
58010
58011 // Match an appropriate CMOV as the first operand of the add.
58012 SDValue Cmov = N->getOperand(0);
58013 SDValue OtherOp = N->getOperand(1);
58014 if (!isSuitableCmov(Cmov))
58015 std::swap(Cmov, OtherOp);
58016 if (!isSuitableCmov(Cmov))
58017 return SDValue();
58018
58019 // Don't remove a load folding opportunity for the add. That would neutralize
58020 // any improvements from removing constant materializations.
58021 if (X86::mayFoldLoad(OtherOp, Subtarget))
58022 return SDValue();
58023
58024 EVT VT = N->getValueType(0);
58025 SDValue FalseOp = Cmov.getOperand(0);
58026 SDValue TrueOp = Cmov.getOperand(1);
58027
58028 // We will push the add through the select, but we can potentially do better
58029 // if we know there is another add in the sequence and this is pointer math.
58030 // In that case, we can absorb an add into the trailing memory op and avoid
58031 // a 3-operand LEA which is likely slower than a 2-operand LEA.
58032 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
58033 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
58034 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
58035 all_of(N->users(), [&](SDNode *Use) {
58036 auto *MemNode = dyn_cast<MemSDNode>(Use);
58037 return MemNode && MemNode->getBasePtr().getNode() == N;
58038 })) {
58039 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
58040 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
58041 // it is possible that choosing op1 might be better.
58042 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
58043 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
58044 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
58045 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
58046 Cmov.getOperand(2), Cmov.getOperand(3));
58047 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
58048 }
58049
58050 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
58051 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
58052 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
58053 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
58054 Cmov.getOperand(3));
58055}
58056
58057 // Attempt to turn (add (mul x, y), acc) -> VPMADD52L
58058 // when the upper 12 bits of x, y and (mul x, y) are known to be 0.
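// VPMADD52L adds the low 52 bits of the 104-bit product of its operands' low
// 52 bits to the 64-bit accumulator, so when x, y and x*y all fit in 52 bits
// it is equivalent to the plain 64-bit multiply-add being matched here.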
58060 EVT VT, const X86Subtarget &Subtarget) {
58061 using namespace SDPatternMatch;
58062 if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
58063 (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
58064 return SDValue();
58065
58066 // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
58067 if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
58068 VT.getSizeInBits() < 512)
58069 return SDValue();
58070
58071 const auto TotalSize = VT.getSizeInBits();
58072 if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
58073 return SDValue();
58074
58075 SDValue X, Y, Acc;
58076 if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
58077 return SDValue();
58078
58079 KnownBits KnownX = DAG.computeKnownBits(X);
58080 if (KnownX.countMinLeadingZeros() < 12)
58081 return SDValue();
58082 KnownBits KnownY = DAG.computeKnownBits(Y);
58083 if (KnownY.countMinLeadingZeros() < 12)
58084 return SDValue();
58085 KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
58086 if (KnownMul.countMinLeadingZeros() < 12)
58087 return SDValue();
58088
58089 auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
58090 ArrayRef<SDValue> SubOps) {
58091 EVT SubVT = SubOps[0].getValueType();
58092 assert(SubVT.getScalarSizeInBits() == 64 &&
58093 "Unexpected element size, only supports 64bit size");
58094 return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
58095 SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
58096 };
58097
58098 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
58099 /*CheckBWI*/ false,
58100 /*AllowAVX512*/ Subtarget.hasIFMA());
58101}
58102
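// Target DAG combine for ISD::ADD. Tries, in order: pushing the add through a
// cmov of constants, PMADDWD/VPMADD52 formation, horizontal-add synthesis,
// shift and hidden-LEA canonicalizations, PSADBW/zext/sext folds, the VNNI
// peephole and ADC/SBB reassociation.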
58103static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
58104 TargetLowering::DAGCombinerInfo &DCI,
58105 const X86Subtarget &Subtarget) {
58106 using namespace SDPatternMatch;
58107 EVT VT = N->getValueType(0);
58108 SDValue Op0 = N->getOperand(0);
58109 SDValue Op1 = N->getOperand(1);
58110 SDLoc DL(N);
58111
58112 if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
58113 return Select;
58114
58115 if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
58116 return MAdd;
58117 if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
58118 return MAdd;
58119 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
58120 return MAdd;
58121
58122 // Try to synthesize horizontal adds from adds of shuffles.
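// e.g. (illustrative) add (shuffle X, Y, <0,2,4,6>), (shuffle X, Y, <1,3,5,7>)
// is equivalent to HADD(X, Y) when the subtarget provides horizontal adds.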
58123 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58124 return V;
58125
58126 // Prefer VSHLI to reduce uses, X86FixupInstTunings may revert this depending
58127 // on the scheduler model. Limit multiple users to AVX+ targets to prevent
58128 // introducing extra register moves.
58129 if (Op0 == Op1 && supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL))
58130 if (Subtarget.hasAVX() || N->isOnlyUserOf(Op0.getNode()))
58131 return getTargetVShiftByConstNode(X86ISD::VSHLI, DL, VT.getSimpleVT(),
58132 Op0, 1, DAG);
58133
58134 // Canonicalize hidden LEA pattern:
58135 // Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
58136 // iff c < 4
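// e.g. (add (sub (shl x, 2), y), z) becomes (sub (add (shl x, 2), z), y), so
// the shl+add half can be selected as a cheap 2-operand LEA while the
// subtract of y remains a separate instruction.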
58137 if (VT == MVT::i32 || VT == MVT::i64) {
58138 SDValue Y, Z, Shift;
58139 APInt Amt;
58140 if (sd_match(
58141 N, m_Add(m_OneUse(m_Sub(m_AllOf(m_Value(Shift),
58142 m_Shl(m_Value(), m_ConstInt(Amt))),
58143 m_Value(Y))),
58144 m_Value(Z))) &&
58145 Amt.ult(4) && !isa<ConstantSDNode>(Z)) {
58146 return DAG.getNode(ISD::SUB, DL, VT,
58147 DAG.getNode(ISD::ADD, DL, VT, Shift, Z), Y);
58148 }
58149 }
58150
58151 SDValue X, Y;
58152
58153 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
58154 // iff X and Y won't overflow.
58155 if (sd_match(Op0, m_c_BinOp(X86ISD::PSADBW, m_Value(X), m_Zero())) &&
58156 sd_match(Op1, m_c_BinOp(X86ISD::PSADBW, m_Value(Y), m_Zero())) &&
58157 DAG.willNotOverflowAdd(/*IsSigned=*/false, X, Y)) {
58158 MVT OpVT = X.getSimpleValueType();
58159 SDValue Sum = DAG.getNode(ISD::ADD, DL, OpVT, X, Y);
58160 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
58161 getZeroVector(OpVT, Subtarget, DAG, DL));
58162 }
58163
58164 if (VT.isVector()) {
58165 EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
58166 VT.getVectorElementCount());
58167 
58168 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
58169 // (sub Y, (sext (vXi1 X))).
58170 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
58171 // in generic DAG combine without a legal type check, but adding this there
58172 // caused regressions.
58173 if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
58174 sd_match(N, m_Add(m_ZExt(m_AllOf(m_SpecificVT(BoolVT), m_Value(X))),
58175 m_Value(Y)))) {
58176 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
58177 return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
58178 }
58179
58180 // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
58181 // canonicalisation as we don't have good vXi8 shifts.
58182 if (VT.getScalarType() == MVT::i8 &&
58183 sd_match(N, m_Add(m_Value(X), m_Srl(m_Value(Y), m_SpecificInt(7))))) {
58184 SDValue Cmp =
58185 DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
58186 return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
58187 }
58188 }
58189
58190 // Peephole for 512-bit VPDPWSSD on non-VLX targets.
58191 // TODO: Should this be part of matchPMADDWD/matchPMADDWD_2?
58192 if (Subtarget.hasVNNI() && Subtarget.useAVX512Regs() && VT == MVT::v16i32) {
58193 SDValue Accum, Lo0, Lo1, Hi0, Hi1;
58194 if (sd_match(N, m_Add(m_Value(Accum),
58195 m_Node(ISD::CONCAT_VECTORS,
58196 m_Node(X86ISD::VPDPWSSD, m_Zero(), m_Value(Lo0),
58197 m_Value(Lo1)),
58198 m_Node(X86ISD::VPDPWSSD, m_Zero(), m_Value(Hi0),
58199 m_Value(Hi1)))))) {
58200 return DAG.getNode(X86ISD::VPDPWSSD, DL, VT, Accum,
58201 concatSubVectors(Lo0, Hi0, DAG, DL),
58202 concatSubVectors(Lo1, Hi1, DAG, DL));
58203 }
58204 }
58205
58206 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
58207 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
58208 X86::isZeroNode(Op0.getOperand(1))) {
58209 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
58210 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
58211 Op0.getOperand(0), Op0.getOperand(2));
58212 }
58213
58214 if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
58215 return IFMA52;
58216
58217 return combineAddOrSubToADCOrSBB(N, DL, DAG);
58218}
58219
58220// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
58221// condition comes from the subtract node that produced -X. This matches the
58222// cmov expansion for absolute value. By swapping the operands we convert abs
58223// to nabs.
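// i.e. Y - abs(X) is rewritten as Y + nabs(X): swapping the CMOV operands
// negates the selected value, which lets the outer SUB become an ADD.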
58224static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1,
58225 SelectionDAG &DAG) {
58226 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
58227 return SDValue();
58228
58229 SDValue Cond = N1.getOperand(3);
58230 if (Cond.getOpcode() != X86ISD::SUB)
58231 return SDValue();
58232 assert(Cond.getResNo() == 1 && "Unexpected result number");
58233
58234 SDValue FalseOp = N1.getOperand(0);
58235 SDValue TrueOp = N1.getOperand(1);
58236 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
58237 
58238 // ABS condition should come from a negate operation.
58239 if ((CC == X86::COND_S || CC == X86::COND_NS) &&
58240 isNullConstant(Cond.getOperand(0))) {
58241 // Get the X and -X from the negate.
58242 SDValue NegX = Cond.getValue(0);
58243 SDValue X = Cond.getOperand(1);
58244
58245 // Cmov operands should be X and NegX. Order doesn't matter.
58246 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
58247 return SDValue();
58248
58249 // Build a new CMOV with the operands swapped.
58250 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
58251 N1.getOperand(2), Cond);
58252 // Convert sub to add.
58253 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
58254 }
58255
58256 // Handle ABD special case:
58257 // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)).
58258 // ABD condition should come from a pair of matching subtracts.
58259 if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) &&
58260 (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) &&
58261 (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) &&
58262 (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) &&
58263 (TrueOp.getOperand(0) == FalseOp.getOperand(1)) &&
58264 (TrueOp.getOperand(1) == FalseOp.getOperand(0))) {
58265 // Build a new CMOV with the operands swapped.
58266 return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2),
58267 Cond);
58268 }
58269
58270 return SDValue();
58271}
58272
58273static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
58274 SDValue Op0 = N->getOperand(0);
58275 SDValue Op1 = N->getOperand(1);
58276
58277 // (sub C (zero_extend (setcc)))
58278 // =>
58279 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
58280 // Don't disturb (sub 0 setcc), which is easily done with neg.
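// e.g. (sub 5, (zext (setcc cc))) -> (add (zext (setcc !cc)), 4), using the
// identity C - zext(setcc cc) == zext(setcc !cc) + (C - 1) for a 0/1 setcc.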
58281 EVT VT = N->getValueType(0);
58282 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
58283 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
58284 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
58285 Op1.getOperand(0).hasOneUse()) {
58286 SDValue SetCC = Op1.getOperand(0);
58287 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
58288 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
58289 APInt NewImm = Op0C->getAPIntValue() - 1;
58290 SDLoc DL(Op1);
58291 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
58292 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
58293 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
58294 DAG.getConstant(NewImm, DL, VT));
58295 }
58296
58297 return SDValue();
58298}
58299
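// DAG combine for the conditional load/store ("cload"/"cstore") nodes: when
// the COND_NE flag comes from "sub 0, X", re-derive the condition directly
// from X (a SETCC or an AND compared against zero) so the redundant negation
// can be dropped.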
58300static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
58301 if (N->getConstantOperandVal(3) != X86::COND_NE)
58302 return SDValue();
58303
58304 SDValue Sub = N->getOperand(4);
58305 if (Sub.getOpcode() != X86ISD::SUB)
58306 return SDValue();
58307
58308 SDValue Op1 = Sub.getOperand(1);
58309
58310 if (!X86::isZeroNode(Sub.getOperand(0)))
58311 return SDValue();
58312
58313 SDLoc DL(N);
58314 SmallVector<SDValue, 5> Ops(N->op_values());
58315 if (Op1.getOpcode() == X86ISD::SETCC) {
58316 // res, flags2 = sub 0, (setcc cc, flag)
58317 // cload/cstore ..., cond_ne, flag2
58318 // ->
58319 // cload/cstore cc, flag
58320 Ops[3] = Op1.getOperand(0);
58321 Ops[4] = Op1.getOperand(1);
58322 } else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
58323 SDValue Src = Op1;
58324 SDValue Op10 = Op1.getOperand(0);
58325 if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1)) &&
58326 isOneConstant(Op1.getOperand(1))) {
58327 // res, flags2 = sub 0, (and (xor X, -1), 1)
58328 // cload/cstore ..., cond_ne, flag2
58329 // ->
58330 // res, flags2 = sub 0, (and X, 1)
58331 // cload/cstore ..., cond_e, flag2
58332 Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
58333 Op1.getOperand(1));
58334 Ops[3] = DAG.getTargetConstant(X86::COND_E, DL, MVT::i8);
58335 }
58336 // res, flags2 = sub 0, (and X, Y)
58337 // cload/cstore ..., cc, flag2
58338 // ->
58339 // res, flags2 = cmp (and X, Y), 0
58340 // cload/cstore ..., cc, flag2
58341 Ops[4] = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Src, Sub.getOperand(0));
58342 } else {
58343 return SDValue();
58344 }
58345
58346 return DAG.getMemIntrinsicNode(N->getOpcode(), DL, N->getVTList(), Ops,
58347 cast<MemSDNode>(N)->getMemoryVT(),
58348 cast<MemSDNode>(N)->getMemOperand());
58349}
58350
58351static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
58352 TargetLowering::DAGCombinerInfo &DCI,
58353 const X86Subtarget &Subtarget) {
58354 EVT VT = N->getValueType(0);
58355 SDValue Op0 = N->getOperand(0);
58356 SDValue Op1 = N->getOperand(1);
58357 SDLoc DL(N);
58358
58359 auto IsNonOpaqueConstant = [&](SDValue Op) {
58360 return DAG.isConstantIntBuildVectorOrConstantInt(Op,
58361 /*AllowOpaques*/ false);
58362 };
58363
58364 // X86 can't encode an immediate LHS of a sub. See if we can push the
58365 // negation into a preceding instruction. If the RHS of the sub is a XOR with
58366 // one use and a constant, invert the immediate, saving one register.
58367 // However, ignore cases where C1 is 0, as those will become a NEG.
58368 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
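// This uses the identity C1 - (X ^ C2) == ~(X ^ C2) + C1 + 1
//                                      == (X ^ ~C2) + (C1 + 1),
// e.g. sub(10, xor(X, 5)) -> add(xor(X, ~5), 11).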
58369 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
58370 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
58371 Op1->hasOneUse()) {
58372 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
58373 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
58374 SDValue NewAdd =
58375 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
58376 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
58377 }
58378
58379 if (SDValue V = combineSubABS(VT, DL, Op0, Op1, DAG))
58380 return V;
58381
58382 // Try to synthesize horizontal subs from subs of shuffles.
58383 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
58384 return V;
58385
58386 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
58387 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
58388 X86::isZeroNode(Op1.getOperand(1))) {
58389 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58390 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
58391 Op1.getOperand(0), Op1.getOperand(2));
58392 }
58393
58394 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
58395 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
58396 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
58397 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
58398 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
58399 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
58400 Op1.getOperand(1), Op1.getOperand(2));
58401 return DAG.getNode(ISD::SUB, DL, VT, ADC.getValue(0), Op1.getOperand(0));
58402 }
58403
58404 if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
58405 return V;
58406
58407 if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
58408 return V;
58409
58410 return combineSubSetcc(N, DAG);
58411}
58412
58413static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
58414 const X86Subtarget &Subtarget) {
58415 unsigned Opcode = N->getOpcode();
58416 assert((Opcode == X86ISD::PCMPEQ || Opcode == X86ISD::PCMPGT) &&
58417 "Unknown PCMP opcode");
58418
58419 SDValue LHS = N->getOperand(0);
58420 SDValue RHS = N->getOperand(1);
58421 MVT VT = N->getSimpleValueType(0);
58422 unsigned EltBits = VT.getScalarSizeInBits();
58423 unsigned NumElts = VT.getVectorNumElements();
58424 SDLoc DL(N);
58425
58426 if (LHS == RHS)
58427 return (Opcode == X86ISD::PCMPEQ) ? DAG.getAllOnesConstant(DL, VT)
58428 : DAG.getConstant(0, DL, VT);
58429
58430 // Constant Folding.
58431 // PCMPEQ(X,UNDEF) -> UNDEF
58432 // PCMPGT(X,UNDEF) -> 0
58433 // PCMPGT(UNDEF,X) -> 0
58434 APInt LHSUndefs, RHSUndefs;
58435 SmallVector<APInt> LHSBits, RHSBits;
58436 if (getTargetConstantBitsFromNode(LHS, EltBits, LHSUndefs, LHSBits) &&
58437 getTargetConstantBitsFromNode(RHS, EltBits, RHSUndefs, RHSBits)) {
58438 APInt Ones = APInt::getAllOnes(EltBits);
58439 APInt Zero = APInt::getZero(EltBits);
58440 SmallVector<APInt> Results(NumElts);
58441 for (unsigned I = 0; I != NumElts; ++I) {
58442 if (Opcode == X86ISD::PCMPEQ) {
58443 Results[I] = (LHSBits[I] == RHSBits[I]) ? Ones : Zero;
58444 } else {
58445 bool AnyUndef = LHSUndefs[I] || RHSUndefs[I];
58446 Results[I] = (!AnyUndef && LHSBits[I].sgt(RHSBits[I])) ? Ones : Zero;
58447 }
58448 }
58449 if (Opcode == X86ISD::PCMPEQ)
58450 return getConstVector(Results, LHSUndefs | RHSUndefs, VT, DAG, DL);
58451 return getConstVector(Results, VT, DAG, DL);
58452 }
58453
58454 return SDValue();
58455}
58456
58457// Helper to determine if we can convert an integer comparison to a float
58458// comparison by casting the operands.
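// e.g. two v8i32 vectors whose elements each have at most 24 significant bits
// can be converted with SINT_TO_FP and compared as v8f32 without changing the
// outcome of an EQ/GT comparison, since f32 has a 24-bit significand.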
58459static std::optional<unsigned>
58460CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
58461 unsigned NumSignificantBitsRHS) {
58462 MVT SVT = VT.getScalarType();
58463 assert(SVT == MVT::f32 && "Only tested for float so far");
58464 const fltSemantics &Sem = SVT.getFltSemantics();
58465 assert((CC == ISD::SETEQ || CC == ISD::SETGT) &&
58466 "Only PCMPEQ/PCMPGT currently supported");
58467
58468 // TODO: Handle bitcastable integers.
58469
58470 // For cvt + signed compare we need lhs and rhs to be exactly representable as
58471 // a fp value.
58472 unsigned FPPrec = APFloat::semanticsPrecision(Sem);
58473 if (FPPrec >= NumSignificantBitsLHS && FPPrec >= NumSignificantBitsRHS)
58474 return ISD::SINT_TO_FP;
58475
58476 return std::nullopt;
58477}
58478
58479/// Helper that combines an array of subvector ops as if they were the operands
58480/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
58481/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
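/// A typical fold handled below (illustrative): concat(pshufb(a,m),
/// pshufb(b,m)) -> pshufb(concat(a,b), concat(m,m)) when the wider node is
/// supported and at least one concatenated operand simplifies, so one wide
/// shuffle replaces two narrow shuffles plus the subvector inserts.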
58482static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
58483 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
58484 const X86Subtarget &Subtarget,
58485 unsigned Depth) {
58486 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
58487 unsigned EltSizeInBits = VT.getScalarSizeInBits();
58488
58489 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
58490 return DAG.getUNDEF(VT);
58491
58492 if (llvm::all_of(Ops, [](SDValue Op) {
58493 return Op.isUndef() || ISD::isBuildVectorAllZeros(Op.getNode());
58494 }))
58495 return getZeroVector(VT, Subtarget, DAG, DL);
58496
58497 if (Depth >= SelectionDAG::MaxRecursionDepth)
58498 return SDValue(); // Limit search depth.
58499
58500 SDValue Op0 = Ops[0];
58501 bool IsSplat = llvm::all_equal(Ops);
58502 unsigned NumOps = Ops.size();
58503 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
58504 LLVMContext &Ctx = *DAG.getContext();
58505
58506 // Repeated subvectors.
58507 if (IsSplat &&
58508 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
58509 // If this broadcast is inserted into both halves, use a larger broadcast.
58510 if (Op0.getOpcode() == X86ISD::VBROADCAST)
58511 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
58512
58513 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
58514 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
58515 (Subtarget.hasAVX2() ||
58516 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
58517 VT.getScalarType(), Subtarget)))
58518 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
58519 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
58520 Op0.getOperand(0),
58521 DAG.getVectorIdxConstant(0, DL)));
58522
58523 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
58524 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
58525 (Subtarget.hasAVX2() ||
58526 (EltSizeInBits >= 32 &&
58527 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
58528 Op0.getOperand(0).getValueType() == VT.getScalarType())
58529 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
58530
58531 // concat_vectors(extract_subvector(splat(x)),
58532 // extract_subvector(splat(x))) -> splat(x)
58533 // concat_vectors(extract_subvector(subv_broadcast(x)),
58534 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
58535 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58536 Op0.getOperand(0).getValueType() == VT) {
58537 SDValue SrcVec = Op0.getOperand(0);
58538 if (DAG.isSplatValue(SrcVec, /*AllowUndefs*/ false))
58539 return SrcVec;
58540 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
58541 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
58542 return SrcVec;
58543 }
58544
58545 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
58546 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
58547 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
58548 return DAG.getNode(Op0.getOpcode(), DL, VT,
58549 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
58550 Op0.getOperand(0), Op0.getOperand(0)),
58551 Op0.getOperand(1));
58552 }
58553
58554 // TODO: This should go in combineX86ShufflesRecursively eventually.
58555 if (NumOps == 2) {
58556 SDValue Src0 = peekThroughBitcasts(Ops[0]);
58557 SDValue Src1 = peekThroughBitcasts(Ops[1]);
58558 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58559 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
58560 EVT SrcVT0 = Src0.getOperand(0).getValueType();
58561 EVT SrcVT1 = Src1.getOperand(0).getValueType();
58562 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
58563 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
58564 const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
58565 const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
58566 // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58567 // Only concat of subvector high halves which vperm2x128 is best at or if
58568 // it should fold into a subvector broadcast.
58569 if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58570 SrcVT1.is256BitVector()) {
58571 assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58572 (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58573 "Bad subvector index");
58574 if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58575 (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58576 unsigned Index = 0;
58577 Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58578 Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58579 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58580 DAG.getBitcast(VT, Src0.getOperand(0)),
58581 DAG.getBitcast(VT, Src1.getOperand(0)),
58582 DAG.getTargetConstant(Index, DL, MVT::i8));
58583 }
58584 }
58585 // Widen extract_subvector
58586 // concat(extract_subvector(x,lo), extract_subvector(x,hi))
58587 // --> extract_subvector(x,lo)
58588 unsigned NumSubElts0 = Src0.getValueType().getVectorNumElements();
58589 if (Src0.getOperand(0) == Src1.getOperand(0) &&
58590 (SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58591 SrcIdx1 == (SrcIdx0 + NumSubElts0)) {
58592 return DAG.getBitcast(VT,
58593 extractSubVector(Src0.getOperand(0),
58594 Src0.getConstantOperandVal(1),
58595 DAG, DL, VT.getSizeInBits()));
58596 }
58597 }
58598 }
58599
58600 // Repeated opcode.
58601 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
58602 // but it currently struggles with different vector widths.
58603 if (llvm::all_of(Ops, [Op0](SDValue Op) {
58604 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
58605 })) {
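// Helper lambdas: ConcatSubOperand unconditionally concatenates operand I of
// every subop (recursing through combineConcatVectorOps first), IsConcatFree
// reports whether that concatenation would be free (all constants, contiguous
// extracts of one wide vector, or the same load), and CombineSubOperand only
// returns a node when the inputs are all constants or the recursive combine
// actually simplified.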
58606 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58607 SmallVector<SDValue> Subs;
58608 for (SDValue SubOp : SubOps)
58609 Subs.push_back(SubOp.getOperand(I));
58610 // Attempt to peek through bitcasts and concat the original subvectors.
58611 EVT SubVT = peekThroughBitcasts(Subs[0]).getValueType();
58612 if (SubVT.isSimple() && SubVT.isVector()) {
58613 MVT ConcatVT =
58614 MVT::getVectorVT(SubVT.getScalarType().getSimpleVT(),
58615 SubVT.getVectorElementCount() * Subs.size());
58616 for (SDValue &Sub : Subs)
58617 Sub = DAG.getBitcast(SubVT, Sub);
58618 if (SDValue ConcatSrc = combineConcatVectorOps(DL, ConcatVT, Subs, DAG,
58619 Subtarget, Depth + 1))
58620 return DAG.getBitcast(VT, ConcatSrc);
58621 return DAG.getBitcast(
58622 VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Subs));
58623 }
58624 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58625 };
58626 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
58627 bool AllConstants = true;
58628 bool AllSubs = true;
58629 unsigned VecSize = VT.getSizeInBits();
58630 SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
58631 if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
58632 return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
58633 }))
58634 return true;
58635 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
58636 SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
58637 unsigned SubSize = BC.getValueSizeInBits();
58638 unsigned EltSize = BC.getScalarValueSizeInBits();
58639 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58640 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58641 AllSubs &= BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
58642 BC.getOperand(0).getValueSizeInBits() == VecSize &&
58643 (BC.getConstantOperandVal(1) * EltSize) == (I * SubSize);
58644 }
58645 return AllConstants || AllSubs;
58646 };
58647 auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
58648 bool AllConstants = true;
58649 SmallVector<SDValue> Subs;
58650 for (SDValue SubOp : SubOps) {
58651 SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
58652 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
58653 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
58654 Subs.push_back(SubOp.getOperand(I));
58655 }
58656 if (AllConstants)
58657 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
58658 return combineConcatVectorOps(DL, VT, Subs, DAG, Subtarget, Depth + 1);
58659 };
58660
58661 unsigned Opcode = Op0.getOpcode();
58662 switch (Opcode) {
58663 case ISD::BITCAST: {
58664 // TODO: Support AVX1/AVX2 bitcasts.
58665 SmallVector<SDValue, 4> SubOps;
58666 for (SDValue SubOp : Ops)
58667 SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
58668 EVT InnerVT = SubOps[0].getValueType();
58669 unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
58670 if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
58671 (Subtarget.hasBWI() ||
58672 (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
58673 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
58674 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58675 llvm::all_of(SubOps, [InnerVT](SDValue Op) {
58676 return Op.getValueType() == InnerVT;
58677 })) {
58678 MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
58679 MVT ConcatVT = MVT::getVectorVT(
58680 ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
58681 if (SDValue ConcatSrc = combineConcatVectorOps(
58682 DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
58683 return DAG.getBitcast(VT, ConcatSrc);
58684 }
58685 break;
58686 }
58687 case ISD::VECTOR_SHUFFLE: {
58688 // TODO: Generalize NumOps support.
58689 if (!IsSplat && NumOps == 2 &&
58690 ((VT.is256BitVector() &&
58691 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58692 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58693 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58694 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58695 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58696 if (Concat0 || Concat1 ||
58697 (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
58698 Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
58699 Subtarget.hasVBMI())) {
58700 int NumSubElts = Op0.getValueType().getVectorNumElements();
58701 SmallVector<int> NewMask;
58702 for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
58703 M = M >= NumSubElts ? M + NumSubElts : M;
58704 NewMask.push_back(M);
58705 }
58706 for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
58707 if (0 <= M)
58708 M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
58709 NewMask.push_back(M);
58710 }
58711 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
58712 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
58713 return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask);
58714 }
58715 }
58716 break;
58717 }
58718 case X86ISD::VBROADCAST: {
58719 // TODO: 512-bit VBROADCAST concatenation.
58720 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
58721 return Op.getOperand(0).getValueType().is128BitVector();
58722 })) {
58723 if (VT == MVT::v4f64 || VT == MVT::v4i64)
58724 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
58725 ConcatSubOperand(VT, Ops, 0),
58726 ConcatSubOperand(VT, Ops, 0));
58727 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
58728 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
58729 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
58730 : X86ISD::PSHUFD,
58731 DL, VT, ConcatSubOperand(VT, Ops, 0),
58732 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
58733 }
58734 break;
58735 }
58736 case X86ISD::MOVDDUP:
58737 case X86ISD::MOVSHDUP:
58738 case X86ISD::MOVSLDUP: {
58739 if (!IsSplat && (VT.is256BitVector() ||
58740 (VT.is512BitVector() && Subtarget.useAVX512Regs())))
58741 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
58742 break;
58743 }
58744 case X86ISD::SHUFP: {
58745 if (!IsSplat &&
58746 (VT == MVT::v8f32 ||
58747 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
58748 llvm::all_of(Ops, [Op0](SDValue Op) {
58749 return Op.getOperand(2) == Op0.getOperand(2);
58750 })) {
58751 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58752 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58753 if (Concat0 || Concat1)
58754 return DAG.getNode(Opcode, DL, VT,
58755 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58756 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
58757 Op0.getOperand(2));
58758 }
58759 break;
58760 }
58761 case X86ISD::UNPCKH:
58762 case X86ISD::UNPCKL: {
58763 // TODO: UNPCK should use CombineSubOperand
58764 // Don't concatenate build_vector patterns.
58765 if (!IsSplat &&
58766 ((VT.is256BitVector() &&
58767 (EltSizeInBits >= 32 || Subtarget.hasInt256())) ||
58768 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58769 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58770 none_of(Ops, [](SDValue Op) {
58771 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
58772 ISD::BUILD_VECTOR ||
58773 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
58774 ISD::BUILD_VECTOR;
58775 })) {
58776 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58777 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58778 if (Concat0 || Concat1 ||
58779 (Subtarget.hasInt256() && EltSizeInBits == 64))
58780 return DAG.getNode(Opcode, DL, VT,
58781 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58782 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58783 }
58784 break;
58785 }
58786 case X86ISD::PSHUFHW:
58787 case X86ISD::PSHUFLW:
58788 case X86ISD::PSHUFD:
58789 if (!IsSplat &&
58790 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58791 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58792 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58793 llvm::all_of(Ops, [Op0](SDValue Op) {
58794 return Op.getOperand(1) == Op0.getOperand(1);
58795 })) {
58796 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
58797 Op0.getOperand(1));
58798 }
58799 [[fallthrough]];
58800 case X86ISD::VPERMILPI:
58801 if (!IsSplat && EltSizeInBits == 32 &&
58802 (VT.is256BitVector() ||
58803 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
58804 all_of(Ops, [&Op0](SDValue Op) {
58805 return Op0.getOperand(1) == Op.getOperand(1);
58806 })) {
58807 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
58808 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
58809 Res =
58810 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
58811 return DAG.getBitcast(VT, Res);
58812 }
58813 break;
58814 case X86ISD::VPERMILPV:
58815 if (!IsSplat && (VT.is256BitVector() ||
58816 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
58817 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58818 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
58819 if (Concat0 || Concat1)
58820 return DAG.getNode(Opcode, DL, VT,
58821 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58822 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
58823 }
58824 break;
58825 case X86ISD::PSHUFB:
58826 case X86ISD::PSADBW:
58827 case X86ISD::VPMADDUBSW:
58828 case X86ISD::VPMADDWD:
58829 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58830 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
58831 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
58832 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
58833 NumOps * SrcVT.getVectorNumElements());
58834 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
58835 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
58836 if (Concat0 || Concat1)
58837 return DAG.getNode(
58838 Opcode, DL, VT,
58839 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
58840 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
58841 }
58842 break;
58843 case X86ISD::VPERMV:
58844 // TODO: Handle 256-bit and NumOps == 4 cases.
58845 if (!IsSplat && NumOps == 2 &&
58846 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58847 MVT OpVT = Op0.getSimpleValueType();
58848 int NumSrcElts = OpVT.getVectorNumElements();
58849 SmallVector<int, 64> ConcatMask;
58850 for (unsigned i = 0; i != NumOps; ++i) {
58851 SmallVector<int, 64> SubMask;
58852 SmallVector<SDValue, 2> SubOps;
58853 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58854 break;
58855 for (int M : SubMask) {
58856 if (0 <= M)
58857 M += i * NumSrcElts;
58858 ConcatMask.push_back(M);
58859 }
58860 }
58861 if (ConcatMask.size() == (NumOps * NumSrcElts))
58862 return lowerShuffleWithPERMV(DL, VT, ConcatMask,
58863 ConcatSubOperand(VT, Ops, 1),
58864 DAG.getUNDEF(VT), Subtarget, DAG);
58865 }
58866 break;
58867 case X86ISD::VPERMV3:
58868 // TODO: Handle 256-bit and NumOps == 4 cases.
58869 if (!IsSplat && NumOps == 2 &&
58870 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
58871 MVT OpVT = Op0.getSimpleValueType();
58872 int NumSrcElts = OpVT.getVectorNumElements();
58873 SmallVector<int, 64> ConcatMask;
58874 for (unsigned i = 0; i != NumOps; ++i) {
58875 SmallVector<int, 64> SubMask;
58876 SmallVector<SDValue, 2> SubOps;
58877 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
58878 break;
58879 for (int M : SubMask) {
58880 if (0 <= M) {
58881 int Src = M < NumSrcElts ? 0 : 2;
58882 M += M < NumSrcElts ? 0 : NumSrcElts;
58883
58884 // Reference the lowest sub if the upper sub is the same.
58885 if (Ops[0].getOperand(Src) != Ops[i].getOperand(Src))
58886 M += i * NumSrcElts;
58887 }
58888 ConcatMask.push_back(M);
58889 }
58890 }
58891 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
58892 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
58893 SDValue Concat1 = CombineSubOperand(VT, Ops, 2);
58894 if (Concat0 || Concat1)
58895 return lowerShuffleWithPERMV(
58896 DL, VT, ConcatMask,
58897 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
58898 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 2), Subtarget,
58899 DAG);
58900 }
58901 }
58902 break;
58903 case X86ISD::VPERM2X128: {
58904 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
58905 assert(NumOps == 2 && "Bad concat_vectors operands");
58906 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58907 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58908 // TODO: Handle zero'd subvectors.
58909 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
58910 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
58911 (int)((Imm1 >> 4) & 0x3)};
58912 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
58913 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58914 Ops[0].getOperand(1), DAG, DL);
58915 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58916 Ops[1].getOperand(1), DAG, DL);
58917 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
58918 DAG.getBitcast(ShuffleVT, LHS),
58919 DAG.getBitcast(ShuffleVT, RHS),
58920 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
58921 return DAG.getBitcast(VT, Res);
58922 }
58923 }
58924 break;
58925 }
58926 case X86ISD::SHUF128: {
58927 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
58928 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
58929 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
58930 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
58931 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
58932 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
58933 Ops[0].getOperand(1), DAG, DL);
58934 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
58935 Ops[1].getOperand(1), DAG, DL);
58936 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
58937 DAG.getTargetConstant(Imm, DL, MVT::i8));
58938 }
58939 break;
58940 }
58941 case ISD::TRUNCATE:
58942 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
58943 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58944 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
58945 SrcVT == Ops[1].getOperand(0).getValueType() &&
58946 Subtarget.useAVX512Regs() &&
58947 Subtarget.getPreferVectorWidth() >= 512 &&
58948 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
58949 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58950 return DAG.getNode(ISD::TRUNCATE, DL, VT,
58951 ConcatSubOperand(NewSrcVT, Ops, 0));
58952 }
58953 }
58954 break;
58955 case ISD::ANY_EXTEND:
58956 case ISD::SIGN_EXTEND:
58957 case ISD::ZERO_EXTEND:
58958 // TODO: Handle ANY_EXTEND combos with SIGN/ZERO_EXTEND.
58959 if (!IsSplat && NumOps == 2 &&
58960 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58961 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58962 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
58963 EVT SrcVT = Ops[0].getOperand(0).getValueType();
58964 if (SrcVT.isSimple() && SrcVT.is128BitVector() &&
58965 SrcVT == Ops[1].getOperand(0).getValueType()) {
58966 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
58967 return DAG.getNode(Opcode, DL, VT,
58968 ConcatSubOperand(NewSrcVT, Ops, 0));
58969 }
58970 }
58971 break;
58972 case ISD::ANY_EXTEND_VECTOR_INREG:
58973 case ISD::SIGN_EXTEND_VECTOR_INREG:
58974 case ISD::ZERO_EXTEND_VECTOR_INREG: {
58975 // TODO: Handle ANY_EXTEND_INREG combos with SIGN/ZERO_EXTEND_INREG.
58976 if (!IsSplat && NumOps == 2 &&
58977 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
58978 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
58979 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
58981 Op0.getOperand(0).getValueType() ==
58982 Ops[0].getOperand(0).getValueType()) {
58983 EVT SrcVT = Op0.getOperand(0).getValueType();
58984 unsigned NumElts = VT.getVectorNumElements();
58985 MVT UnpackSVT =
58986 MVT::getIntegerVT(SrcVT.getScalarSizeInBits() * (NumElts / 2));
58987 MVT UnpackVT =
58988 MVT::getVectorVT(UnpackSVT, 128 / UnpackSVT.getScalarSizeInBits());
58989 SDValue Unpack =
58990 DAG.getNode(X86ISD::UNPCKL, DL, UnpackVT,
58991 DAG.getBitcast(UnpackVT, Ops[0].getOperand(0)),
58992 DAG.getBitcast(UnpackVT, Ops[1].getOperand(0)));
58993 return getEXTEND_VECTOR_INREG(Opcode, DL, VT,
58994 DAG.getBitcast(SrcVT, Unpack), DAG);
58995 }
58996 break;
58997 }
58998 case X86ISD::VSHLI:
58999 case X86ISD::VSRLI:
59000 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
59001 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
59002 llvm::all_of(Ops, [](SDValue Op) {
59003 return Op.getConstantOperandAPInt(1) == 32;
59004 })) {
59005 if (SDValue Res = CombineSubOperand(VT, Ops, 0)) {
59006 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
59007 Res = DAG.getBitcast(MVT::v8i32, Res);
59008 if (Opcode == X86ISD::VSHLI) {
59009 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
59010 {8, 0, 8, 2, 8, 4, 8, 6});
59011 } else {
59012 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
59013 {1, 8, 3, 8, 5, 8, 7, 8});
59014 }
59015 return DAG.getBitcast(VT, Res);
59016 }
59017 }
59018 [[fallthrough]];
59019 case X86ISD::VSRAI:
59020 case X86ISD::VSHL:
59021 case X86ISD::VSRL:
59022 case X86ISD::VSRA:
59023 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
59024 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59025 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
59026 llvm::all_of(Ops, [Op0](SDValue Op) {
59027 return Op0.getOperand(1) == Op.getOperand(1);
59028 })) {
59029 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59030 Op0.getOperand(1));
59031 }
59032 break;
59033 case X86ISD::VPERMI:
59034 case X86ISD::VROTLI:
59035 case X86ISD::VROTRI:
59036 if (!IsSplat &&
59037 ((VT.is256BitVector() && Subtarget.hasVLX()) ||
59038 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59039 llvm::all_of(Ops, [Op0](SDValue Op) {
59040 return Op0.getOperand(1) == Op.getOperand(1);
59041 })) {
59042 assert(!(Opcode == X86ISD::VPERMI &&
59043 Op0.getValueType().is128BitVector()) &&
59044 "Illegal 128-bit X86ISD::VPERMI nodes");
59045 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59046 Op0.getOperand(1));
59047 }
59048 break;
59049 case ISD::AND:
59050 case ISD::OR:
59051 case ISD::XOR:
59052 case X86ISD::ANDNP:
59053 // TODO: AVX512 targets should only use CombineSubOperand like AVX1/2.
59054 if (!IsSplat && (VT.is256BitVector() ||
59055 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59056 // Don't concatenate root AVX1 NOT patterns.
59057 // TODO: Allow NOT folding if Concat0 succeeds.
59058 if (Opcode == ISD::XOR && Depth == 0 && !Subtarget.hasInt256() &&
59059 llvm::all_of(Ops, [](SDValue X) {
59060 return ISD::isBuildVectorAllOnes(X.getOperand(1).getNode());
59061 }))
59062 break;
59063 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59064 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59065 if (Concat0 || Concat1 || Subtarget.useAVX512Regs())
59066 return DAG.getNode(Opcode, DL, VT,
59067 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59068 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59069 }
59070 break;
59071 case X86ISD::PCMPEQ:
59072 case X86ISD::PCMPGT:
59073 // TODO: 512-bit PCMPEQ/PCMPGT -> VPCMP+VPMOVM2 handling.
59074 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256()) {
59075 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59076 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59077 if (Concat0 || Concat1)
59078 return DAG.getNode(Opcode, DL, VT,
59079 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59080 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59081 break;
59082 }
59083
59084 if (!IsSplat && VT == MVT::v8i32) {
59085 // Without AVX2, see if we can cast the values to v8f32 and use fcmp.
59086 // TODO: Handle v4f64 as well?
59087 unsigned MaxSigBitsLHS = 0, MaxSigBitsRHS = 0;
59088 for (unsigned I = 0; I != NumOps; ++I) {
59089 MaxSigBitsLHS =
59090 std::max(MaxSigBitsLHS,
59091 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(0)));
59092 MaxSigBitsRHS =
59093 std::max(MaxSigBitsRHS,
59094 DAG.ComputeMaxSignificantBits(Ops[I].getOperand(1)));
59095 if (MaxSigBitsLHS == EltSizeInBits && MaxSigBitsRHS == EltSizeInBits)
59096 break;
59097 }
59098
59099 ISD::CondCode ICC =
59100 Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
59101 ISD::CondCode FCC =
59102 Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
59103 
59104 MVT FpSVT = MVT::getFloatingPointVT(EltSizeInBits);
59105 MVT FpVT = VT.changeVectorElementType(FpSVT);
59106
59107 if (std::optional<unsigned> CastOpc =
59108 CastIntSETCCtoFP(FpVT, ICC, MaxSigBitsLHS, MaxSigBitsRHS)) {
59109 SDValue LHS = CombineSubOperand(VT, Ops, 0);
59110 SDValue RHS = CombineSubOperand(VT, Ops, 1);
59111 LHS = LHS ? LHS : ConcatSubOperand(VT, Ops, 0);
59112 RHS = RHS ? RHS : ConcatSubOperand(VT, Ops, 1);
59113 LHS = DAG.getNode(*CastOpc, DL, FpVT, LHS);
59114 RHS = DAG.getNode(*CastOpc, DL, FpVT, RHS);
59115
59116 bool IsAlwaysSignaling;
59117 unsigned FSETCC =
59118 translateX86FSETCC(FCC, LHS, RHS, IsAlwaysSignaling);
59119 return DAG.getBitcast(
59120 VT, DAG.getNode(X86ISD::CMPP, DL, FpVT, LHS, RHS,
59121 DAG.getTargetConstant(FSETCC, DL, MVT::i8)));
59122 }
59123 }
59124 break;
59125 case ISD::CTPOP:
59126 case ISD::CTTZ:
59127 case ISD::CTLZ:
59128 case ISD::CTTZ_ZERO_UNDEF:
59129 case ISD::CTLZ_ZERO_UNDEF:
59130 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59131 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59132 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0));
59133 }
59134 break;
59135 case X86ISD::GF2P8AFFINEQB:
59136 // TODO: GF2P8AFFINEQB should use CombineSubOperand.
59137 if (!IsSplat &&
59138 (VT.is256BitVector() ||
59139 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59140 llvm::all_of(Ops, [Op0](SDValue Op) {
59141 return Op0.getOperand(2) == Op.getOperand(2);
59142 })) {
59143 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59144 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
59145 }
59146 break;
59147 case ISD::ADD:
59148 case ISD::SUB:
59149 case ISD::MUL:
59150 // TODO: Add more integer binops?
59151 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59152 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
59153 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
59154 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59155 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59156 if (Concat0 || Concat1 || llvm::all_of(Ops, [](SDValue Op) {
59157 return Op.getOperand(0) == Op.getOperand(1);
59158 }))
59159 return DAG.getNode(Opcode, DL, VT,
59160 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59161 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59162 }
59163 break;
59164 // VADD, VSUB and VMUL can execute on more ports than VINSERT and their
59165 // latency is short, so we only concatenate them here when doing so does
59166 // not introduce extra VINSERTs.
59167 case ISD::FADD:
59168 case ISD::FSUB:
59169 case ISD::FMUL:
59170 if (!IsSplat && (VT.is256BitVector() ||
59171 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59172 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59173 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59174 if (Concat0 || Concat1)
59175 return DAG.getNode(Opcode, DL, VT,
59176 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59177 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59178 }
59179 break;
59180 // Always prefer to concatenate high latency FDIV instructions.
59181 case ISD::FDIV:
59182 if (!IsSplat && (VT.is256BitVector() ||
59183 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
59184 return DAG.getNode(Opcode, DL, VT, ConcatSubOperand(VT, Ops, 0),
59185 ConcatSubOperand(VT, Ops, 1));
59186 }
59187 break;
59188 case X86ISD::HADD:
59189 case X86ISD::HSUB:
59190 case X86ISD::FHADD:
59191 case X86ISD::FHSUB:
59192 if (!IsSplat && VT.is256BitVector() &&
59193 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
59194 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59195 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59196 if (Concat0 || Concat1)
59197 return DAG.getNode(Opcode, DL, VT,
59198 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59199 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
59200 }
59201 break;
59202 case X86ISD::PACKSS:
59203 case X86ISD::PACKUS:
59204 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59205 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
59206 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
59207 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
59208 NumOps * SrcVT.getVectorNumElements());
59209 SDValue Concat0 = CombineSubOperand(SrcVT, Ops, 0);
59210 SDValue Concat1 = CombineSubOperand(SrcVT, Ops, 1);
59211 if (Concat0 || Concat1)
59212 return DAG.getNode(
59213 Opcode, DL, VT,
59214 Concat0 ? Concat0 : ConcatSubOperand(SrcVT, Ops, 0),
59215 Concat1 ? Concat1 : ConcatSubOperand(SrcVT, Ops, 1));
59216 }
59217 break;
59218 case X86ISD::VSHLD:
59219 case X86ISD::VSHRD:
59220 case X86ISD::PALIGNR:
59221 if (!IsSplat &&
59222 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
59223 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
59224 llvm::all_of(Ops, [Op0](SDValue Op) {
59225 return Op0.getOperand(2) == Op.getOperand(2);
59226 })) {
59227 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59228 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59229 if (Concat0 || Concat1)
59230 return DAG.getNode(Opcode, DL, VT,
59231 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59232 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59233 Op0.getOperand(2));
59234 }
59235 break;
59236 case X86ISD::BLENDI:
59237 if (VT.is256BitVector() && NumOps == 2 &&
59238 (EltSizeInBits >= 32 ||
59239 (Subtarget.hasInt256() &&
59240 Ops[0].getOperand(2) == Ops[1].getOperand(2)))) {
59241 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59242 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59243 if (Concat0 || Concat1) {
59244 unsigned NumElts = VT.getVectorNumElements();
59245 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59246 Mask.insertBits(getBLENDIBlendMask(Ops[1]), NumElts / 2);
59247 Mask = Mask.zextOrTrunc(8);
59248 return DAG.getNode(Opcode, DL, VT,
59249 Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
59250 Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1),
59251 DAG.getTargetConstant(Mask, DL, MVT::i8));
59252 }
59253 }
59254 // TODO: BWI targets should only use CombineSubOperand.
59255 if (((VT.is256BitVector() && Subtarget.hasVLX()) ||
59256 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59257 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())) {
59258 SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
59259 SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
59260 if (Concat0 || Concat1 || Subtarget.useBWIRegs()) {
59261 unsigned NumElts = VT.getVectorNumElements();
59262 APInt Mask = getBLENDIBlendMask(Ops[0]).zext(NumElts);
59263 for (unsigned I = 1; I != NumOps; ++I)
59264 Mask.insertBits(getBLENDIBlendMask(Ops[I]), I * (NumElts / NumOps));
59265 unsigned NumMaskBits = NumElts >= 8 ? NumElts : 8;
59266 Mask = Mask.zextOrTrunc(NumMaskBits);
59267 MVT MaskSVT = MVT::getIntegerVT(NumMaskBits);
59268 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumMaskBits);
59269 SDValue Sel =
59270 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
59271 Sel = extractSubVector(Sel, 0, DAG, DL, NumElts);
59272 Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
59273 Concat1 = Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1);
59274 return DAG.getSelect(DL, VT, Sel, Concat1, Concat0);
59275 }
59276 }
59277 break;
59278 case ISD::VSELECT:
59279 // TODO: VSELECT should use CombineSubOperand.
59280 if (!IsSplat && Subtarget.hasAVX512() &&
59281 (VT.is256BitVector() ||
59282 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
59283 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
59284 EVT SelVT = Ops[0].getOperand(0).getValueType();
59285 if (SelVT.getVectorElementType() == MVT::i1) {
59286 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
59287 NumOps * SelVT.getVectorNumElements());
59288 if (TLI.isTypeLegal(SelVT))
59289 return DAG.getNode(
59290 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59291 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59292 }
59293 }
59294 [[fallthrough]];
59295 case X86ISD::BLENDV:
59296 // TODO: BLENDV should use CombineSubOperand.
59297 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
59298 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
59299 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
59300 EVT SelVT = Ops[0].getOperand(0).getValueType();
59301 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
59302 if (TLI.isTypeLegal(SelVT))
59303 return DAG.getNode(
59304 Opcode, DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
59305 ConcatSubOperand(VT, Ops, 1), ConcatSubOperand(VT, Ops, 2));
59306 }
59307 break;
59308 }
59309 }
59310
59311 // Fold subvector loads into one.
59312 // If needed, look through bitcasts to get to the load.
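// e.g. two 128-bit subvector loads from consecutive addresses can be merged
// into a single wider load when the target reports the wider access as fast.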
59313 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
59314 unsigned Fast;
59315 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
59316 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
59317 *FirstLd->getMemOperand(), &Fast) &&
59318 Fast) {
59319 if (SDValue Ld =
59320 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59321 return Ld;
59322 }
59323 }
59324
59325 // Attempt to fold target constant loads.
59326 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
59327 SmallVector<APInt> EltBits;
59328 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
59329 for (unsigned I = 0; I != NumOps; ++I) {
59330 APInt OpUndefElts;
59331 SmallVector<APInt> OpEltBits;
59332 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
59333 OpEltBits, /*AllowWholeUndefs*/ true,
59334 /*AllowPartialUndefs*/ false))
59335 break;
59336 EltBits.append(OpEltBits);
59337 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
59338 }
59339 if (EltBits.size() == VT.getVectorNumElements()) {
59340 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
59341 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
59342 SDValue CV = DAG.getConstantPool(C, PVT);
59343 MachinePointerInfo MPI =
59344 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
59345 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
59346 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
59347 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
59348 return Ld;
59349 }
59350 }
59351
59352 // If this simple subvector or scalar/subvector broadcast_load is inserted
59353 // into both halves, use a larger broadcast_load. Update other uses to use
59354 // an extracted subvector.
59355 if (IsSplat &&
59356 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
59357 if (ISD::isNormalLoad(Op0.getNode()) ||
59358 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
59359 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59360 auto *Mem = cast<MemSDNode>(Op0);
59361 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
59362 ? X86ISD::VBROADCAST_LOAD
59363 : X86ISD::SUBV_BROADCAST_LOAD;
59364 if (SDValue BcastLd =
59365 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
59366 SDValue BcastSrc =
59367 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
59368 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
59369 return BcastLd;
59370 }
59371 }
59372 }
59373
59374 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
59375 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
59376 Subtarget.useAVX512Regs()) {
59377 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
59378 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
59379 Res = DAG.getBitcast(ShuffleVT, Res);
59380 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
59381 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
59382 return DAG.getBitcast(VT, Res);
59383 }
59384
59385 // We can always convert per-lane vXf64 shuffles into VSHUFPD.
59386 if (!IsSplat &&
59387 ((NumOps == 2 && VT == MVT::v4f64) ||
59388 (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59389 all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
59390 // Collect the individual per-lane v2f64/v4f64 shuffles.
59391 MVT OpVT = Ops[0].getSimpleValueType();
59392 unsigned NumOpElts = OpVT.getVectorNumElements();
59393 SmallVector<SmallVector<SDValue, 2>, 4> SrcOps(NumOps);
59394 SmallVector<SmallVector<int, 8>, 4> SrcMasks(NumOps);
59395 if (all_of(seq<int>(NumOps), [&](int I) {
59396 return getTargetShuffleInputs(Ops[I], SrcOps[I], SrcMasks[I], DAG,
59397 Depth + 1) &&
59398 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59399 none_of(SrcMasks[I], isUndefOrZero) &&
59400 SrcMasks[I].size() == NumOpElts &&
59401 all_of(SrcOps[I], [&OpVT](SDValue V) {
59402 return V.getValueType() == OpVT;
59403 });
59404 })) {
59405 // Concatenate the shuffle masks into SHUFPD mask and collect subops.
59406 bool Unary = true;
59407 unsigned SHUFPDMask = 0;
59408 SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
59409 for (unsigned I = 0; I != NumOps; ++I) {
59410 LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59411 RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59412 Unary &= LHS[I] == RHS[I];
59413 for (unsigned J = 0; J != NumOpElts; ++J)
59414 SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59415 }
59416 // Concat SHUFPD LHS/RHS operands - if they match then it will become a
59417 // PERMILPD mask and we can always profitably concatenate them.
59418 SDValue Concat0 =
59419 combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59420 SDValue Concat1 =
59421 combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59422 if (Unary || Concat0 || Concat1) {
59423 Concat0 =
59424 Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59425 Concat1 =
59426 Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59427 return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59428 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59429 }
59430 }
59431 }
59432
59433 return SDValue();
59434}
59435
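// Target DAG combine for ISD::CONCAT_VECTORS. For vXi1 vectors this only
// attempts to constant-fold the concatenation into a bitcast of a single
// integer constant; for other types it defers to combineConcatVectorOps once
// AVX is available and both the result and source types are legal.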
59436static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
59437 TargetLowering::DAGCombinerInfo &DCI,
59438 const X86Subtarget &Subtarget) {
59439 EVT VT = N->getValueType(0);
59440 EVT SrcVT = N->getOperand(0).getValueType();
59441 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59442 SmallVector<SDValue, 4> Ops(N->ops());
59443 
59444 if (VT.getVectorElementType() == MVT::i1) {
59445 // Attempt to constant fold.
59446 unsigned SubSizeInBits = SrcVT.getSizeInBits();
59447 APInt Constant = APInt::getZero(VT.getSizeInBits());
59448 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
59449 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
59450 if (!C) break;
59451 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
59452 if (I == (E - 1)) {
59453 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
59454 if (TLI.isTypeLegal(IntVT))
59455 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
59456 }
59457 }
59458
59459 // Don't do anything else for i1 vectors.
59460 return SDValue();
59461 }
59462
59463 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
59464 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
59465 Subtarget))
59466 return R;
59467 }
59468
59469 return SDValue();
59470}
59471
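// Target DAG combine for ISD::INSERT_SUBVECTOR: folds insertions of zero or
// undef subvectors, merges nested insert/extract patterns into shuffles or
// blends, recognises concat_vectors-style patterns, and widens broadcasts and
// subvector loads that are splatted into both halves.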
59472static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59473 TargetLowering::DAGCombinerInfo &DCI,
59474 const X86Subtarget &Subtarget) {
59475 if (DCI.isBeforeLegalizeOps())
59476 return SDValue();
59477
59478 MVT OpVT = N->getSimpleValueType(0);
59479
59480 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
59481
59482 SDLoc dl(N);
59483 SDValue Vec = N->getOperand(0);
59484 SDValue SubVec = N->getOperand(1);
59485
59486 uint64_t IdxVal = N->getConstantOperandVal(2);
59487 MVT SubVecVT = SubVec.getSimpleValueType();
59488 int VecNumElts = OpVT.getVectorNumElements();
59489 int SubVecNumElts = SubVecVT.getVectorNumElements();
59490
59491 if (Vec.isUndef() && SubVec.isUndef())
59492 return DAG.getUNDEF(OpVT);
59493
59494 // Inserting undefs/zeros into zeros/undefs is a zero vector.
59495 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
59496 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
59497 return getZeroVector(OpVT, Subtarget, DAG, dl);
59498 
59499 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
59500 // If we're inserting into a zero vector and then into a larger zero vector,
59501 // just insert into the larger zero vector directly.
59502 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59503 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
59504 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
59505 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59506 getZeroVector(OpVT, Subtarget, DAG, dl),
59507 SubVec.getOperand(1),
59508 DAG.getVectorIdxConstant(IdxVal + Idx2Val, dl));
59509 }
59510
59511 // If we're inserting into a zero vector and our input was extracted from an
59512 // insert into a zero vector of the same type and the extraction was at
59513 // least as large as the original insertion. Just insert the original
59514 // subvector into a zero vector.
59515 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
59516 isNullConstant(SubVec.getOperand(1)) &&
59517 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
59518 SDValue Ins = SubVec.getOperand(0);
59519 if (isNullConstant(Ins.getOperand(2)) &&
59520 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
59521 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
59522 SubVecVT.getFixedSizeInBits())
59523 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59524 getZeroVector(OpVT, Subtarget, DAG, dl),
59525 Ins.getOperand(1), N->getOperand(2));
59526 }
59527 }
59528
59529 // Stop here if this is an i1 vector.
59530 if (IsI1Vector)
59531 return SDValue();
59532
59533 // Eliminate an intermediate vector widening:
59534 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
59535 // insert_subvector X, Y, Idx
59536 // TODO: This is a more general version of a DAGCombiner fold, can we move it
59537 // there?
59538 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
59539 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
59540 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
59541 SubVec.getOperand(1), N->getOperand(2));
59542
59543 // If this is an insert of an extract, combine to a shuffle. Don't do this
59544 // if the insert or extract can be represented with a subregister operation.
59545 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59546 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
59547 (IdxVal != 0 ||
59548 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
59549 SDValue ExtSrc = SubVec.getOperand(0);
59550 int ExtIdxVal = SubVec.getConstantOperandVal(1);
59551 // Create a shuffle mask matching the extraction and insertion.
59552 SmallVector<int, 64> Mask(VecNumElts);
59553 std::iota(Mask.begin(), Mask.end(), 0);
59554 std::iota(Mask.begin() + IdxVal, Mask.begin() + IdxVal + SubVecNumElts,
59555 ExtIdxVal + VecNumElts);
59556 if (ExtIdxVal != 0)
59557 return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
59558 // See if we can use a blend instead of extract/insert pair.
59559 SmallVector<int, 64> BlendMask(VecNumElts);
59560 std::iota(BlendMask.begin(), BlendMask.end(), 0);
59561 std::iota(BlendMask.begin() + IdxVal,
59562 BlendMask.begin() + IdxVal + SubVecNumElts, VecNumElts + IdxVal);
59563 if (isShuffleEquivalent(Mask, BlendMask, Vec, ExtSrc) &&
59564 VecNumElts == (2 * SubVecNumElts)) {
59565 assert((IdxVal % SubVecNumElts) == 0 && "Unaligned subvector insertion");
59566 if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
59567 SDValue Blend = DAG.getNode(
59568 X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec),
59569 DAG.getBitcast(MVT::v8f32, ExtSrc),
59570 DAG.getTargetConstant(IdxVal == 0 ? 0x0F : 0xF0, dl, MVT::i8));
59571 return DAG.getBitcast(OpVT, Blend);
59572 } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) {
59573 MVT ShufVT = OpVT.isInteger() ? MVT::v8i64 : MVT::v8f64;
59574 SDValue Lo = DAG.getBitcast(ShufVT, IdxVal == 0 ? ExtSrc : Vec);
59575 SDValue Hi = DAG.getBitcast(ShufVT, IdxVal == 0 ? Vec : ExtSrc);
59576 SDValue Shuffle =
59577 DAG.getNode(X86ISD::SHUF128, dl, ShufVT, Lo, Hi,
59578 getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG));
59579 return DAG.getBitcast(OpVT, Shuffle);
59580 }
59581 }
59582 }
59583
59584 // Match concat_vector style patterns.
59585 SmallVector<SDValue, 2> SubVectorOps;
59586 if (collectConcatOps(N, SubVectorOps, DAG)) {
59587 if (SDValue Fold =
59588 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, Subtarget))
59589 return Fold;
59590
59591 // If we're inserting all zeros into the upper half, change this to
59592 // a concat with zero. We will match this to a move
59593 // with implicit upper bit zeroing during isel.
59594 // We do this here because we don't want combineConcatVectorOps to
59595 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
59596 if (SubVectorOps.size() == 2 &&
59597 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
59598 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
59599 getZeroVector(OpVT, Subtarget, DAG, dl),
59600 SubVectorOps[0], DAG.getVectorIdxConstant(0, dl));
59601
59602 // Attempt to recursively combine to a shuffle.
59603 if (all_of(SubVectorOps, [](SDValue SubOp) {
59604 return isTargetShuffle(SubOp.getOpcode());
59605 })) {
59606 SDValue Op(N, 0);
59607 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59608 return Res;
59609 }
59610 }
59611
59612 // If this is a broadcast insert into an upper undef, use a larger broadcast.
59613 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
59614 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
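// Sketch of the broadcast widening above (types are assumed for exposition):
//   insert_subvector undef:v8f32, (X86ISD::VBROADCAST X):v4f32, 4
//     --> (X86ISD::VBROADCAST X):v8f32
// A wider broadcast of the same scalar also covers the previously undef half.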
59615
59616 // If this is a broadcast load inserted into an upper undef, use a larger
59617 // broadcast load.
59618 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
59619 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
59620 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
59621 return getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, dl, OpVT,
59622 MemIntr->getMemoryVT(), MemIntr, 0, DAG);
59623 }
59624
59625 // If we're splatting the lower half subvector of a full vector load into the
59626 // upper half, attempt to create a subvector broadcast.
59627 if ((int)IdxVal == (VecNumElts / 2) &&
59628 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
59629 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
59630 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
59631 if (VecLd && SubLd &&
59632 DAG.areNonVolatileConsecutiveLoads(
59633 SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
59634 SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
59635 SubVecVT, SubLd, 0, DAG);
59636 SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
59637 BcastLd, DAG.getVectorIdxConstant(0, dl));
59638 DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
59639 return BcastLd;
59640 }
59641 }
59642
59643 // Attempt to constant fold (if we're not widening).
59644 if (!Vec.isUndef() && !ISD::isBuildVectorAllZeros(Vec.getNode())) {
59645 unsigned EltSizeInBits = OpVT.getScalarSizeInBits();
59646 APInt VecUndefElts, SubUndefElts;
59647 SmallVector<APInt, 16> VecEltBits, SubEltBits;
59648 if (getTargetConstantBitsFromNode(Vec, EltSizeInBits, VecUndefElts,
59649 VecEltBits) &&
59650 getTargetConstantBitsFromNode(SubVec, EltSizeInBits, SubUndefElts,
59651 SubEltBits)) {
59652 VecUndefElts.insertBits(SubUndefElts, IdxVal);
59653 llvm::copy(SubEltBits, VecEltBits.begin() + IdxVal);
59654 return getConstVector(VecEltBits, VecUndefElts, OpVT, DAG, dl);
59655 }
59656 }
59657
59658 // Attempt to recursively combine to a shuffle.
59661 SDValue Op(N, 0);
59662 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
59663 return Res;
59664 }
59665
59666 // Match insertion of subvector load that perfectly aliases a base load.
59667 if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
59668 ISD::isNormalLoad(SubVec.getNode()) &&
59669 DAG.areNonVolatileConsecutiveLoads(
59670 cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
59671 SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
59672 return Vec;
59673
59674 return SDValue();
59675}
59676
59677/// If we are extracting a subvector of a vector select and the select condition
59678/// is composed of concatenated vectors, try to narrow the select width. This
59679/// is a common pattern for AVX1 integer code because 256-bit selects may be
59680/// legal, but there is almost no integer math/logic available for 256-bit.
59681/// This function should only be called with legal types (otherwise, the calls
59682/// to get simple value types will assert).
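/// Illustrative shape of the transform (operand types here are hypothetical):
///   extract_subvector (vselect Cond:v8i32, T:v8i32, F:v8i32), 4 : v4i32
///     --> vselect (extract Cond, 4), (extract T, 4), (extract F, 4) : v4i32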
59683static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
59684 SelectionDAG &DAG) {
59685 SDValue Sel = Ext->getOperand(0);
59686 if (Sel.getOpcode() != ISD::VSELECT ||
59687 !isFreeToSplitVector(Sel.getOperand(0), DAG))
59688 return SDValue();
59689
59690 // Note: We assume simple value types because this should only be called with
59691 // legal operations/types.
59692 // TODO: This can be extended to handle extraction to 256-bits.
59693 MVT VT = Ext->getSimpleValueType(0);
59694 if (!VT.is128BitVector())
59695 return SDValue();
59696
59697 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
59698 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
59699 return SDValue();
59700
59701 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
59702 MVT SelVT = Sel.getSimpleValueType();
59703 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
59704 "Unexpected vector type with legal operations");
59705
59706 unsigned SelElts = SelVT.getVectorNumElements();
59707 unsigned CastedElts = WideVT.getVectorNumElements();
59708 unsigned ExtIdx = Ext->getConstantOperandVal(1);
59709 if (SelElts % CastedElts == 0) {
59710 // The select has the same or more (narrower) elements than the extract
59711 // operand. The extraction index gets scaled by that factor.
59712 ExtIdx *= (SelElts / CastedElts);
59713 } else if (CastedElts % SelElts == 0) {
59714 // The select has fewer (wider) elements than the extract operand. Make sure
59715 // that the extraction index can be divided evenly.
59716 unsigned IndexDivisor = CastedElts / SelElts;
59717 if (ExtIdx % IndexDivisor != 0)
59718 return SDValue();
59719 ExtIdx /= IndexDivisor;
59720 } else {
59721 llvm_unreachable("Element count of simple vector types are not divisible?");
59722 }
59723
59724 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
59725 unsigned NarrowElts = SelElts / NarrowingFactor;
59726 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
59727 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
59728 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
59729 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
59730 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
59731 return DAG.getBitcast(VT, NarrowSel);
59732}
59733
59734static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
59735 TargetLowering::DAGCombinerInfo &DCI,
59736 const X86Subtarget &Subtarget) {
59737 if (!N->getValueType(0).isSimple())
59738 return SDValue();
59739
59740 MVT VT = N->getSimpleValueType(0);
59741 SDValue InVec = N->getOperand(0);
59742 unsigned IdxVal = N->getConstantOperandVal(1);
59743 EVT InVecVT = InVec.getValueType();
59744 unsigned SizeInBits = VT.getSizeInBits();
59745 unsigned InSizeInBits = InVecVT.getSizeInBits();
59746 unsigned NumSubElts = VT.getVectorNumElements();
59747 unsigned NumInElts = InVecVT.getVectorNumElements();
59748 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
59749 SDLoc DL(N);
59750
59751 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
59752 // eventually get combined/lowered into ANDNP) with a concatenated operand,
59753 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
59754 // We let generic combining take over from there to simplify the
59755 // insert/extract and 'not'.
59756 // This pattern emerges during AVX1 legalization. We handle it before lowering
59757 // to avoid complications like splitting constant vector loads.
59758 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
59759 InSizeInBits == 256 && InVec.getOpcode() == ISD::AND) {
59760 auto isConcatenatedNot = [](SDValue V) {
59761 V = peekThroughBitcasts(V);
59762 if (!isBitwiseNot(V))
59763 return false;
59764 SDValue NotOp = V->getOperand(0);
59765 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
59766 };
59767 if (isConcatenatedNot(InVec.getOperand(0)) ||
59768 isConcatenatedNot(InVec.getOperand(1))) {
59769 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
59770 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
59771 splitVectorIntBinary(InVec, DAG, DL),
59772 N->getOperand(1));
59773 }
59774 }
59775
59776 if (DCI.isBeforeLegalizeOps())
59777 return SDValue();
59778
59779 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
59780 return V;
59781
59782 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
59783 return getZeroVector(VT, Subtarget, DAG, DL);
59784
59785 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
59786 if (VT.getScalarType() == MVT::i1)
59787 return DAG.getConstant(1, DL, VT);
59788 return getOnesVector(VT, DAG, DL);
59789 }
59790
59791 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
59792 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
59793
59794 // EXTRACT_SUBVECTOR(EXTRACT_SUBVECTOR(V,C1)),C2) - EXTRACT_SUBVECTOR(V,C1+C2)
59795 if (IdxVal != 0 && InVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
59796 InVec.hasOneUse() && TLI.isTypeLegal(VT) &&
59797 TLI.isTypeLegal(InVec.getOperand(0).getValueType())) {
59798 unsigned NewIdx = IdxVal + InVec.getConstantOperandVal(1);
59799 return extractSubVector(InVec.getOperand(0), NewIdx, DAG, DL, SizeInBits);
59800 }
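// For example (values chosen only for illustration): extracting a v4i32 at
// index 4 from a v8i32 that was itself extracted at index 8 of a v16i32
// source V becomes a single extract_subvector of V at index 12.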
59801
59802 // EXTRACT_SUBVECTOR(INSERT_SUBVECTOR(SRC,SUB,C1),C2)
59803 // --> INSERT_SUBVECTOR(EXTRACT_SUBVECTOR(SRC,C2),SUB,C1-C2)
59804 // iff SUB is entirely contained in the extraction.
59805 if (VT.getVectorElementType() != MVT::i1 && TLI.isTypeLegal(VT) &&
59806 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse()) {
59807 SDValue Src = InVec.getOperand(0);
59808 SDValue Sub = InVec.getOperand(1);
59809 EVT SubVT = Sub.getValueType();
59810 uint64_t InsIdx = InVec.getConstantOperandVal(2);
59811 if (IdxVal <= InsIdx &&
59812 (IdxVal + NumSubElts) >= (InsIdx + SubVT.getVectorNumElements())) {
59813 SDValue NewSrc = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
59814 DAG.getVectorIdxConstant(IdxVal, DL));
59815 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewSrc, Sub,
59816 DAG.getVectorIdxConstant(InsIdx - IdxVal, DL));
59817 }
59818 }
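// For example (illustrative types/indices): with a v16i32 source,
//   extract_subvector (insert_subvector Src, Sub:v4i32, 4), 4 : v4i32
//     --> insert_subvector (extract_subvector Src, 4):v4i32, Sub, 0
// which later combines can simplify further (here, down to Sub itself).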
59819
59820 // If we're extracting an upper subvector, see if we'd get the same elements
59821 // by extracting the lowest subvector instead, which should allow
59822 // SimplifyDemandedVectorElts to do more simplifications.
59823 if (IdxVal != 0) {
59824 bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
59825 return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
59826 });
59827 if (AllEquiv)
59828 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
59829 }
59830
59831 // Check if we're extracting a whole broadcasted subvector.
59832 if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
59833 auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
59834 EVT MemVT = MemIntr->getMemoryVT();
59835 if (MemVT == VT) {
59836 // If this is the only use, we can replace with a regular load (this may
59837 // have been missed by SimplifyDemandedVectorElts due to extra uses of the
59838 // memory chain).
59839 if (InVec.hasOneUse()) {
59840 SDValue Ld =
59841 DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
59842 MemIntr->getMemOperand());
59843 DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
59844 return Ld;
59845 }
59846 }
59847 }
59848
59849 // Attempt to extract from the source of a shuffle vector.
59850 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
59851 SmallVector<int, 32> ShuffleMask;
59852 SmallVector<int, 32> ScaledMask;
59853 SmallVector<SDValue, 2> ShuffleInputs;
59854 unsigned NumSubVecs = InSizeInBits / SizeInBits;
59855 // Decode the shuffle mask and scale it so it's shuffling subvectors.
59856 if (getTargetShuffleInputs(InVec, ShuffleInputs, ShuffleMask, DAG) &&
59857 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
59858 unsigned SubVecIdx = IdxVal / NumSubElts;
59859 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
59860 return DAG.getUNDEF(VT);
59861 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
59862 return getZeroVector(VT, Subtarget, DAG, DL);
59863 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
59864 if (Src.getValueSizeInBits() == InSizeInBits) {
59865 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
59866 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
59867 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
59868 DL, SizeInBits);
59869 }
59870 }
59871 }
59872
59873 auto IsExtractFree = [](SDValue V) {
59874 if (V.hasOneUse()) {
59875 V = peekThroughOneUseBitcasts(V);
59876 if (V.getOpcode() == ISD::LOAD)
59877 return true;
59878 }
59879 V = peekThroughBitcasts(V);
59880 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
59881 return true;
59882 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
59883 return true;
59884 return V.isUndef();
59885 };
59886
59887 // If we're extracting the lowest subvector and we're the only user,
59888 // we may be able to perform this with a smaller vector width.
59889 unsigned InOpcode = InVec.getOpcode();
59890 if (InVec.hasOneUse()) {
59891 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
59892 // v2f64 CVTDQ2PD(v4i32).
59893 if (InOpcode == ISD::SINT_TO_FP &&
59894 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59895 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
59896 }
59897 // v2f64 CVTUDQ2PD(v4i32).
59898 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
59899 InVec.getOperand(0).getValueType() == MVT::v4i32) {
59900 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
59901 }
59902 // v2f64 CVTPS2PD(v4f32).
59903 if (InOpcode == ISD::FP_EXTEND &&
59904 InVec.getOperand(0).getValueType() == MVT::v4f32) {
59905 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
59906 }
59907 }
59908 // v4i32 CVTPS2DQ(v4f32) / CVTPS2UDQ(v4f32).
59909 // v4f32 CVTDQ2PS(v4i32) / CVTUDQ2PS(v4i32).
59910 if ((InOpcode == ISD::FP_TO_SINT || InOpcode == ISD::SINT_TO_FP ||
59911 ((InOpcode == ISD::FP_TO_UINT || InOpcode == ISD::UINT_TO_FP) &&
59912 Subtarget.hasVLX())) &&
59913 (VT == MVT::v4i32 || VT == MVT::v4f32)) {
59914 SDValue Src = InVec.getOperand(0);
59915 if (Src.getValueType().getScalarSizeInBits() == 32)
59916 return DAG.getNode(InOpcode, DL, VT,
59917 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
59918 }
59919 if (IdxVal == 0 &&
59920 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
59921 (SizeInBits == 128 || SizeInBits == 256) &&
59922 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
59923 SDValue Ext = InVec.getOperand(0);
59924 if (Ext.getValueSizeInBits() > SizeInBits)
59925 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
59926 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
59927 return DAG.getNode(ExtOp, DL, VT, Ext);
59928 }
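// Example of the narrowing above (concrete types are assumed, not implied by
// the code):
//   extract_subvector (v16i32 zero_extend (v16i8 X)), 0 : v4i32
//     --> v4i32 zero_extend_vector_inreg (v16i8 X)
// Only the low lanes are demanded, so the extend can be done at 128 bits.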
59929 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
59930 InVec.getOperand(0).getValueType().is256BitVector() &&
59931 InVec.getOperand(1).getValueType().is256BitVector() &&
59932 InVec.getOperand(2).getValueType().is256BitVector()) {
59933 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
59934 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
59935 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
59936 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
59937 }
59938 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
59939 (SizeInBits == 128 || SizeInBits == 256)) {
59940 SDValue InVecSrc = InVec.getOperand(0);
59941 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
59942 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
59943 return DAG.getNode(InOpcode, DL, VT, Ext);
59944 }
59945
59946 if (SizeInBits == 128 || SizeInBits == 256) {
59947 switch (InOpcode) {
59948 case X86ISD::MOVDDUP:
59949 return DAG.getNode(
59950 InOpcode, DL, VT,
59951 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits));
59952 case X86ISD::PSHUFD:
59953 case X86ISD::VPERMILPI:
59954 if (InVec.getOperand(0).hasOneUse()) {
59955 uint64_t M = InVec.getConstantOperandVal(1) & 255;
59956 M = VT.getScalarSizeInBits() < 64 ? M : (M >> IdxVal);
59957 return DAG.getNode(InOpcode, DL, VT,
59958 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59959 DL, SizeInBits),
59960 DAG.getTargetConstant(M, DL, MVT::i8));
59961 }
59962 break;
59963 case X86ISD::PCMPEQ:
59964 case X86ISD::PCMPGT:
59965 case X86ISD::UNPCKH:
59966 case X86ISD::UNPCKL:
59967 if (IsExtractFree(InVec.getOperand(0)) ||
59968 IsExtractFree(InVec.getOperand(1)))
59969 return DAG.getNode(InOpcode, DL, VT,
59970 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59971 DL, SizeInBits),
59972 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59973 DL, SizeInBits));
59974 break;
59975 case X86ISD::CMPP:
59976 if (IsExtractFree(InVec.getOperand(0)) ||
59977 IsExtractFree(InVec.getOperand(1)))
59978 return DAG.getNode(InOpcode, DL, VT,
59979 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59980 DL, SizeInBits),
59981 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59982 DL, SizeInBits),
59983 InVec.getOperand(2));
59984 break;
59985 case X86ISD::BLENDI:
59986 if (IsExtractFree(InVec.getOperand(0)) ||
59987 IsExtractFree(InVec.getOperand(1))) {
59988 uint64_t M = InVec.getConstantOperandVal(2) & 255;
59989 M = VT.getScalarType() == MVT::i16 ? M : (M >> IdxVal);
59990 return DAG.getNode(InOpcode, DL, VT,
59991 extractSubVector(InVec.getOperand(0), IdxVal, DAG,
59992 DL, SizeInBits),
59993 extractSubVector(InVec.getOperand(1), IdxVal, DAG,
59994 DL, SizeInBits),
59995 DAG.getTargetConstant(M, DL, MVT::i8));
59996 }
59997 break;
59998 case X86ISD::VPERMV:
59999 if (IdxVal != 0) {
60000 SDValue Mask = InVec.getOperand(0);
60001 SDValue Src = InVec.getOperand(1);
60002 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
60003 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
60004 DL, InSizeInBits);
60005 SDValue Shuffle = DAG.getNode(InOpcode, DL, InVecVT, Mask, Src);
60006 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
60007 }
60008 break;
60009 case X86ISD::VPERMV3:
60010 if (IdxVal != 0) {
60011 SDValue Src0 = InVec.getOperand(0);
60012 SDValue Mask = InVec.getOperand(1);
60013 SDValue Src1 = InVec.getOperand(2);
60014 Mask = extractSubVector(Mask, IdxVal, DAG, DL, SizeInBits);
60015 Mask = widenSubVector(Mask, /*ZeroNewElements=*/false, Subtarget, DAG,
60016 DL, InSizeInBits);
60017 SDValue Shuffle =
60018 DAG.getNode(InOpcode, DL, InVecVT, Src0, Mask, Src1);
60019 return extractSubVector(Shuffle, 0, DAG, DL, SizeInBits);
60020 }
60021 break;
60022 }
60023 }
60024 }
60025
60026 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
60027 // as this is very likely to fold into a shuffle/truncation.
60028 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
60029 InVecVT.getScalarSizeInBits() == 64 &&
60030 InVec.getConstantOperandAPInt(1) == 32) {
60031 SDValue Ext =
60032 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
60033 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
60034 }
60035
60036 return SDValue();
60037}
60038
60039static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG,
60040 const X86Subtarget &Subtarget) {
60041 using namespace SDPatternMatch;
60042 EVT VT = N->getValueType(0);
60043 SDValue Src = N->getOperand(0);
60044 SDLoc DL(N);
60045
60046 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
60047 // This occurs frequently in our masked scalar intrinsic code and our
60048 // floating point select lowering with AVX512.
60049 // TODO: SimplifyDemandedBits instead?
60050 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
60051 isOneConstant(Src.getOperand(1)))
60052 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
60053
60054 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
60055 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60056 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
60057 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
60058 isNullConstant(Src.getOperand(1)))
60059 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
60060 Src.getOperand(1));
60061
60062 // Reduce v2i64 to v4i32 if we don't need the upper bits or they're known zero.
60063 // TODO: Move to DAGCombine/SimplifyDemandedBits?
60064 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
60065 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
60066 if (Op.getValueType() != MVT::i64)
60067 return SDValue();
60068 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
60069 if (Op.getOpcode() == Opc &&
60070 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
60071 return Op.getOperand(0);
60072 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
60073 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
60074 if (Ld->getExtensionType() == Ext &&
60075 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
60076 return Op;
60077 if (IsZeroExt) {
60078 KnownBits Known = DAG.computeKnownBits(Op);
60079 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
60080 return Op;
60081 }
60082 return SDValue();
60083 };
60084
60085 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
60086 return DAG.getBitcast(
60087 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60088 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
60089
60090 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
60091 return DAG.getBitcast(
60092 VT,
60093 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
60094 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
60095 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
60096 }
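// Example of the reduction above (operands are hypothetical): a zero-extended
// 32-bit value placed into a v2i64,
//   scalar_to_vector (i64 zero_extend X:i32)
//     --> bitcast (X86ISD::VZEXT_MOVL (v4i32 scalar_to_vector X))
// keeping a 32-bit insertion plus a VZEXT_MOVL that zeroes the upper lanes.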
60097
60098 if (Src.getOpcode() == ISD::BITCAST) {
60099 SDValue SrcOp = Src.getOperand(0);
60100 // Combine (v4i32 (scalar_to_vector (i32 (bitcast (float))))) to MOVD.
60101 if (VT == MVT::v4i32 && SrcOp.getValueType() == MVT::f32)
60102 return DAG.getBitcast(
60103 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, SrcOp));
60104 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (double))))) to MOVQ.
60105 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::f64)
60106 return DAG.getBitcast(
60107 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, SrcOp));
60108 // Combine (v2i64 (scalar_to_vector (i64 (bitcast (mmx))))) to MOVQ2DQ.
60109 if (VT == MVT::v2i64 && SrcOp.getValueType() == MVT::x86mmx)
60110 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, SrcOp);
60111 }
60112
60113 if (VT == MVT::v4i32) {
60114 SDValue HalfSrc;
60115 // Combine (v4i32 (scalar_to_vector (i32 (anyext (bitcast (f16))))))
60116 // to remove XMM->GPR->XMM moves.
60117 if (sd_match(Src, m_AnyExt(m_BitCast(
60118 m_AllOf(m_SpecificVT(MVT::f16), m_Value(HalfSrc))))))
60119 return DAG.getBitcast(
60120 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, HalfSrc));
60121 }
60122
60123 // See if we're broadcasting the scalar value, in which case just reuse that.
60124 // Ensure the same SDValue from the SDNode use is being used.
60125 if (VT.getScalarType() == Src.getValueType())
60126 for (SDNode *User : Src->users())
60127 if (User->getOpcode() == X86ISD::VBROADCAST &&
60128 Src == User->getOperand(0)) {
60129 unsigned SizeInBits = VT.getFixedSizeInBits();
60130 unsigned BroadcastSizeInBits =
60131 User->getValueSizeInBits(0).getFixedValue();
60132 if (BroadcastSizeInBits == SizeInBits)
60133 return SDValue(User, 0);
60134 if (BroadcastSizeInBits > SizeInBits)
60135 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
60136 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
60137 // coverage.
60138 }
60139
60140 // Check for cases where we've ended up with a scalarized shift, typically
60141 // during type legalization.
60142 switch (Src.getOpcode()) {
60143 case ISD::SHL:
60144 case ISD::SRL:
60145 case ISD::SRA:
60146 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
60147 if (supportedVectorShiftWithImm(VT, Subtarget, Src.getOpcode()) &&
60148 Src.hasOneUse()) {
60149 SDValue SrcVec =
60150 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60151 unsigned Opc = getTargetVShiftUniformOpcode(Src.getOpcode(), false);
60152 return getTargetVShiftByConstNode(Opc, DL, VT.getSimpleVT(), SrcVec,
60153 Amt->getZExtValue(), DAG);
60154 }
60155 }
60156 break;
60157 case ISD::FSHL:
60158 case ISD::FSHR:
60159 if (auto *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(2))) {
60160 if (supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL) &&
60161 Src.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60162 Src.getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
60163 Src.hasOneUse()) {
60164 uint64_t AmtVal =
60165 Amt->getAPIntValue().urem(Src.getScalarValueSizeInBits());
60166 SDValue SrcVec0 =
60167 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(0));
60168 SDValue SrcVec1 =
60169 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Src.getOperand(1));
60170 return DAG.getNode(Src.getOpcode(), DL, VT, SrcVec0, SrcVec1,
60171 DAG.getConstant(AmtVal, DL, VT));
60172 }
60173 }
60174 break;
60175 }
60176
60177 return SDValue();
60178}
60179
60180// Simplify PMULDQ and PMULUDQ operations.
60181static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
60182 TargetLowering::DAGCombinerInfo &DCI,
60183 const X86Subtarget &Subtarget) {
60184 SDValue LHS = N->getOperand(0);
60185 SDValue RHS = N->getOperand(1);
60186
60187 // Canonicalize constant to RHS.
60188 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
60189 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
60190 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
60191
60192 // Multiply by zero.
60193 // Don't return RHS as it may contain UNDEFs.
60194 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
60195 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
60196
60197 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
60198 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60199 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
60200 return SDValue(N, 0);
60201
60202 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
60203 // convert it to any_extend_invec, due to the LegalOperations check, do the
60204 // conversion directly to a vector shuffle manually. This exposes combine
60205 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
60206 // combineX86ShufflesRecursively on SSE4.1 targets.
60207 // FIXME: This is basically a hack around several other issues related to
60208 // ANY_EXTEND_VECTOR_INREG.
60209 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
60210 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60211 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60212 LHS.getOperand(0).getValueType() == MVT::v4i32) {
60213 SDLoc dl(N);
60214 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
60215 LHS.getOperand(0), { 0, -1, 1, -1 });
60216 LHS = DAG.getBitcast(MVT::v2i64, LHS);
60217 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60218 }
60219 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
60220 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
60221 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
60222 RHS.getOperand(0).getValueType() == MVT::v4i32) {
60223 SDLoc dl(N);
60224 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
60225 RHS.getOperand(0), { 0, -1, 1, -1 });
60226 RHS = DAG.getBitcast(MVT::v2i64, RHS);
60227 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
60228 }
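// Shape of the rewrite above (illustrative): for a v2i64
//   PMULUDQ (zero_extend_vector_inreg X:v4i32), Y
// the extend is re-expressed as shuffle X,X,{0,-1,1,-1} bitcast to v2i64;
// PMULUDQ only reads the low 32 bits of each lane, so the undef odd elements
// are harmless and the shuffle can now fold with neighbouring shuffles.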
60229
60230 return SDValue();
60231}
60232
60233// Simplify VPMADDUBSW/VPMADDWD operations.
60234static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
60235 TargetLowering::DAGCombinerInfo &DCI) {
60236 MVT VT = N->getSimpleValueType(0);
60237 SDValue LHS = N->getOperand(0);
60238 SDValue RHS = N->getOperand(1);
60239 unsigned Opc = N->getOpcode();
60240 bool IsPMADDWD = Opc == X86ISD::VPMADDWD;
60242 "Unexpected PMADD opcode");
60243
60244 // Multiply by zero.
60245 // Don't return LHS/RHS as it may contain UNDEFs.
60246 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
60247 ISD::isBuildVectorAllZeros(RHS.getNode()))
60248 return DAG.getConstant(0, SDLoc(N), VT);
60249
60250 // Constant folding.
60251 APInt LHSUndefs, RHSUndefs;
60252 SmallVector<APInt> LHSBits, RHSBits;
60253 unsigned SrcEltBits = LHS.getScalarValueSizeInBits();
60254 unsigned DstEltBits = VT.getScalarSizeInBits();
60255 if (getTargetConstantBitsFromNode(LHS, SrcEltBits, LHSUndefs, LHSBits) &&
60256 getTargetConstantBitsFromNode(RHS, SrcEltBits, RHSUndefs, RHSBits)) {
60257 SmallVector<APInt> Result;
60258 for (unsigned I = 0, E = LHSBits.size(); I != E; I += 2) {
60259 APInt LHSLo = LHSBits[I + 0], LHSHi = LHSBits[I + 1];
60260 APInt RHSLo = RHSBits[I + 0], RHSHi = RHSBits[I + 1];
60261 LHSLo = IsPMADDWD ? LHSLo.sext(DstEltBits) : LHSLo.zext(DstEltBits);
60262 LHSHi = IsPMADDWD ? LHSHi.sext(DstEltBits) : LHSHi.zext(DstEltBits);
60263 APInt Lo = LHSLo * RHSLo.sext(DstEltBits);
60264 APInt Hi = LHSHi * RHSHi.sext(DstEltBits);
60265 APInt Res = IsPMADDWD ? (Lo + Hi) : Lo.sadd_sat(Hi);
60266 Result.push_back(Res);
60267 }
60268 return getConstVector(Result, VT, DAG, SDLoc(N));
60269 }
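// Worked example of the constant folding above (numbers picked arbitrarily):
// for VPMADDWD, a result lane built from the 16-bit pairs (2, 3) and (4, 5)
// is sext(2)*sext(4) + sext(3)*sext(5) = 8 + 15 = 23. VPMADDUBSW instead
// zero-extends the first operand, sign-extends the second, and saturates the
// sum of the two products.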
60270
60271 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60272 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60273 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60274 return SDValue(N, 0);
60275
60276 return SDValue();
60277}
60278
60279// Simplify VPMADD52L/VPMADD52H operations.
60280static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
60281 TargetLowering::DAGCombinerInfo &DCI) {
60282 MVT VT = N->getSimpleValueType(0);
60283
60284 bool AddLow = N->getOpcode() == X86ISD::VPMADD52L;
60285 SDValue Op0 = N->getOperand(0);
60286 SDValue Op1 = N->getOperand(1);
60287 SDValue Op2 = N->getOperand(2);
60288 SDLoc DL(N);
60289
60290 APInt C0, C1;
60291 bool HasC0 = X86::isConstantSplat(Op0, C0),
60292 HasC1 = X86::isConstantSplat(Op1, C1);
60293
60294 // lo/hi(C * X) + Z --> lo/hi(X * C) + Z
60295 if (HasC0 && !HasC1)
60296 return DAG.getNode(N->getOpcode(), DL, VT, Op1, Op0, Op2);
60297
60298 // lo(X * 1) + Z --> lo(X) + Z iff X == lo(X)
60299 if (AddLow && HasC1 && C1.trunc(52).isOne()) {
60300 KnownBits KnownOp0 = DAG.computeKnownBits(Op0);
60301 if (KnownOp0.countMinLeadingZeros() >= 12)
60302 return DAG.getNode(ISD::ADD, DL, VT, Op0, Op2);
60303 }
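// Reasoning sketch for the bound above: only the low 52 bits of the
// multiplicand participate, so at least 12 known-zero leading bits of the
// 64-bit element guarantee X == lo52(X), and hence lo52(X * 1) + Z == X + Z.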
60304
60305 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60306 unsigned NumEltBits = VT.getScalarSizeInBits();
60307 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
60308 DCI))
60309 return SDValue(N, 0);
60310
60311 return SDValue();
60312}
60313
60314static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
60315 TargetLowering::DAGCombinerInfo &DCI,
60316 const X86Subtarget &Subtarget) {
60317 EVT VT = N->getValueType(0);
60318 SDValue In = N->getOperand(0);
60319 unsigned Opcode = N->getOpcode();
60320 unsigned InOpcode = In.getOpcode();
60321 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60322 SDLoc DL(N);
60323
60324 // Try to merge vector loads and extend_inreg to an extload.
60325 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
60326 In.hasOneUse()) {
60327 auto *Ld = cast<LoadSDNode>(In);
60328 if (Ld->isSimple()) {
60329 MVT SVT = In.getSimpleValueType().getVectorElementType();
60330 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
60331 ? ISD::SEXTLOAD
60332 : ISD::ZEXTLOAD;
60333 EVT MemVT = VT.changeVectorElementType(SVT);
60334 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
60335 SDValue Load = DAG.getExtLoad(
60336 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
60337 MemVT, Ld->getBaseAlign(), Ld->getMemOperand()->getFlags());
60338 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
60339 return Load;
60340 }
60341 }
60342 }
60343
60344 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
60345 if (Opcode == InOpcode)
60346 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
60347
60348 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
60349 // -> EXTEND_VECTOR_INREG(X).
60350 // TODO: Handle non-zero subvector indices.
60351 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
60352 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
60353 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
60354 In.getValueSizeInBits())
60355 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
60356
60357 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
60358 // TODO: Move to DAGCombine?
60359 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
60360 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
60361 In.getValueSizeInBits() == VT.getSizeInBits()) {
60362 unsigned NumElts = VT.getVectorNumElements();
60363 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
60364 EVT EltVT = In.getOperand(0).getValueType();
60365 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
60366 for (unsigned I = 0; I != NumElts; ++I)
60367 Elts[I * Scale] = In.getOperand(I);
60368 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
60369 }
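// Example of the BUILD_VECTOR fold above (types assumed for illustration):
//   v4i32 zero_extend_vector_inreg (v8i16 build_vector x0 ... x7)
//     --> bitcast (v8i16 build_vector x0, 0, x1, 0, x2, 0, x3, 0)
// Interleaving zeros reproduces the zero-extension of the low elements.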
60370
60371 // Attempt to combine as a shuffle on SSE41+ targets.
60372 if (Subtarget.hasSSE41()) {
60373 SDValue Op(N, 0);
60374 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
60375 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
60376 return Res;
60377 }
60378
60379 return SDValue();
60380}
60381
60382static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
60383 TargetLowering::DAGCombinerInfo &DCI) {
60384 EVT VT = N->getValueType(0);
60385 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60386 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
60387 return DAG.getConstant(0, SDLoc(N), VT);
60388
60389 // Fold kshiftr(extract_subvector(X,C1),C2)
60390 // --> extract_subvector(kshiftr(X,C1+C2),0)
60391 // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
60392 if (N->getOpcode() == X86ISD::KSHIFTR) {
60393 SDLoc DL(N);
60394 if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
60395 N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
60396 SDValue Src = N->getOperand(0).getOperand(0);
60397 uint64_t Amt = N->getConstantOperandVal(1) +
60398 N->getOperand(0).getConstantOperandVal(1);
60399 EVT SrcVT = Src.getValueType();
60400 if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
60401 SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
60402 DAG.getTargetConstant(Amt, DL, MVT::i8));
60403 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
60404 DAG.getVectorIdxConstant(0, DL));
60405 }
60406 }
60407 }
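// Concrete instance of the shift folds above (mask type assumed): with
// X: v16i1, kshiftr (kshiftr X, 4), 2 becomes kshiftr X, 6, re-extracted at
// index 0 when the source type is wider than the result type.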
60408
60409 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
60410 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
60411 return SDValue(N, 0);
60412
60413 return SDValue();
60414}
60415
60416// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
60417// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
60418// extra instructions between the conversion due to going to scalar and back.
60419static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
60420 const X86Subtarget &Subtarget) {
60421 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
60422 return SDValue();
60423
60424 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
60425 return SDValue();
60426
60427 if (N->getValueType(0) != MVT::f32 ||
60428 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
60429 return SDValue();
60430
60431 SDLoc dl(N);
60432 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
60433 N->getOperand(0).getOperand(0));
60434 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
60435 DAG.getTargetConstant(4, dl, MVT::i32));
60436 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
60437 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
60438 DAG.getVectorIdxConstant(0, dl));
60439}
60440
60441static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
60442 TargetLowering::DAGCombinerInfo &DCI,
60443 const X86Subtarget &Subtarget) {
60444 EVT VT = N->getValueType(0);
60445 bool IsStrict = N->isStrictFPOpcode();
60446 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60447 EVT SrcVT = Src.getValueType();
60448
60449 SDLoc dl(N);
60450 if (SrcVT.getScalarType() == MVT::bf16) {
60451 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
60452 !IsStrict && Src.getOperand(0).getValueType() == VT)
60453 return Src.getOperand(0);
60454
60455 if (!SrcVT.isVector())
60456 return SDValue();
60457
60458 assert(!IsStrict && "Strict FP doesn't support BF16");
60459 if (VT.getVectorElementType() == MVT::f64) {
60460 EVT TmpVT = VT.changeVectorElementType(MVT::f32);
60461 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
60462 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
60463 }
60464 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
60465 EVT NVT = SrcVT.changeVectorElementType(MVT::i32);
60466 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
60467 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
60468 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
60469 return DAG.getBitcast(VT, Src);
60470 }
60471
60472 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60473 return SDValue();
60474
60475 if (Subtarget.hasFP16())
60476 return SDValue();
60477
60478 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
60479 return SDValue();
60480
60481 if (VT.getVectorElementType() != MVT::f32 &&
60482 VT.getVectorElementType() != MVT::f64)
60483 return SDValue();
60484
60485 unsigned NumElts = VT.getVectorNumElements();
60486 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60487 return SDValue();
60488
60489 // Convert the input to vXi16.
60490 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
60491 Src = DAG.getBitcast(IntVT, Src);
60492
60493 // Widen to at least 8 input elements.
60494 if (NumElts < 8) {
60495 unsigned NumConcats = 8 / NumElts;
60496 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
60497 : DAG.getConstant(0, dl, IntVT);
60498 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
60499 Ops[0] = Src;
60500 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
60501 }
60502
60503 // Destination is vXf32 with at least 4 elements.
60504 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
60505 std::max(4U, NumElts));
60506 SDValue Cvt, Chain;
60507 if (IsStrict) {
60508 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
60509 {N->getOperand(0), Src});
60510 Chain = Cvt.getValue(1);
60511 } else {
60512 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
60513 }
60514
60515 if (NumElts < 4) {
60516 assert(NumElts == 2 && "Unexpected size");
60517 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
60518 DAG.getVectorIdxConstant(0, dl));
60519 }
60520
60521 if (IsStrict) {
60522 // Extend to the original VT if necessary.
60523 if (Cvt.getValueType() != VT) {
60524 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
60525 {Chain, Cvt});
60526 Chain = Cvt.getValue(1);
60527 }
60528 return DAG.getMergeValues({Cvt, Chain}, dl);
60529 }
60530
60531 // Extend to the original VT if necessary.
60532 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
60533}
60534
60535// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract.
60536static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
60537 TargetLowering::DAGCombinerInfo &DCI) {
60538 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
60539 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
60540 "Unknown broadcast load type");
60541
60542 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
60543 SDValue Ptr = MemIntrin->getBasePtr();
60544 SDValue Chain = MemIntrin->getChain();
60545 EVT VT = N->getSimpleValueType(0);
60546 EVT MemVT = MemIntrin->getMemoryVT();
60547
60548 // Look at other users of our base pointer and try to find a wider broadcast.
60549 // The input chain and the size of the memory VT must match.
60550 for (SDNode *User : Ptr->users())
60551 if (User != N && User->getOpcode() == N->getOpcode() &&
60552 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
60553 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
60554 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
60555 MemVT.getSizeInBits() &&
60556 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
60557 assert(cast<MemIntrinsicSDNode>(User)->isSimple() &&
60558 MemIntrin->isSimple() && "Illegal broadcast load type");
60560 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
60561 VT.getSizeInBits());
60562 Extract = DAG.getBitcast(VT, Extract);
60563 Extract = DCI.CombineTo(N, Extract, SDValue(User, 1));
60564 return Extract;
60565 }
60566
60567 return SDValue();
60568}
60569
60570static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
60571 const X86Subtarget &Subtarget) {
60572 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
60573 return SDValue();
60574
60575 bool IsStrict = N->isStrictFPOpcode();
60576 EVT VT = N->getValueType(0);
60577 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
60578 EVT SrcVT = Src.getValueType();
60579
60580 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
60581 SrcVT.getVectorElementType() != MVT::f32)
60582 return SDValue();
60583
60584 SDLoc dl(N);
60585
60586 SDValue Cvt, Chain;
60587 unsigned NumElts = VT.getVectorNumElements();
60588 if (Subtarget.hasFP16()) {
60589 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64),
60590 // v4f32 (xint_to_fp v4i64))))
60591 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64),
60592 // v8f16 (CVTXI2P v4i64)))
60593 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS &&
60594 Src.getNumOperands() == 2) {
60595 SDValue Cvt0, Cvt1;
60596 SDValue Op0 = Src.getOperand(0);
60597 SDValue Op1 = Src.getOperand(1);
60598 bool IsOp0Strict = Op0->isStrictFPOpcode();
60599 if (Op0.getOpcode() != Op1.getOpcode() ||
60600 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
60601 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
60602 return SDValue();
60603 }
60604 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
60605 if (IsStrict) {
60606 assert(IsOp0Strict && "Op0 must be strict node");
60607 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
60608 ? X86ISD::STRICT_CVTSI2P
60609 : X86ISD::STRICT_CVTUI2P;
60610 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60611 {Op0.getOperand(0), Op0.getOperand(1)});
60612 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
60613 {Op1.getOperand(0), Op1.getOperand(1)});
60614 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60615 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
60616 }
60617 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
60618 : X86ISD::CVTUI2P;
60619 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
60620 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
60621 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
60622 }
60623 return SDValue();
60624 }
60625
60626 if (NumElts == 1 || !isPowerOf2_32(NumElts))
60627 return SDValue();
60628
60629 // Widen to at least 4 input elements.
60630 if (NumElts < 4)
60631 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
60632 DAG.getConstantFP(0.0, dl, SrcVT));
60633
60634 // Destination is v8i16 with at least 8 elements.
60635 EVT CvtVT =
60636 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
60637 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
60638 if (IsStrict) {
60639 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
60640 {N->getOperand(0), Src, Rnd});
60641 Chain = Cvt.getValue(1);
60642 } else {
60643 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
60644 }
60645
60646 // Extract down to real number of elements.
60647 if (NumElts < 8) {
60648 EVT IntVT = VT.changeVectorElementTypeToInteger();
60649 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
60650 DAG.getVectorIdxConstant(0, dl));
60651 }
60652
60653 Cvt = DAG.getBitcast(VT, Cvt);
60654
60655 if (IsStrict)
60656 return DAG.getMergeValues({Cvt, Chain}, dl);
60657
60658 return Cvt;
60659}
60660
60661static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
60662 SDValue Src = N->getOperand(0);
60663
60664 // Turn MOVDQ2Q+simple_load into an mmx load.
60665 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
60666 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
60667
60668 if (LN->isSimple()) {
60669 SDValue NewLd =
60670 DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), LN->getBasePtr(),
60671 LN->getPointerInfo(), LN->getBaseAlign(),
60672 LN->getMemOperand()->getFlags());
60673 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
60674 return NewLd;
60675 }
60676 }
60677
60678 return SDValue();
60679}
60680
60681static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
60682 TargetLowering::DAGCombinerInfo &DCI) {
60683 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
60684 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
60685 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
60686 return SDValue(N, 0);
60687
60688 return SDValue();
60689}
60690
60691// Fixup the MMX intrinsics' types: in IR they are expressed with <1 x i64>,
60692// and so SelectionDAGBuilder creates them with v1i64 types, but they need to
60693// use x86mmx instead.
60694static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG) {
60695 SDLoc dl(N);
60696
60697 bool MadeChange = false, CastReturnVal = false;
60698 SmallVector<SDValue> Args;
60699 for (const SDValue &Arg : N->op_values()) {
60700 if (Arg.getValueType() == MVT::v1i64) {
60701 MadeChange = true;
60702 Args.push_back(DAG.getBitcast(MVT::x86mmx, Arg));
60703 } else
60704 Args.push_back(Arg);
60705 }
60706 SDVTList VTs = N->getVTList();
60707 SDVTList NewVTs = VTs;
60708 if (VTs.NumVTs > 0 && VTs.VTs[0] == MVT::v1i64) {
60709 SmallVector<EVT> NewVTArr(ArrayRef<EVT>(VTs.VTs, VTs.NumVTs));
60710 NewVTArr[0] = MVT::x86mmx;
60711 NewVTs = DAG.getVTList(NewVTArr);
60712 MadeChange = true;
60713 CastReturnVal = true;
60714 }
60715
60716 if (MadeChange) {
60717 SDValue Result = DAG.getNode(N->getOpcode(), dl, NewVTs, Args);
60718 if (CastReturnVal) {
60719 SmallVector<SDValue> Returns;
60720 for (unsigned i = 0, e = Result->getNumValues(); i != e; ++i)
60721 Returns.push_back(Result.getValue(i));
60722 Returns[0] = DAG.getBitcast(MVT::v1i64, Returns[0]);
60723 return DAG.getMergeValues(Returns, dl);
60724 }
60725 return Result;
60726 }
60727 return SDValue();
60728}
60729static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG,
60730 TargetLowering::DAGCombinerInfo &DCI) {
60731 if (!DCI.isBeforeLegalize())
60732 return SDValue();
60733
60734 unsigned IntNo = N->getConstantOperandVal(0);
60735 const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
60736
60737 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60738 return FixupMMXIntrinsicTypes(N, DAG);
60739
60740 return SDValue();
60741}
60742
60743static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
60744 TargetLowering::DAGCombinerInfo &DCI) {
60745 if (!DCI.isBeforeLegalize())
60746 return SDValue();
60747
60748 unsigned IntNo = N->getConstantOperandVal(1);
60749 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60750
60751 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60752 return FixupMMXIntrinsicTypes(N, DAG);
60753
60754 return SDValue();
60755}
60756
60757static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
60758 TargetLowering::DAGCombinerInfo &DCI) {
60759 if (!DCI.isBeforeLegalize())
60760 return SDValue();
60761
60762 unsigned IntNo = N->getConstantOperandVal(1);
60763 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
60764
60765 if (IntrData && IntrData->Type == INTR_TYPE_CAST_MMX)
60766 return FixupMMXIntrinsicTypes(N, DAG);
60767
60768 return SDValue();
60769}
60770
60771SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
60772 DAGCombinerInfo &DCI) const {
60773 SelectionDAG &DAG = DCI.DAG;
60774 switch (N->getOpcode()) {
60775 // clang-format off
60776 default: break;
60777 case ISD::SCALAR_TO_VECTOR:
60778 return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
60779 case ISD::EXTRACT_VECTOR_ELT:
60780 case X86ISD::PEXTRW:
60781 case X86ISD::PEXTRB:
60782 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
60783 case ISD::CONCAT_VECTORS:
60784 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
60785 case ISD::INSERT_SUBVECTOR:
60786 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
60787 case ISD::EXTRACT_SUBVECTOR:
60788 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
60789 case ISD::VSELECT:
60790 case ISD::SELECT:
60791 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
60792 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
60793 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
60794 case X86ISD::CMP: return combineCMP(N, DAG, DCI, Subtarget);
60795 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
60796 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
60797 case X86ISD::ADD:
60798 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
60799 case X86ISD::CLOAD:
60800 case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
60801 case X86ISD::SBB: return combineSBB(N, DAG);
60802 case X86ISD::ADC: return combineADC(N, DAG, DCI);
60803 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
60804 case ISD::SHL: return combineShiftLeft(N, DAG, Subtarget);
60805 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
60806 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
60807 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
60808 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
60809 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
60810 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
60811 case ISD::AVGCEILS:
60812 case ISD::AVGCEILU:
60813 case ISD::AVGFLOORS:
60814 case ISD::AVGFLOORU: return combineAVG(N, DAG, DCI, Subtarget);
60815 case X86ISD::BEXTR:
60816 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
60817 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
60818 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
60819 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
60820 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
60821 case X86ISD::VEXTRACT_STORE:
60822 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
60823 case ISD::SINT_TO_FP:
60824 case ISD::STRICT_SINT_TO_FP:
60825 return combineSIntToFP(N, DAG, DCI, Subtarget);
60826 case ISD::UINT_TO_FP:
60827 case ISD::STRICT_UINT_TO_FP:
60828 return combineUIntToFP(N, DAG, Subtarget);
60829 case ISD::FP_TO_SINT: return combineFPToSInt(N, DAG, Subtarget);
60830 case ISD::LRINT:
60831 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
60832 case ISD::FADD:
60833 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
60834 case X86ISD::VFCMULC:
60835 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
60836 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
60837 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
60838 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
60839 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
60840 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
60841 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
60842 case X86ISD::FXOR:
60843 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
60844 case X86ISD::FMIN:
60845 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
60846 case ISD::FMINNUM:
60847 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
60848 case X86ISD::CVTSI2P:
60849 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
60850 case X86ISD::CVTP2SI:
60851 case X86ISD::CVTP2UI:
60852 case X86ISD::STRICT_CVTTP2SI:
60853 case X86ISD::CVTTP2SI:
60854 case X86ISD::STRICT_CVTTP2UI:
60855 case X86ISD::CVTTP2UI:
60856 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
60857 case X86ISD::STRICT_CVTPH2PS:
60858 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
60859 case X86ISD::BT: return combineBT(N, DAG, DCI);
60860 case ISD::ANY_EXTEND:
60861 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
60862 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
60863 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
60864 case ISD::ANY_EXTEND_VECTOR_INREG:
60865 case ISD::SIGN_EXTEND_VECTOR_INREG:
60866 case ISD::ZERO_EXTEND_VECTOR_INREG:
60867 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
60868 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
60869 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
60870 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
60871 case X86ISD::PACKSS:
60872 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
60873 case X86ISD::HADD:
60874 case X86ISD::HSUB:
60875 case X86ISD::FHADD:
60876 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
60877 case X86ISD::VSHL:
60878 case X86ISD::VSRA:
60879 case X86ISD::VSRL:
60880 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
60881 case X86ISD::VSHLI:
60882 case X86ISD::VSRAI:
60883 case X86ISD::VSRLI:
60884 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
60885 case ISD::INSERT_VECTOR_ELT:
60886 case X86ISD::PINSRB:
60887 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
60888 case X86ISD::SHUFP: // Handle all target specific shuffles
60889 case X86ISD::INSERTPS:
60890 case X86ISD::EXTRQI:
60891 case X86ISD::INSERTQI:
60892 case X86ISD::VALIGN:
60893 case X86ISD::PALIGNR:
60894 case X86ISD::VSHLDQ:
60895 case X86ISD::VSRLDQ:
60896 case X86ISD::BLENDI:
60897 case X86ISD::UNPCKH:
60898 case X86ISD::UNPCKL:
60899 case X86ISD::MOVHLPS:
60900 case X86ISD::MOVLHPS:
60901 case X86ISD::PSHUFB:
60902 case X86ISD::PSHUFD:
60903 case X86ISD::PSHUFHW:
60904 case X86ISD::PSHUFLW:
60905 case X86ISD::MOVSHDUP:
60906 case X86ISD::MOVSLDUP:
60907 case X86ISD::MOVDDUP:
60908 case X86ISD::MOVSS:
60909 case X86ISD::MOVSD:
60910 case X86ISD::MOVSH:
60911 case X86ISD::VBROADCAST:
60912 case X86ISD::VPPERM:
60913 case X86ISD::VPERMI:
60914 case X86ISD::VPERMV:
60915 case X86ISD::VPERMV3:
60916 case X86ISD::VPERMIL2:
60917 case X86ISD::VPERMILPI:
60918 case X86ISD::VPERMILPV:
60919 case X86ISD::VPERM2X128:
60920 case X86ISD::SHUF128:
60921 case X86ISD::VZEXT_MOVL:
60922 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
60923 case X86ISD::FMADD_RND:
60924 case X86ISD::FMSUB:
60925 case X86ISD::STRICT_FMSUB:
60926 case X86ISD::FMSUB_RND:
60927 case X86ISD::FNMADD:
60928 case X86ISD::STRICT_FNMADD:
60929 case X86ISD::FNMADD_RND:
60930 case X86ISD::FNMSUB:
60931 case X86ISD::STRICT_FNMSUB:
60932 case X86ISD::FNMSUB_RND:
60933 case ISD::FMA:
60934 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
60935 case X86ISD::FMADDSUB_RND:
60936 case X86ISD::FMSUBADD_RND:
60937 case X86ISD::FMADDSUB:
60938 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
60939 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
60940 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
60941 case X86ISD::MGATHER:
60942 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
60943 case ISD::MGATHER:
60944 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
60945 case X86ISD::PCMPEQ:
60946 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
60947 case X86ISD::PMULDQ:
60948 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
60949 case X86ISD::VPMADDUBSW:
60950 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
60951 case X86ISD::VPMADD52L:
60952 case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
60953 case X86ISD::KSHIFTL:
60954 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
60955 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
60956 case ISD::STRICT_FP_EXTEND:
60957 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
60958 case ISD::STRICT_FP_ROUND:
60959 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
60960 case X86ISD::VBROADCAST_LOAD:
60961 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
60962 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
60963 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
60964 case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
60965 case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
60966 case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
60967 case ISD::FP_TO_SINT_SAT:
60968 case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60969 // clang-format on
60970 }
60971
60972 return SDValue();
60973}
60974
60976 return Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64);
60977}
60978
60979// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
60981 EVT ExtVT) const {
60982 return Subtarget.hasAVX512() || !VT.isVector();
60983}
60984
60985bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
60986 if (!isTypeLegal(VT))
60987 return false;
60988
60989 // There are no vXi8 shifts.
60990 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
60991 return false;
60992
60993 // TODO: Almost no 8-bit ops are desirable because they have no actual
60994 // size/speed advantages vs. 32-bit ops, but they do have a major
60995 // potential disadvantage by causing partial register stalls.
60996 //
60997 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
60998 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
60999 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
61000 // check for a constant operand to the multiply.
61001 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
61002 return false;
61003
61004 // i16 instruction encodings are longer and some i16 instructions are slow,
61005 // so those are not desirable.
61006 if (VT == MVT::i16) {
61007 switch (Opc) {
61008 default:
61009 break;
61010 case ISD::LOAD:
61011 case ISD::SIGN_EXTEND:
61012 case ISD::ZERO_EXTEND:
61013 case ISD::ANY_EXTEND:
61014 case ISD::MUL:
61015 return false;
61016 case ISD::SHL:
61017 case ISD::SRA:
61018 case ISD::SRL:
61019 case ISD::SUB:
61020 case ISD::ADD:
61021 case ISD::AND:
61022 case ISD::OR:
61023 case ISD::XOR:
61024 // NDD instructions never have the "partial register write" issue because the
61025 // destination register's upper bits [63:OSIZE] are zeroed even when
61026 // OSIZE = 8/16.
61027 return Subtarget.hasNDD();
61028 }
61029 }
61030
61031 // Any legal type not explicitly accounted for above here is desirable.
61032 return true;
61033}
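// Illustrative sketch, not part of the upstream source: because the checks
// above reject most i8/i16 arithmetic, narrow operations are normally widened
// to 32 bits. For a hypothetical C function
//
//   short add3(short x) { return x + 3; }
//
// typical x86-64 output is a 32-bit LEA rather than a 16-bit add, e.g.
//
//   leal 3(%rdi), %eax     # no 66H operand-size prefix, no partial-reg stall
//
// The exact code depends on the compiler version and subtarget.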
61034
61036 SDValue Value, SDValue Addr,
61037 int JTI,
61038 SelectionDAG &DAG) const {
61039 const Module *M = DAG.getMachineFunction().getFunction().getParent();
61040 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
61041 if (IsCFProtectionSupported) {
61042 // If control-flow branch protection is enabled, we need to add a
61043 // notrack prefix to the indirect branch.
61044 // To do that we create an NT_BRIND SDNode.
61045 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
61046 SDValue Chain = Value;
61047 // Jump table debug info is only needed if CodeView is enabled.
61049 Chain = DAG.getJumpTableDebugInfo(JTI, Chain, dl);
61050 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Chain, Addr);
61051 }
61052
61053 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
61054}
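// Illustrative sketch, not part of the upstream source: when a module is
// compiled with -fcf-protection=branch, the "cf-protection-branch" module
// flag tested above is present and a jump-table dispatch is emitted with a
// NOTRACK indirect jump, roughly:
//
//   movq  .LJTI0_0(,%rcx,8), %rax
//   notrack jmpq *%rax        # NT_BRIND; exempt from ENDBR enforcement
//
// Label and register choices here are illustrative.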
61055
61058 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
61060 EVT VT = LogicOp->getValueType(0);
61061 EVT OpVT = SETCC0->getOperand(0).getValueType();
61062 if (!VT.isInteger())
61064
61065 if (VT.isVector())
61070
61071 // Don't use `NotAnd` as even though `not` is generally shorter code size than
61072 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
61073 // `NotAnd` applies, `AddAnd` does as well.
61074 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
61075 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
61077}
61078
61080 EVT VT = Op.getValueType();
61081 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
61082 isa<ConstantSDNode>(Op.getOperand(1));
61083
61084 // i16 is legal, but undesirable since i16 instruction encodings are longer
61085 // and some i16 instructions are slow.
61086 // 8-bit multiply-by-constant can usually be expanded to something cheaper
61087 // using LEA and/or other ALU ops.
61088 if (VT != MVT::i16 && !Is8BitMulByConstant)
61089 return false;
61090
61091 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
61092 if (!Op.hasOneUse())
61093 return false;
61094 SDNode *User = *Op->user_begin();
61096 return false;
61097 auto *Ld = cast<LoadSDNode>(Load);
61098 auto *St = cast<StoreSDNode>(User);
61099 return Ld->getBasePtr() == St->getBasePtr();
61100 };
61101
61102 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
61103 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
61104 return false;
61105 if (!Op.hasOneUse())
61106 return false;
61107 SDNode *User = *Op->user_begin();
61108 if (User->getOpcode() != ISD::ATOMIC_STORE)
61109 return false;
61110 auto *Ld = cast<AtomicSDNode>(Load);
61111 auto *St = cast<AtomicSDNode>(User);
61112 return Ld->getBasePtr() == St->getBasePtr();
61113 };
61114
61115 auto IsFoldableZext = [](SDValue Op) {
61116 if (!Op.hasOneUse())
61117 return false;
61118 SDNode *User = *Op->user_begin();
61119 EVT VT = User->getValueType(0);
61120 return (User->getOpcode() == ISD::ZERO_EXTEND &&
61121 (VT == MVT::i32 || VT == MVT::i64));
61122 };
61123
61124 bool Commute = false;
61125 switch (Op.getOpcode()) {
61126 default: return false;
61127 case ISD::SIGN_EXTEND:
61128 case ISD::ZERO_EXTEND:
61129 case ISD::ANY_EXTEND:
61130 break;
61131 case ISD::SHL:
61132 case ISD::SRA:
61133 case ISD::SRL: {
61134 SDValue N0 = Op.getOperand(0);
61135 // Look out for (store (shl (load), x)).
61136 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
61137 return false;
61138 break;
61139 }
61140 case ISD::MUL:
61141 // When ZU is enabled, we prefer not to promote a MUL by a constant
61142 // when there is an opportunity to fold a zext with imulzu.
61143 if (Subtarget.hasZU() && IsFoldableZext(Op) &&
61144 (isa<ConstantSDNode>(Op.getOperand(0)) ||
61145 isa<ConstantSDNode>(Op.getOperand(1))))
61146 return false;
61147 [[fallthrough]];
61148 case ISD::ADD:
61149 case ISD::AND:
61150 case ISD::OR:
61151 case ISD::XOR:
61152 Commute = true;
61153 [[fallthrough]];
61154 case ISD::SUB: {
61155 SDValue N0 = Op.getOperand(0);
61156 SDValue N1 = Op.getOperand(1);
61157 // Avoid disabling potential load folding opportunities.
61158 if (X86::mayFoldLoad(N1, Subtarget) &&
61159 (!Commute || !isa<ConstantSDNode>(N0) ||
61160 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
61161 return false;
61162 if (X86::mayFoldLoad(N0, Subtarget) &&
61163 ((Commute && !isa<ConstantSDNode>(N1)) ||
61164 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
61165 return false;
61166 if (IsFoldableAtomicRMW(N0, Op) ||
61167 (Commute && IsFoldableAtomicRMW(N1, Op)))
61168 return false;
61169 }
61170 }
61171
61172 PVT = MVT::i32;
61173 return true;
61174}
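// Illustrative sketch, not part of the upstream source: when this hook
// returns true with PVT = MVT::i32, the DAG combiner rewrites, e.g.,
//
//   (i16 add x, y) -> (i16 trunc (i32 add (any_extend x), (any_extend y)))
//
// so the arithmetic itself runs on 32-bit registers. It deliberately returns
// false for patterns such as (store (add (load p), c), p), where promotion
// would block folding the whole thing into one read-modify-write instruction
// like "addw $5, (%rdi)". Operand names and constants here are illustrative.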
61175
61176//===----------------------------------------------------------------------===//
61177// X86 Inline Assembly Support
61178//===----------------------------------------------------------------------===//
61179
61182 .Case("{@cca}", X86::COND_A)
61183 .Case("{@ccae}", X86::COND_AE)
61184 .Case("{@ccb}", X86::COND_B)
61185 .Case("{@ccbe}", X86::COND_BE)
61186 .Case("{@ccc}", X86::COND_B)
61187 .Case("{@cce}", X86::COND_E)
61188 .Case("{@ccz}", X86::COND_E)
61189 .Case("{@ccg}", X86::COND_G)
61190 .Case("{@ccge}", X86::COND_GE)
61191 .Case("{@ccl}", X86::COND_L)
61192 .Case("{@ccle}", X86::COND_LE)
61193 .Case("{@ccna}", X86::COND_BE)
61194 .Case("{@ccnae}", X86::COND_B)
61195 .Case("{@ccnb}", X86::COND_AE)
61196 .Case("{@ccnbe}", X86::COND_A)
61197 .Case("{@ccnc}", X86::COND_AE)
61198 .Case("{@ccne}", X86::COND_NE)
61199 .Case("{@ccnz}", X86::COND_NE)
61200 .Case("{@ccng}", X86::COND_LE)
61201 .Case("{@ccnge}", X86::COND_L)
61202 .Case("{@ccnl}", X86::COND_GE)
61203 .Case("{@ccnle}", X86::COND_G)
61204 .Case("{@ccno}", X86::COND_NO)
61205 .Case("{@ccnp}", X86::COND_NP)
61206 .Case("{@ccns}", X86::COND_NS)
61207 .Case("{@cco}", X86::COND_O)
61208 .Case("{@ccp}", X86::COND_P)
61209 .Case("{@ccs}", X86::COND_S)
61211 return Cond;
61212}
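// Illustrative sketch, not part of the upstream source: the "{@cc<cond>}"
// strings parsed above come from GCC-style flag-output constraints in inline
// assembly. A hypothetical use from C++ (GCC/Clang extended asm):
static inline bool isBelowExample(unsigned A, unsigned B) {
  bool Below;
  asm("cmpl %2, %1" : "=@ccb"(Below) : "r"(A), "r"(B));
  return Below; // Materialized from EFLAGS (SETB) by the lowering below.
}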
61213
61214/// Given a constraint letter, return the type of constraint for this target.
61217 if (Constraint.size() == 1) {
61218 switch (Constraint[0]) {
61219 case 'R':
61220 case 'q':
61221 case 'Q':
61222 case 'f':
61223 case 't':
61224 case 'u':
61225 case 'y':
61226 case 'x':
61227 case 'v':
61228 case 'l':
61229 case 'k': // AVX512 masking registers.
61230 return C_RegisterClass;
61231 case 'a':
61232 case 'b':
61233 case 'c':
61234 case 'd':
61235 case 'S':
61236 case 'D':
61237 case 'A':
61238 return C_Register;
61239 case 'I':
61240 case 'J':
61241 case 'K':
61242 case 'N':
61243 case 'G':
61244 case 'L':
61245 case 'M':
61246 return C_Immediate;
61247 case 'C':
61248 case 'e':
61249 case 'Z':
61250 return C_Other;
61251 default:
61252 break;
61253 }
61254 }
61255 else if (Constraint.size() == 2) {
61256 switch (Constraint[0]) {
61257 default:
61258 break;
61259 case 'W':
61260 if (Constraint[1] != 's')
61261 break;
61262 return C_Other;
61263 case 'Y':
61264 switch (Constraint[1]) {
61265 default:
61266 break;
61267 case 'z':
61268 return C_Register;
61269 case 'i':
61270 case 'm':
61271 case 'k':
61272 case 't':
61273 case '2':
61274 return C_RegisterClass;
61275 }
61276 break;
61277 case 'j':
61278 switch (Constraint[1]) {
61279 default:
61280 break;
61281 case 'r':
61282 case 'R':
61283 return C_RegisterClass;
61284 }
61285 }
61286 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
61287 return C_Other;
61288 return TargetLowering::getConstraintType(Constraint);
61289}
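// Illustrative sketch, not part of the upstream source: a few of the
// constraint letters classified above, as they might appear in extended asm
// (function name and values are hypothetical):
static inline unsigned constraintLetterExamples(unsigned X, unsigned Y) {
  asm("addl %1, %0" : "+r"(X) : "I"(31));               // 'I': immediate 0..31.
  unsigned Lo, Hi;
  asm("mull %2" : "=a"(Lo), "=d"(Hi) : "r"(Y), "a"(X)); // 'a'/'d': fixed EAX/EDX.
  return Lo ^ Hi;
}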
61290
61291/// Examine constraint type and operand type and determine a weight value.
61292/// This object must already have been set up with the operand type
61293/// and the current alternative constraint selected.
61296 AsmOperandInfo &Info, const char *Constraint) const {
61298 Value *CallOperandVal = Info.CallOperandVal;
61299 // If we don't have a value, we can't do a match,
61300 // but allow it at the lowest weight.
61301 if (!CallOperandVal)
61302 return CW_Default;
61303 Type *Ty = CallOperandVal->getType();
61304 // Look at the constraint type.
61305 switch (*Constraint) {
61306 default:
61308 [[fallthrough]];
61309 case 'R':
61310 case 'q':
61311 case 'Q':
61312 case 'a':
61313 case 'b':
61314 case 'c':
61315 case 'd':
61316 case 'S':
61317 case 'D':
61318 case 'A':
61319 if (CallOperandVal->getType()->isIntegerTy())
61320 Wt = CW_SpecificReg;
61321 break;
61322 case 'f':
61323 case 't':
61324 case 'u':
61325 if (Ty->isFloatingPointTy())
61326 Wt = CW_SpecificReg;
61327 break;
61328 case 'y':
61329 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61330 Wt = CW_SpecificReg;
61331 break;
61332 case 'Y':
61333 if (StringRef(Constraint).size() != 2)
61334 break;
61335 switch (Constraint[1]) {
61336 default:
61337 return CW_Invalid;
61338 // XMM0
61339 case 'z':
61340 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61341 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
61342 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
61343 return CW_SpecificReg;
61344 return CW_Invalid;
61345 // Conditional OpMask regs (AVX512)
61346 case 'k':
61347 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61348 return CW_Register;
61349 return CW_Invalid;
61350 // Any MMX reg
61351 case 'm':
61352 if (Ty->getPrimitiveSizeInBits() == 64 && Subtarget.hasMMX())
61353 return CW_SpecificReg;
61354 return CW_Invalid;
61355 // Any SSE reg when ISA >= SSE2, same as 'x'
61356 case 'i':
61357 case 't':
61358 case '2':
61359 if (!Subtarget.hasSSE2())
61360 return CW_Invalid;
61361 break;
61362 }
61363 break;
61364 case 'j':
61365 if (StringRef(Constraint).size() != 2)
61366 break;
61367 switch (Constraint[1]) {
61368 default:
61369 return CW_Invalid;
61370 case 'r':
61371 case 'R':
61372 if (CallOperandVal->getType()->isIntegerTy())
61373 Wt = CW_SpecificReg;
61374 break;
61375 }
61376 break;
61377 case 'v':
61378 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
61379 Wt = CW_Register;
61380 [[fallthrough]];
61381 case 'x':
61382 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
61383 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
61384 Wt = CW_Register;
61385 break;
61386 case 'k':
61387 // Enable conditional vector operations using %k<#> registers.
61388 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
61389 Wt = CW_Register;
61390 break;
61391 case 'I':
61392 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
61393 if (C->getZExtValue() <= 31)
61394 Wt = CW_Constant;
61395 break;
61396 case 'J':
61397 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61398 if (C->getZExtValue() <= 63)
61399 Wt = CW_Constant;
61400 break;
61401 case 'K':
61402 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61403 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
61404 Wt = CW_Constant;
61405 break;
61406 case 'L':
61407 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61408 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
61409 Wt = CW_Constant;
61410 break;
61411 case 'M':
61412 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61413 if (C->getZExtValue() <= 3)
61414 Wt = CW_Constant;
61415 break;
61416 case 'N':
61417 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61418 if (C->getZExtValue() <= 0xff)
61419 Wt = CW_Constant;
61420 break;
61421 case 'G':
61422 case 'C':
61423 if (isa<ConstantFP>(CallOperandVal))
61424 Wt = CW_Constant;
61425 break;
61426 case 'e':
61427 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61428 if ((C->getSExtValue() >= -0x80000000LL) &&
61429 (C->getSExtValue() <= 0x7fffffffLL))
61430 Wt = CW_Constant;
61431 break;
61432 case 'Z':
61433 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
61434 if (C->getZExtValue() <= 0xffffffff)
61435 Wt = CW_Constant;
61436 break;
61437 }
61438 return Wt;
61439}
61440
61441/// Try to replace an X constraint, which matches anything, with another that
61442/// has more specific requirements based on the type of the corresponding
61443/// operand.
61445LowerXConstraint(EVT ConstraintVT) const {
61446 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
61447 // 'f' like normal targets.
61448 if (ConstraintVT.isFloatingPoint()) {
61449 if (Subtarget.hasSSE1())
61450 return "x";
61451 }
61452
61453 return TargetLowering::LowerXConstraint(ConstraintVT);
61454}
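// Illustrative sketch, not part of the upstream source: with the catch-all
// "X" constraint, a floating-point operand is typically placed in an SSE
// register once SSE1/2 is available, as if "x" had been written
// (function name is hypothetical):
static inline double xConstraintExample(double D) {
  asm("" : "+X"(D)); // Usually allocated to an XMM register on SSE targets.
  return D;
}
// Without SSE this falls back to the generic handling ('f', the x87 stack).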
61455
61456// Lower @cc targets via setcc.
61458 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
61459 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
61460 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
61461 if (Cond == X86::COND_INVALID)
61462 return SDValue();
61463 // Check that return type is valid.
61464 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
61465 OpInfo.ConstraintVT.getSizeInBits() < 8)
61466 report_fatal_error("Glue output operand is of invalid type");
61467
61468 // Get EFLAGS register. Only update chain when copyfrom is glued.
61469 if (Glue.getNode()) {
61470 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
61471 Chain = Glue.getValue(1);
61472 } else
61473 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
61474 // Extract CC code.
61475 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
61476 // Zero-extend to the constraint type.
61477 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
61478
61479 return Result;
61480}
61481
61482/// Lower the specified operand into the Ops vector.
61483/// If it is invalid, don't add anything to Ops.
61485 StringRef Constraint,
61486 std::vector<SDValue> &Ops,
61487 SelectionDAG &DAG) const {
61488 SDValue Result;
61489 char ConstraintLetter = Constraint[0];
61490 switch (ConstraintLetter) {
61491 default: break;
61492 case 'I':
61493 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61494 if (C->getZExtValue() <= 31) {
61495 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61496 Op.getValueType());
61497 break;
61498 }
61499 }
61500 return;
61501 case 'J':
61502 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61503 if (C->getZExtValue() <= 63) {
61504 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61505 Op.getValueType());
61506 break;
61507 }
61508 }
61509 return;
61510 case 'K':
61511 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61512 if (isInt<8>(C->getSExtValue())) {
61513 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61514 Op.getValueType());
61515 break;
61516 }
61517 }
61518 return;
61519 case 'L':
61520 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61521 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
61522 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
61523 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
61524 Op.getValueType());
61525 break;
61526 }
61527 }
61528 return;
61529 case 'M':
61530 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61531 if (C->getZExtValue() <= 3) {
61532 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61533 Op.getValueType());
61534 break;
61535 }
61536 }
61537 return;
61538 case 'N':
61539 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61540 if (C->getZExtValue() <= 255) {
61541 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61542 Op.getValueType());
61543 break;
61544 }
61545 }
61546 return;
61547 case 'O':
61548 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61549 if (C->getZExtValue() <= 127) {
61550 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61551 Op.getValueType());
61552 break;
61553 }
61554 }
61555 return;
61556 case 'e': {
61557 // 32-bit signed value
61558 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61560 C->getSExtValue())) {
61561 // Widen to 64 bits here to get it sign extended.
61562 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
61563 break;
61564 }
61565 // FIXME gcc accepts some relocatable values here too, but only in certain
61566 // memory models; it's complicated.
61567 }
61568 return;
61569 }
61570 case 'W': {
61571 assert(Constraint[1] == 's');
61572 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
61573 // offset.
61574 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
61575 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
61576 BA->getValueType(0)));
61577 } else {
61578 int64_t Offset = 0;
61579 if (Op->getOpcode() == ISD::ADD &&
61580 isa<ConstantSDNode>(Op->getOperand(1))) {
61581 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
61582 Op = Op->getOperand(0);
61583 }
61584 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61585 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
61586 GA->getValueType(0), Offset));
61587 }
61588 return;
61589 }
61590 case 'Z': {
61591 // 32-bit unsigned value
61592 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
61594 C->getZExtValue())) {
61595 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
61596 Op.getValueType());
61597 break;
61598 }
61599 }
61600 // FIXME gcc accepts some relocatable values here too, but only in certain
61601 // memory models; it's complicated.
61602 return;
61603 }
61604 case 'i': {
61605 // Literal immediates are always ok.
61606 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
61607 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
61608 BooleanContent BCont = getBooleanContents(MVT::i64);
61609 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
61611 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
61612 : CST->getSExtValue();
61613 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
61614 break;
61615 }
61616
61617 // In any sort of PIC mode addresses need to be computed at runtime by
61618 // adding in a register or some sort of table lookup. These can't
61619 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
61620 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
61622 return;
61623
61624 // If we are in non-pic codegen mode, we allow the address of a global (with
61625 // an optional displacement) to be used with 'i'.
61626 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
61627 // If we require an extra load to get this address, as in PIC mode, we
61628 // can't accept it.
61630 Subtarget.classifyGlobalReference(GA->getGlobal())))
61631 return;
61632 break;
61633 }
61634 }
61635
61636 if (Result.getNode()) {
61637 Ops.push_back(Result);
61638 return;
61639 }
61640 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
61641}
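// Illustrative sketch, not part of the upstream source: the immediate
// constraints validated above, as they might be used from C/C++ (names and
// values are hypothetical). Out-of-range constants are rejected here
// (nothing is pushed into Ops) and the front end reports a constraint error.
static inline long immediateConstraintExamples(long L, unsigned X) {
  asm("shll %1, %0" : "+r"(X) : "I"(3));          // 'I': shift count 0..31.
  asm("andl %1, %0" : "+r"(X) : "N"(0xff));       // 'N': unsigned 8-bit immediate.
  asm("addq %1, %0" : "+r"(L) : "e"(-123456789)); // 'e': signed 32-bit immediate.
  return L + X;
}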
61642
61643/// Check if \p RC is a general purpose register class.
61644 /// I.e., GR* or one of their variants.
61645static bool isGRClass(const TargetRegisterClass &RC) {
61646 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
61647 RC.hasSuperClassEq(&X86::GR16RegClass) ||
61648 RC.hasSuperClassEq(&X86::GR32RegClass) ||
61649 RC.hasSuperClassEq(&X86::GR64RegClass) ||
61650 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
61651}
61652
61653/// Check if \p RC is a vector register class.
61654 /// I.e., FR* / VR* or one of their variants.
61655static bool isFRClass(const TargetRegisterClass &RC) {
61656 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
61657 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
61658 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
61659 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
61660 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
61661 RC.hasSuperClassEq(&X86::VR512RegClass);
61662}
61663
61664/// Check if \p RC is a mask register class.
61665 /// I.e., VK* or one of their variants.
61666static bool isVKClass(const TargetRegisterClass &RC) {
61667 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
61668 RC.hasSuperClassEq(&X86::VK2RegClass) ||
61669 RC.hasSuperClassEq(&X86::VK4RegClass) ||
61670 RC.hasSuperClassEq(&X86::VK8RegClass) ||
61671 RC.hasSuperClassEq(&X86::VK16RegClass) ||
61672 RC.hasSuperClassEq(&X86::VK32RegClass) ||
61673 RC.hasSuperClassEq(&X86::VK64RegClass);
61674}
61675
61676static bool useEGPRInlineAsm(const X86Subtarget &Subtarget) {
61677 return Subtarget.hasEGPR() && Subtarget.useInlineAsmGPR32();
61678}
61679
61680std::pair<unsigned, const TargetRegisterClass *>
61682 StringRef Constraint,
61683 MVT VT) const {
61684 // First, see if this is a constraint that directly corresponds to an LLVM
61685 // register class.
61686 if (Constraint.size() == 1) {
61687 // GCC Constraint Letters
61688 switch (Constraint[0]) {
61689 default: break;
61690 // 'A' means [ER]AX + [ER]DX.
61691 case 'A':
61692 if (Subtarget.is64Bit())
61693 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
61694 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
61695 "Expecting 64, 32 or 16 bit subtarget");
61696 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
61697
61698 // TODO: Slight differences here in allocation order and leaving
61699 // RIP in the class. Do they matter any more here than they do
61700 // in the normal allocation?
61701 case 'k':
61702 if (Subtarget.hasAVX512()) {
61703 if (VT == MVT::v1i1 || VT == MVT::i1)
61704 return std::make_pair(0U, &X86::VK1RegClass);
61705 if (VT == MVT::v8i1 || VT == MVT::i8)
61706 return std::make_pair(0U, &X86::VK8RegClass);
61707 if (VT == MVT::v16i1 || VT == MVT::i16)
61708 return std::make_pair(0U, &X86::VK16RegClass);
61709 }
61710 if (Subtarget.hasBWI()) {
61711 if (VT == MVT::v32i1 || VT == MVT::i32)
61712 return std::make_pair(0U, &X86::VK32RegClass);
61713 if (VT == MVT::v64i1 || VT == MVT::i64)
61714 return std::make_pair(0U, &X86::VK64RegClass);
61715 }
61716 break;
61717 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
61718 if (Subtarget.is64Bit()) {
61719 if (VT == MVT::i8 || VT == MVT::i1)
61720 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61721 ? &X86::GR8RegClass
61722 : &X86::GR8_NOREX2RegClass);
61723 if (VT == MVT::i16)
61724 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61725 ? &X86::GR16RegClass
61726 : &X86::GR16_NOREX2RegClass);
61727 if (VT == MVT::i32 || VT == MVT::f32)
61728 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61729 ? &X86::GR32RegClass
61730 : &X86::GR32_NOREX2RegClass);
61731 if (VT != MVT::f80 && !VT.isVector())
61732 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61733 ? &X86::GR64RegClass
61734 : &X86::GR64_NOREX2RegClass);
61735 break;
61736 }
61737 [[fallthrough]];
61738 // 32-bit fallthrough
61739 case 'Q': // Q_REGS
61740 if (VT == MVT::i8 || VT == MVT::i1)
61741 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
61742 if (VT == MVT::i16)
61743 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
61744 if (VT == MVT::i32 || VT == MVT::f32 ||
61745 (!VT.isVector() && !Subtarget.is64Bit()))
61746 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
61747 if (VT != MVT::f80 && !VT.isVector())
61748 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
61749 break;
61750 case 'r': // GENERAL_REGS
61751 case 'l': // INDEX_REGS
61752 if (VT == MVT::i8 || VT == MVT::i1)
61753 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61754 ? &X86::GR8RegClass
61755 : &X86::GR8_NOREX2RegClass);
61756 if (VT == MVT::i16)
61757 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61758 ? &X86::GR16RegClass
61759 : &X86::GR16_NOREX2RegClass);
61760 if (VT == MVT::i32 || VT == MVT::f32 ||
61761 (!VT.isVector() && !Subtarget.is64Bit()))
61762 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61763 ? &X86::GR32RegClass
61764 : &X86::GR32_NOREX2RegClass);
61765 if (VT != MVT::f80 && !VT.isVector())
61766 return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
61767 ? &X86::GR64RegClass
61768 : &X86::GR64_NOREX2RegClass);
61769 break;
61770 case 'R': // LEGACY_REGS
61771 if (VT == MVT::i8 || VT == MVT::i1)
61772 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
61773 if (VT == MVT::i16)
61774 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
61775 if (VT == MVT::i32 || VT == MVT::f32 ||
61776 (!VT.isVector() && !Subtarget.is64Bit()))
61777 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
61778 if (VT != MVT::f80 && !VT.isVector())
61779 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
61780 break;
61781 case 'f': // FP Stack registers.
61782 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
61783 // value to the correct fpstack register class.
61784 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
61785 return std::make_pair(0U, &X86::RFP32RegClass);
61786 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
61787 return std::make_pair(0U, &X86::RFP64RegClass);
61788 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
61789 return std::make_pair(0U, &X86::RFP80RegClass);
61790 break;
61791 case 'y': // MMX_REGS if MMX allowed.
61792 if (!Subtarget.hasMMX()) break;
61793 return std::make_pair(0U, &X86::VR64RegClass);
61794 case 'v':
61795 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
61796 if (!Subtarget.hasSSE1()) break;
61797 bool VConstraint = (Constraint[0] == 'v');
61798
61799 switch (VT.SimpleTy) {
61800 default: break;
61801 // Scalar SSE types.
61802 case MVT::f16:
61803 if (VConstraint && Subtarget.hasFP16())
61804 return std::make_pair(0U, &X86::FR16XRegClass);
61805 break;
61806 case MVT::f32:
61807 case MVT::i32:
61808 if (VConstraint && Subtarget.hasVLX())
61809 return std::make_pair(0U, &X86::FR32XRegClass);
61810 return std::make_pair(0U, &X86::FR32RegClass);
61811 case MVT::f64:
61812 case MVT::i64:
61813 if (VConstraint && Subtarget.hasVLX())
61814 return std::make_pair(0U, &X86::FR64XRegClass);
61815 return std::make_pair(0U, &X86::FR64RegClass);
61816 case MVT::i128:
61817 if (Subtarget.is64Bit()) {
61818 if (VConstraint && Subtarget.hasVLX())
61819 return std::make_pair(0U, &X86::VR128XRegClass);
61820 return std::make_pair(0U, &X86::VR128RegClass);
61821 }
61822 break;
61823 // Vector types and fp128.
61824 case MVT::v8f16:
61825 if (!Subtarget.hasFP16())
61826 break;
61827 if (VConstraint)
61828 return std::make_pair(0U, &X86::VR128XRegClass);
61829 return std::make_pair(0U, &X86::VR128RegClass);
61830 case MVT::v8bf16:
61831 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61832 break;
61833 if (VConstraint)
61834 return std::make_pair(0U, &X86::VR128XRegClass);
61835 return std::make_pair(0U, &X86::VR128RegClass);
61836 case MVT::f128:
61837 if (!Subtarget.is64Bit())
61838 break;
61839 [[fallthrough]];
61840 case MVT::v16i8:
61841 case MVT::v8i16:
61842 case MVT::v4i32:
61843 case MVT::v2i64:
61844 case MVT::v4f32:
61845 case MVT::v2f64:
61846 if (VConstraint && Subtarget.hasVLX())
61847 return std::make_pair(0U, &X86::VR128XRegClass);
61848 return std::make_pair(0U, &X86::VR128RegClass);
61849 // AVX types.
61850 case MVT::v16f16:
61851 if (!Subtarget.hasFP16())
61852 break;
61853 if (VConstraint)
61854 return std::make_pair(0U, &X86::VR256XRegClass);
61855 return std::make_pair(0U, &X86::VR256RegClass);
61856 case MVT::v16bf16:
61857 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61858 break;
61859 if (VConstraint)
61860 return std::make_pair(0U, &X86::VR256XRegClass);
61861 return std::make_pair(0U, &X86::VR256RegClass);
61862 case MVT::v32i8:
61863 case MVT::v16i16:
61864 case MVT::v8i32:
61865 case MVT::v4i64:
61866 case MVT::v8f32:
61867 case MVT::v4f64:
61868 if (VConstraint && Subtarget.hasVLX())
61869 return std::make_pair(0U, &X86::VR256XRegClass);
61870 if (Subtarget.hasAVX())
61871 return std::make_pair(0U, &X86::VR256RegClass);
61872 break;
61873 case MVT::v32f16:
61874 if (!Subtarget.hasFP16())
61875 break;
61876 if (VConstraint)
61877 return std::make_pair(0U, &X86::VR512RegClass);
61878 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61879 case MVT::v32bf16:
61880 if (!Subtarget.hasBF16())
61881 break;
61882 if (VConstraint)
61883 return std::make_pair(0U, &X86::VR512RegClass);
61884 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61885 case MVT::v64i8:
61886 case MVT::v32i16:
61887 case MVT::v8f64:
61888 case MVT::v16f32:
61889 case MVT::v16i32:
61890 case MVT::v8i64:
61891 if (!Subtarget.hasAVX512()) break;
61892 if (VConstraint)
61893 return std::make_pair(0U, &X86::VR512RegClass);
61894 return std::make_pair(0U, &X86::VR512_0_15RegClass);
61895 }
61896 break;
61897 }
61898 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
61899 switch (Constraint[1]) {
61900 default:
61901 break;
61902 case 'i':
61903 case 't':
61904 case '2':
61905 return getRegForInlineAsmConstraint(TRI, "x", VT);
61906 case 'm':
61907 if (!Subtarget.hasMMX()) break;
61908 return std::make_pair(0U, &X86::VR64RegClass);
61909 case 'z':
61910 if (!Subtarget.hasSSE1()) break;
61911 switch (VT.SimpleTy) {
61912 default: break;
61913 // Scalar SSE types.
61914 case MVT::f16:
61915 if (!Subtarget.hasFP16())
61916 break;
61917 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
61918 case MVT::f32:
61919 case MVT::i32:
61920 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
61921 case MVT::f64:
61922 case MVT::i64:
61923 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
61924 case MVT::v8f16:
61925 if (!Subtarget.hasFP16())
61926 break;
61927 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61928 case MVT::v8bf16:
61929 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61930 break;
61931 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61932 case MVT::f128:
61933 case MVT::v16i8:
61934 case MVT::v8i16:
61935 case MVT::v4i32:
61936 case MVT::v2i64:
61937 case MVT::v4f32:
61938 case MVT::v2f64:
61939 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
61940 // AVX types.
61941 case MVT::v16f16:
61942 if (!Subtarget.hasFP16())
61943 break;
61944 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61945 case MVT::v16bf16:
61946 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
61947 break;
61948 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61949 case MVT::v32i8:
61950 case MVT::v16i16:
61951 case MVT::v8i32:
61952 case MVT::v4i64:
61953 case MVT::v8f32:
61954 case MVT::v4f64:
61955 if (Subtarget.hasAVX())
61956 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
61957 break;
61958 case MVT::v32f16:
61959 if (!Subtarget.hasFP16())
61960 break;
61961 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61962 case MVT::v32bf16:
61963 if (!Subtarget.hasBF16())
61964 break;
61965 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61966 case MVT::v64i8:
61967 case MVT::v32i16:
61968 case MVT::v8f64:
61969 case MVT::v16f32:
61970 case MVT::v16i32:
61971 case MVT::v8i64:
61972 if (Subtarget.hasAVX512())
61973 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
61974 break;
61975 }
61976 break;
61977 case 'k':
61978 // This register class doesn't allocate k0 for masked vector operations.
61979 if (Subtarget.hasAVX512()) {
61980 if (VT == MVT::v1i1 || VT == MVT::i1)
61981 return std::make_pair(0U, &X86::VK1WMRegClass);
61982 if (VT == MVT::v8i1 || VT == MVT::i8)
61983 return std::make_pair(0U, &X86::VK8WMRegClass);
61984 if (VT == MVT::v16i1 || VT == MVT::i16)
61985 return std::make_pair(0U, &X86::VK16WMRegClass);
61986 }
61987 if (Subtarget.hasBWI()) {
61988 if (VT == MVT::v32i1 || VT == MVT::i32)
61989 return std::make_pair(0U, &X86::VK32WMRegClass);
61990 if (VT == MVT::v64i1 || VT == MVT::i64)
61991 return std::make_pair(0U, &X86::VK64WMRegClass);
61992 }
61993 break;
61994 }
61995 } else if (Constraint.size() == 2 && Constraint[0] == 'j') {
61996 switch (Constraint[1]) {
61997 default:
61998 break;
61999 case 'r':
62000 if (VT == MVT::i8 || VT == MVT::i1)
62001 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
62002 if (VT == MVT::i16)
62003 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
62004 if (VT == MVT::i32 || VT == MVT::f32)
62005 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
62006 if (VT != MVT::f80 && !VT.isVector())
62007 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
62008 break;
62009 case 'R':
62010 if (VT == MVT::i8 || VT == MVT::i1)
62011 return std::make_pair(0U, &X86::GR8RegClass);
62012 if (VT == MVT::i16)
62013 return std::make_pair(0U, &X86::GR16RegClass);
62014 if (VT == MVT::i32 || VT == MVT::f32)
62015 return std::make_pair(0U, &X86::GR32RegClass);
62016 if (VT != MVT::f80 && !VT.isVector())
62017 return std::make_pair(0U, &X86::GR64RegClass);
62018 break;
62019 }
62020 }
62021
62022 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
62023 return std::make_pair(0U, &X86::GR32RegClass);
62024
62025 // Use the default implementation in TargetLowering to convert the register
62026 // constraint into a member of a register class.
62027 std::pair<Register, const TargetRegisterClass*> Res;
62029
62030 // Not found as a standard register?
62031 if (!Res.second) {
62032 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
62033 // to/from f80.
62034 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
62035 // Map st(0) .. st(7) to FP0 .. FP7.
62036 if (Constraint.size() == 7 && Constraint[0] == '{' &&
62037 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
62038 Constraint[3] == '(' &&
62039 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
62040 Constraint[5] == ')' && Constraint[6] == '}') {
62041 // st(7) is not allocatable and thus not a member of RFP80. Return a
62042 // singleton class in cases where we have a reference to it.
62043 if (Constraint[4] == '7')
62044 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
62045 return std::make_pair(X86::FP0 + Constraint[4] - '0',
62046 &X86::RFP80RegClass);
62047 }
62048
62049 // GCC allows "st(0)" to be called just plain "st".
62050 if (StringRef("{st}").equals_insensitive(Constraint))
62051 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
62052 }
62053
62054 // flags -> EFLAGS
62055 if (StringRef("{flags}").equals_insensitive(Constraint))
62056 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
62057
62058 // dirflag -> DF
62059 // Only allow for clobber.
62060 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
62061 VT == MVT::Other)
62062 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
62063
62064 // fpsr -> FPSW
62065 // Only allow for clobber.
62066 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
62067 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
62068
62069 return Res;
62070 }
62071
62072 // Make sure it isn't a register that requires 64-bit mode.
62073 if (!Subtarget.is64Bit() &&
62074 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
62075 TRI->getEncodingValue(Res.first) >= 8) {
62076 // Register requires REX prefix, but we're in 32-bit mode.
62077 return std::make_pair(0, nullptr);
62078 }
62079
62080 // Make sure it isn't a register that requires AVX512.
62081 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
62082 TRI->getEncodingValue(Res.first) & 0x10) {
62083 // Register requires EVEX prefix.
62084 return std::make_pair(0, nullptr);
62085 }
62086
62087 // Otherwise, check to see if this is a register class of the wrong value
62088 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
62089 // turn into {ax},{dx}.
62090 // MVT::Other is used to specify clobber names.
62091 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
62092 return Res; // Correct type already, nothing to do.
62093
62094 // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
62095 // return "eax". This should even work for things like getting 64-bit integer
62096 // registers when given an f64 type.
62097 const TargetRegisterClass *Class = Res.second;
62098 // The generic code will match the first register class that contains the
62099 // given register. Thus, based on the ordering of the tablegened file,
62100 // the "plain" GR classes might not come first.
62101 // Therefore, use a helper method.
62102 if (isGRClass(*Class)) {
62103 unsigned Size = VT.getSizeInBits();
62104 if (Size == 1) Size = 8;
62105 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
62106 return std::make_pair(0, nullptr);
62107 Register DestReg = getX86SubSuperRegister(Res.first, Size);
62108 if (DestReg.isValid()) {
62109 bool is64Bit = Subtarget.is64Bit();
62110 const TargetRegisterClass *RC =
62111 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
62112 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
62113 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
62114 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
62115 if (Size == 64 && !is64Bit) {
62116 // Model GCC's behavior here and select a fixed pair of 32-bit
62117 // registers.
62118 switch (DestReg) {
62119 case X86::RAX:
62120 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
62121 case X86::RDX:
62122 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
62123 case X86::RCX:
62124 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
62125 case X86::RBX:
62126 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
62127 case X86::RSI:
62128 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
62129 case X86::RDI:
62130 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
62131 case X86::RBP:
62132 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
62133 default:
62134 return std::make_pair(0, nullptr);
62135 }
62136 }
62137 if (RC && RC->contains(DestReg))
62138 return std::make_pair(DestReg, RC);
62139 return Res;
62140 }
62141 // No register found/type mismatch.
62142 return std::make_pair(0, nullptr);
62143 } else if (isFRClass(*Class)) {
62144 // Handle references to XMM physical registers that got mapped into the
62145 // wrong class. This can happen with constraints like {xmm0} where the
62146 // target independent register mapper will just pick the first match it can
62147 // find, ignoring the required type.
62148
62149 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
62150 if (VT == MVT::f16)
62151 Res.second = &X86::FR16XRegClass;
62152 else if (VT == MVT::f32 || VT == MVT::i32)
62153 Res.second = &X86::FR32XRegClass;
62154 else if (VT == MVT::f64 || VT == MVT::i64)
62155 Res.second = &X86::FR64XRegClass;
62156 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
62157 Res.second = &X86::VR128XRegClass;
62158 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
62159 Res.second = &X86::VR256XRegClass;
62160 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
62161 Res.second = &X86::VR512RegClass;
62162 else {
62163 // Type mismatch and not a clobber: return an error.
62164 Res.first = 0;
62165 Res.second = nullptr;
62166 }
62167 } else if (isVKClass(*Class)) {
62168 if (VT == MVT::v1i1 || VT == MVT::i1)
62169 Res.second = &X86::VK1RegClass;
62170 else if (VT == MVT::v8i1 || VT == MVT::i8)
62171 Res.second = &X86::VK8RegClass;
62172 else if (VT == MVT::v16i1 || VT == MVT::i16)
62173 Res.second = &X86::VK16RegClass;
62174 else if (VT == MVT::v32i1 || VT == MVT::i32)
62175 Res.second = &X86::VK32RegClass;
62176 else if (VT == MVT::v64i1 || VT == MVT::i64)
62177 Res.second = &X86::VK64RegClass;
62178 else {
62179 // Type mismatch and not a clobber: return an error.
62180 Res.first = 0;
62181 Res.second = nullptr;
62182 }
62183 }
62184
62185 return Res;
62186}
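// Illustrative sketch, not part of the upstream source: explicit physical
// register constraints resolved above, written as LLVM IR inline asm (the
// function name is hypothetical):
//
//   define i64 @rdtsc_low() {
//     ; "{ax}" with an i32 result is remapped to EAX by the code above;
//     ; "~{dx}" marks EDX as clobbered.
//     %lo = call i32 asm sideeffect "rdtsc", "={ax},~{dx}"()
//     %r  = zext i32 %lo to i64
//     ret i64 %r
//   }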
62187
62188bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
62189 // Integer division on x86 is expensive. However, when aggressively optimizing
62190 // for code size, we prefer to use a div instruction, as it is usually smaller
62191 // than the alternative sequence.
62192 // The exception to this is vector division. Since x86 doesn't have vector
62193 // integer division, leaving the division as-is is a loss even in terms of
62194 // size, because it will have to be scalarized, while the alternative code
62195 // sequence can be performed in vector form.
62196 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
62197 return OptSize && !VT.isVector();
62198}
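// Illustrative sketch, not part of the upstream source: the effect of the
// hook above on a hypothetical function (Clang's minsize attribute, or -Oz):
__attribute__((minsize)) static inline unsigned div10MinSize(unsigned X) {
  return X / 10; // Kept as a single divl; the usual magic-number multiply
                 // plus shift sequence is larger. Vector divisions are never
                 // considered cheap since they would be scalarized anyway.
}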
62199
62200void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
62201 if (!Subtarget.is64Bit())
62202 return;
62203
62204 // Update IsSplitCSR in X86MachineFunctionInfo.
62206 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
62207 AFI->setIsSplitCSR(true);
62208}
62209
62210void X86TargetLowering::insertCopiesSplitCSR(
62211 MachineBasicBlock *Entry,
62212 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
62213 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
62214 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
62215 if (!IStart)
62216 return;
62217
62218 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
62219 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
62220 MachineBasicBlock::iterator MBBI = Entry->begin();
62221 for (const MCPhysReg *I = IStart; *I; ++I) {
62222 const TargetRegisterClass *RC = nullptr;
62223 if (X86::GR64RegClass.contains(*I))
62224 RC = &X86::GR64RegClass;
62225 else
62226 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
62227
62228 Register NewVR = MRI->createVirtualRegister(RC);
62229 // Create copy from CSR to a virtual register.
62230 // FIXME: this currently does not emit CFI pseudo-instructions; it works
62231 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
62232 // nounwind. If we want to generalize this later, we may need to emit
62233 // CFI pseudo-instructions.
62234 assert(
62235 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
62236 "Function should be nounwind in insertCopiesSplitCSR!");
62237 Entry->addLiveIn(*I);
62238 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
62239 .addReg(*I);
62240
62241 // Insert the copy-back instructions right before the terminator.
62242 for (auto *Exit : Exits)
62243 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
62244 TII->get(TargetOpcode::COPY), *I)
62245 .addReg(NewVR);
62246 }
62247}
62248
62250 return Subtarget.is64Bit();
62251}
62252
62256 const TargetInstrInfo *TII) const {
62257 assert(MBBI->isCall() && MBBI->getCFIType() &&
62258 "Invalid call instruction for a KCFI check");
62259
62260 MachineFunction &MF = *MBB.getParent();
62261 // If the call target is a memory operand, unfold it and use R11 for the
62262 // call, so KCFI_CHECK won't have to recompute the address.
62263 switch (MBBI->getOpcode()) {
62264 case X86::CALL64m:
62265 case X86::CALL64m_NT:
62266 case X86::TAILJMPm64:
62267 case X86::TAILJMPm64_REX: {
62270 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
62271 /*UnfoldStore=*/false, NewMIs))
62272 report_fatal_error("Failed to unfold memory operand for a KCFI check");
62273 for (auto *NewMI : NewMIs)
62274 MBBI = MBB.insert(OrigCall, NewMI);
62275 assert(MBBI->isCall() &&
62276 "Unexpected instruction after memory operand unfolding");
62277 if (OrigCall->shouldUpdateAdditionalCallInfo())
62278 MF.moveAdditionalCallInfo(&*OrigCall, &*MBBI);
62279 MBBI->setCFIType(MF, OrigCall->getCFIType());
62280 OrigCall->eraseFromParent();
62281 break;
62282 }
62283 default:
62284 break;
62285 }
62286
62287 MachineOperand &Target = MBBI->getOperand(0);
62288 Register TargetReg;
62289 switch (MBBI->getOpcode()) {
62290 case X86::CALL64r:
62291 case X86::CALL64r_ImpCall:
62292 case X86::CALL64r_NT:
62293 case X86::TAILJMPr64:
62294 case X86::TAILJMPr64_REX:
62295 assert(Target.isReg() && "Unexpected target operand for an indirect call");
62296 Target.setIsRenamable(false);
62297 TargetReg = Target.getReg();
62298 break;
62299 case X86::CALL64pcrel32:
62300 case X86::TAILJMPd64:
62301 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
62302 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
62303 // 64-bit indirect thunk calls.
62304 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
62305 "Unexpected register for an indirect thunk call");
62306 TargetReg = X86::R11;
62307 break;
62308 default:
62309 llvm_unreachable("Unexpected CFI call opcode");
62310 break;
62311 }
62312
62313 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
62314 .addReg(TargetReg)
62315 .addImm(MBBI->getCFIType())
62316 .getInstr();
62317}
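// Illustrative sketch, not part of the upstream source: with -fsanitize=kcfi
// an indirect call carries a CFI type id, and the KCFI_CHECK built above
// expands to a compare of that id against the hash stored immediately before
// the callee's entry, trapping on mismatch. Roughly (registers, offsets and
// the exact trap sequence are illustrative):
//
//   cmpl $0x12345678, -4(%r11)   # expected type hash vs. hash at callee-4
//   je   1f
//   ud2                          # type mismatch: trap
// 1: callq *%r11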
62318
62319/// Returns true if stack probing through a function call is requested.
62323
62324/// Returns true if stack probing through inline assembly is requested.
62326
62327 // No inline stack probes for Windows; it has its own mechanism.
62328 if (Subtarget.isOSWindows() || Subtarget.isUEFI() ||
62329 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62330 return false;
62331
62332 // If the function specifically requests inline stack probes, emit them.
62333 if (MF.getFunction().hasFnAttribute("probe-stack"))
62334 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
62335 "inline-asm";
62336
62337 return false;
62338}
62339
62340/// Returns the name of the symbol used to emit stack probes or the empty
62341/// string if not applicable.
62344 // Inline stack probes disable the stack probe call.
62345 if (hasInlineStackProbe(MF))
62346 return "";
62347
62348 // If the function specifically requests stack probes, emit them.
62349 if (MF.getFunction().hasFnAttribute("probe-stack"))
62350 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
62351
62352 // Generally, if we aren't on Windows, the platform ABI does not include
62353 // support for stack probes, so don't emit them.
62354 if ((!Subtarget.isOSWindows() && !Subtarget.isUEFI()) ||
62355 Subtarget.isTargetMachO() ||
62356 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
62357 return "";
62358
62359 // We need a stack probe to conform to the Windows ABI. Choose the right
62360 // symbol.
62361 if (Subtarget.is64Bit())
62362 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
62363 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
62364}
62365
62366unsigned
62368 // The default stack probe size is 4096 if the function has no stackprobesize
62369 // attribute.
62370 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
62371 4096);
62372}
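// Illustrative sketch, not part of the upstream source: the two string
// attributes read above, as they appear on an IR function (attribute group
// #0 is hypothetical):
//
//   define void @big_frame() #0 { ... }
//   attributes #0 = { "probe-stack"="inline-asm" "stack-probe-size"="8192" }
//
// "probe-stack"="inline-asm" requests inline probing; any other string names
// the probe symbol to call. Without these attributes, Windows/UEFI targets
// call the default __chkstk-style symbol with a 4096-byte probe interval.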
62373
62375 if (ML && ML->isInnermost() &&
62376 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
62379}
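// Illustrative note, not part of the upstream source: the
// ExperimentalPrefInnermostLoopAlignment override consulted above is
// specified as a log2 byte count, so a value of 6 requests 2^6 = 64-byte
// alignment for innermost loops only; other loops keep the target's usual
// preferred alignment.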
unsigned const MachineRegisterInfo * MRI
#define Success
static SDValue Widen(SelectionDAG *CurDAG, SDValue N)
return SDValue()
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
#define NODE_NAME_CASE(node)
constexpr LLT F64
constexpr LLT S1
AMDGPU Register Bank Select
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
#define EXPAND(Op)
Function Alias Analysis Results
BitTracker BT
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isSigned(unsigned int Opcode)
Hexagon Common GEP
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static int matchShuffleAsBitRotate(ArrayRef< int > Mask, int NumSubElts)
Try to lower a vector shuffle as a bit rotation.
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
static Value * LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctlz of V before the specified instruction IP.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
Live Register Matrix
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 128-bit LoongArch vector shuffles.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, unsigned Depth)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget)
Dispatching routine to lower various 256-bit LoongArch vector shuffles.
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Attempts to match vector shuffle as byte rotation.
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
Attempts to match a shuffle mask against the VBSLL, VBSRL, VSLLI and VSRLI instruction.
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
#define T1
MachineInstr unsigned OpIdx
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
static constexpr MCPhysReg SPReg
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
Try to map an integer comparison with size > XLEN to vector instructions before type legalization spl...
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
Contains matchers for matching SelectionDAG nodes and values.
static bool isSimple(Instruction *I)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
unsigned OpIndex
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines the SmallVector class.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
static bool Enabled
Definition Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
This file describes how to lower LLVM code to machine code.
static const char LUT[]
static llvm::Type * getVectorElementType(llvm::Type *Ty)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static KnownBits computeKnownBitsForHorizontalOperation(const Operator *I, const APInt &DemandedElts, const SimplifyQuery &Q, unsigned Depth, const function_ref< KnownBits(const KnownBits &, const KnownBits &)> KnownBitsFunc)
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static SDValue convertIntLogicToFPLogic(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
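For illustration only (a hedged sketch, not the backend's DAG form), the scalarized reduction shape this helper matches corresponds to source like the following, where one associative binary operator is applied lane by lane to extracted elements; the function name anyLaneSet is hypothetical and the extract intrinsic requires SSE4.1.
#include <immintrin.h>
#include <cstdint>

// Hypothetical example: a scalarized OR-reduction over the four lanes of a
// v4i32 value. The DAG for this is
// or(extractelt(X,0), or(extractelt(X,1), or(extractelt(X,2), extractelt(X,3)))),
// i.e. the BINOP(EXTRACTELT(...)) chain described above.
static uint32_t anyLaneSet(__m128i x) {
  return static_cast<uint32_t>(_mm_extract_epi32(x, 0)) |
         static_cast<uint32_t>(_mm_extract_epi32(x, 1)) |
         static_cast<uint32_t>(_mm_extract_epi32(x, 2)) |
         static_cast<uint32_t>(_mm_extract_epi32(x, 3));
}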
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static bool useEGPRInlineAsm(const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFPToSInt(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LoadGlobalBaseReg=false, bool LocalDynamic=false)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > BrMergingCcmpBias("x86-br-merging-ccmp-bias", cl::init(6), cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target " "supports conditional compare instructions."), cl::Hidden)
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineAndMaskToShift(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static bool isBlendOrUndef(ArrayRef< int > Mask)
Return true if every element in Mask is an in-place blend/select mask or is undef.
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef< SDValue > Ops, ArrayRef< int > Mask, ArrayRef< const SDNode * > SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting a PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
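As a rough sketch of the idea only (the actual expansion uses a bit-pattern/bias trick and has different rounding behaviour in corner cases), an unsigned 64-bit value can be converted through its two 32-bit halves, each of which is exactly representable as a double; u64ToDouble is a hypothetical name.
#include <cstdint>

// Sketch: u64 -> double via the two 32-bit halves: double(hi) * 2^32 + double(lo).
static double u64ToDouble(uint64_t x) {
  double hi = static_cast<double>(static_cast<uint32_t>(x >> 32));
  double lo = static_cast<double>(static_cast<uint32_t>(x));
  return hi * 4294967296.0 + lo; // 4294967296.0 == 2^32
}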
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as 128-bit shuffles.
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDNodeFlags Flags=SDNodeFlags())
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isFreeToSplitVector(SDValue V, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG)
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
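A hedged intrinsics-level sketch of the same idea: sign-extend each 32-bit lane in-register first so that PACKSSDW's signed saturation cannot clamp anything, then pack down to 16-bit lanes. The helper name truncV8i32ToV8i16 is hypothetical; the intrinsics are standard SSE2.
#include <immintrin.h>

// Sketch: truncate eight i32 lanes (in two __m128i) to eight i16 lanes.
// Shift-left then arithmetic-shift-right sign-extends the low 16 bits in
// place, so the saturation in packssdw becomes a no-op truncation.
static __m128i truncV8i32ToV8i16(__m128i lo, __m128i hi) {
  lo = _mm_srai_epi32(_mm_slli_epi32(lo, 16), 16);
  hi = _mm_srai_epi32(_mm_slli_epi32(hi, 16), 16);
  return _mm_packs_epi32(lo, hi); // PACKSSDW
}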
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
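As an illustration of the PSUBUS idea (a sketch, not the backend's exact selection): for unsigned 16-bit lanes, a <= b exactly when the saturating difference a - b is zero, so an unsigned compare mask can be built from PSUBUSW plus a compare against zero. The function name uleU16 is hypothetical.
#include <immintrin.h>

// Sketch: unsigned a <= b per 16-bit lane without a native unsigned compare.
// _mm_subs_epu16(a, b) is zero exactly when a <= b (unsigned).
static __m128i uleU16(__m128i a, __m128i b) {
  __m128i diff = _mm_subs_epu16(a, b); // PSUBUSW
  return _mm_cmpeq_epi16(diff, _mm_setzero_si128());
}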
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static APInt getBLENDIBlendMask(SDValue V)
Get the expanded blend mask from a BLENDI node.
static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS, unsigned X86CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< unsigned > CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS, unsigned NumSignificantBitsRHS)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISDPAC...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static SDValue getSHUFPDImmForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > OriginalMask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
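At the source level these are the ordinary std::atomic read-modify-write operations; on x86 they are typically selected to LOCK-prefixed instructions (a general observation about x86 codegen, not a claim about this routine's exact output). The function name bump is hypothetical.
#include <atomic>

// Sketch: atomic read-modify-write ops. On x86 the unused-result forms
// typically become LOCK-prefixed memory ops (lock and, lock add, ...),
// and fetch_add with a used result typically becomes lock xadd.
static int bump(std::atomic<int> &counter, std::atomic<int> &flags) {
  flags.fetch_and(~0x1, std::memory_order_seq_cst);        // result unused
  return counter.fetch_add(1, std::memory_order_seq_cst);  // result used
}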
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static void computeKnownBitsForPMADDWD(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd, bool &HasAllowContract)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
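For reference, the value pattern being recognized alternates subtract and add across lanes, which is exactly what the SSE3 ADDSUBPS instruction computes; a hedged sketch using the corresponding intrinsic (the function name addsub is hypothetical).
#include <immintrin.h>

// Sketch of the ADDSUB lane pattern: even lanes subtract, odd lanes add.
// A BUILD_VECTOR whose elements follow this pattern can be matched to a
// single addsub operation instead of a blend of separate add and sub results.
static __m128 addsub(__m128 a, __m128 b) {
  return _mm_addsub_ps(a, b); // { a0-b0, a1+b1, a2-b2, a3+b3 }
}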
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue combineBitOpWithPACK(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
SDValue getGFNICtrlMask(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, MVT VT, unsigned Amt=0)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineOrXorWithSETCC(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue lowerBuildVectorAsBlend(BuildVectorSDNode *BVOp, SDLoc const &DL, X86Subtarget const &Subtarget, SelectionDAG &DAG)
Attempt to lower a BUILD_VECTOR of scalar values to a shuffle of splats representing a blend.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBIts, bool AllowUndefs)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static SDValue combineAndXorSubWithBMI(SDNode *And, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fold AND(Y, XOR(X, NEG(X))) -> ANDN(Y, BLSMSK(X)) if BMI is available.
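The equivalence behind this fold can be checked in plain C++: BLSMSK(x) is x ^ (x - 1), and x ^ -x is its bitwise complement, so masking with x ^ -x equals ANDN with BLSMSK. A minimal sketch (hypothetical helper names, not the DAG code):
#include <cassert>
#include <cstdint>

// Sketch: y & (x ^ -x) == y & ~(x ^ (x - 1)), i.e. ANDN(y, BLSMSK(x)).
static uint32_t viaXorNeg(uint32_t x, uint32_t y) { return y & (x ^ (0u - x)); }
static uint32_t viaBlsmsk(uint32_t x, uint32_t y) { return y & ~(x ^ (x - 1u)); }

int main() {
  for (uint32_t x : {0u, 1u, 6u, 0x80000000u, 0xdeadbeefu})
    assert(viaXorNeg(x, 0xffffffffu) == viaBlsmsk(x, 0xffffffffu));
}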
static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerVectorCTLZ_GFNI(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true, bool AllowAVX512=true)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
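RDTSC is the familiar example of such an intrinsic: the instruction returns its result split across EDX:EAX, and the compiler-level intrinsic reassembles the halves into one 64-bit value. Illustrative only; the header and availability of __rdtsc depend on the toolchain (GCC/Clang expose it via <x86intrin.h>).
#include <x86intrin.h>
#include <cstdint>

// Sketch: __rdtsc() wraps RDTSC, whose raw result lives in EDX:EAX.
static uint64_t cycles() {
  return __rdtsc();
}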
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, Register Reg)
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue FixupMMXIntrinsicTypes(SDNode *N, SelectionDAG &DAG)
static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef< int > Mask, int BroadcastableElement=0)
Test whether the specified input (0 or 1) is a broadcast/splat blended by the given mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitOpWithShift(unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
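A hedged intrinsics sketch of the PSADBW trick commonly used for byte sums: summing absolute differences against zero adds the bytes of each 8-byte half into a 64-bit lane, and the two partial sums are then combined. The function name sumBytes is hypothetical.
#include <immintrin.h>
#include <cstdint>

// Sketch: horizontal sum of the 16 bytes in v via PSADBW against zero.
// _mm_sad_epu8 produces two 64-bit lanes, each the sum of 8 bytes (<= 2040).
static uint32_t sumBytes(__m128i v) {
  __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128());
  return static_cast<uint32_t>(_mm_cvtsi128_si32(sad)) +
         static_cast<uint32_t>(_mm_extract_epi16(sad, 4));
}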
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth=0)
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG)
static void growShuffleMask(ArrayRef< int > SrcMask, SmallVectorImpl< int > &DstMask, unsigned SrcSizeInBits, unsigned DstSizeInBits)
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to compute the in-lane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
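This entry carries no brief; judging from the name alone, the lowering turns a compare-with-zero into a count-leading-zeros followed by a logical shift right. The scalar identity, sketched with the LZCNT intrinsic (assumes LZCNT support; the function name isZero is hypothetical), is:
#include <immintrin.h>
#include <cstdint>

// Sketch (requires LZCNT): (x == 0) ? 1 : 0  ==  lzcnt32(x) >> 5,
// since lzcnt of zero is 32 and of any nonzero value is at most 31.
static uint32_t isZero(uint32_t x) {
  return _lzcnt_u32(x) >> 5;
}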
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isX86CCSigned(X86::CondCode X86CC)
Return true if the condition is a signed comparison operation.
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt=0)
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT, int Scale, int Offset, unsigned ExtOpc, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as an any/signed/zero extension.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineBitOpWithMOVMSK(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue combineAndNotIntoANDNP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a VSETCC 256/512-bit vector into two new 128/256 ones and then concatenate the result back.
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
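The encoding itself is simple: each of the four 2-bit fields selects a source lane, low lane first, exactly as the PSHUFD/SHUFPS-style imm8 expects. A minimal sketch (hypothetical helper name; undef handling omitted):
#include <cstdint>

// Sketch: build the 4-lane shuffle imm8 from a 4-element mask.
// Lane i of the result takes source lane Mask[i]; each index uses 2 bits.
static uint8_t shuffleImm8(const int Mask[4]) {
  return static_cast<uint8_t>((Mask[0] & 3) | ((Mask[1] & 3) << 2) |
                              ((Mask[2] & 3) << 4) | ((Mask[3] & 3) << 6));
}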
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is an X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static cl::opt< bool > WidenShift("x86-widen-shift", cl::init(true), cl::desc("Replace narrow shifts with wider shifts."), cl::Hidden)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
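In source terms the pattern is an ordinary clamp-then-truncate; a hedged scalar sketch for i32 -> i16 (hypothetical function name), which on vectors maps to PACKSS-style saturating instructions:
#include <algorithm>
#include <cstdint>

// Sketch: signed-saturating truncation i32 -> i16, i.e.
// truncate(smin(smax(x, INT16_MIN), INT16_MAX)).
static int16_t truncSSat(int32_t x) {
  return static_cast<int16_t>(std::min(std::max(x, -32768), 32767));
}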
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineINTRINSIC_WO_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, unsigned RootOpc, MVT RootVT, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static void computeKnownBitsForPMADDUBSW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector and a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask, bool ForceHorizOp)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of an ISD::CondCode to the X86-specific condition code,...
static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL, SDValue Mask)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
static bool isLegalConversion(MVT VT, MVT FloatVT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue combineINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isHorizOp(unsigned Opcode)
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDNodeFlags Flags=SDNodeFlags())
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static unsigned getSHUFPDImm(ArrayRef< int > Mask)
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned Depth)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXor1SetCC(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue MatchVectorAllEqualTest(SDValue OrigLHS, SDValue OrigRHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue widenBuildVec(SDNode *Extend, SelectionDAG &DAG)
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the results back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue combineAndOrForCcmpCtest(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &ST)
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget, TargetLowering::DAGCombinerInfo &DCI)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, unsigned RootOpc, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses, bool AllowSubAddOrAddSubContract)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isMaskableNode(SDValue V, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineSubABS(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Folds (and X, (or Y, ~Z)) --> (and X, ~(and ~Y, Z)). This undoes the inverse fold performed in InstCom...
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, unsigned RootOpcode, MVT RootVT, ArrayRef< int > BaseMask, int Depth, ArrayRef< const SDNode * > SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static SDValue combineAVG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code?
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, const SDLoc &DL, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
auto IsFreeTruncation
static const unsigned FramePtr
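Many of the shuffle-mask predicates listed above (isUndefOrZero, isAnyZero, isUndefOrInRange, resolveZeroablesFromTargetShuffle, ...) operate on integer masks in which negative sentinel values mark lanes with no concrete source element; as far as I recall the in-tree convention is -1 for an undef lane (SM_SentinelUndef) and -2 for a known-zero lane (SM_SentinelZero). The snippet below is a purely illustrative restatement of that convention under local, hypothetical names, not the in-tree implementation:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;

// Hypothetical local mirror of the sentinel convention described above:
// -1 = undef lane, -2 = known-zero lane, >= 0 = index into the inputs.
enum : int { ExSentinelUndef = -1, ExSentinelZero = -2 };

// Analogue of isUndefOrZero: the lane carries no data from either input.
static bool exampleIsUndefOrZero(int M) {
  return M == ExSentinelUndef || M == ExSentinelZero;
}

// Analogue of isAnyZero: does any lane of the mask use the zero sentinel?
static bool exampleAnyZero(ArrayRef<int> Mask) {
  return llvm::any_of(Mask, [](int M) { return M == ExSentinelZero; });
}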
The Input class is used to parse a YAML document into in-memory structs and vectors.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & BFloat()
Definition APFloat.h:295
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEquad()
Definition APFloat.h:298
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static const fltSemantics & x87DoubleExtended()
Definition APFloat.h:317
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:296
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
opStatus
IEEE-754R 7: Default exception handling.
Definition APFloat.h:360
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6060
static LLVM_ABI APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition APFloat.cpp:6086
void clearSign()
Definition APFloat.h:1280
opStatus next(bool nextDown)
Definition APFloat.h:1236
void changeSign()
Definition APFloat.h:1279
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1061
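As a hedged usage sketch of the APFloat conversion entry points listed above (the constant and semantics choices are arbitrary illustrations, not values used by the lowering code):

#include "llvm/ADT/APFloat.h"
using namespace llvm;

static void apfloatConvertExample() {
  // 1.1 is not exactly representable in binary floating point.
  APFloat Val(APFloat::IEEEdouble(), "1.1");
  bool LosesInfo = false;
  APFloat::opStatus Status = Val.convert(
      APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
  // Status reports inexact/overflow/etc.; LosesInfo is true because the
  // double value cannot be reproduced exactly at single precision.
  (void)Status;
  (void)LosesInfo;

  // Factory for signed zero, plus in-place sign manipulation.
  APFloat NegZero = APFloat::getZero(APFloat::IEEEsingle(), /*Negative=*/true);
  NegZero.clearSign(); // now +0.0
}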
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:449
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:229
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:423
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1670
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
LLVM_ABI uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition APInt.cpp:520
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1512
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
void setBit(unsigned BitPosition)
Set the bit at the position given by "bitPosition" to 1.
Definition APInt.h:1330
APInt abs() const
Get the absolute value.
Definition APInt.h:1795
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:371
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
LLVM_ABI APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1666
void setSignBit()
Set the sign bit to 1.
Definition APInt.h:1340
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1111
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition APInt.h:216
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition APInt.h:1249
bool eq(const APInt &RHS) const
Equality comparison.
Definition APInt.h:1079
int32_t exactLogBase2() const
Definition APInt.h:1783
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1396
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition APInt.h:834
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:435
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition APInt.h:1628
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1598
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
unsigned countTrailingZeros() const
Definition APInt.h:1647
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
void flipAllBits()
Toggle every bit to its opposite value.
Definition APInt.h:1452
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1615
LLVM_ABI void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition APInt.cpp:397
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition APInt.h:1435
unsigned logBase2() const
Definition APInt.h:1761
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:827
void setAllBits()
Set every bit to 1.
Definition APInt.h:1319
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:471
bool isMask(unsigned numBits) const
Definition APInt.h:488
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition APInt.h:405
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:334
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1150
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1367
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition APInt.h:873
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1257
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:440
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:306
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:341
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1388
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:482
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:432
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:389
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:858
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:851
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1656
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition APInt.h:399
LLVM_ABI APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition APInt.cpp:973
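A small usage sketch for a few of the APInt bit-manipulation helpers listed above (illustrative values only; nothing here is taken from the lowering code itself):

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

static void apintExample() {
  // A 32-bit value with the low 8 bits set: 0x000000FF.
  APInt LowMask = APInt::getLowBitsSet(32, 8);
  assert(LowMask.popcount() == 8 && LowMask.isMask(8));

  // The sign mask has only the top bit set; shifting it down yields 1.
  APInt SignMask = APInt::getSignMask(32); // 0x80000000
  assert(SignMask.isSignMask() && SignMask.lshr(31).isOne());

  // Zero-extension widens with zeros, so the new high bits are all clear.
  APInt Wide = LowMask.zext(64);
  assert(Wide.getBitWidth() == 64 && Wide.countl_zero() == 56);
}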
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:183
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition ArrayRef.h:206
iterator begin() const
Definition ArrayRef.h:135
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:191
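A brief sketch of the ArrayRef view operations listed above (local example data; ArrayRef never owns its storage, so the backing array must outlive the view):

#include "llvm/ADT/ArrayRef.h"
#include <cassert>
using namespace llvm;

static void arrayRefExample() {
  int Mask[] = {0, 1, 2, 3, 4, 5};
  ArrayRef<int> Ref(Mask);
  assert(Ref.size() == 6 && !Ref.empty());

  // slice(N, M): skip the first N elements, keep the next M.
  ArrayRef<int> Mid = Ref.slice(2, 3); // views {2, 3, 4}
  // drop_back(N): drop the last N elements.
  ArrayRef<int> Front = Ref.drop_back(4); // views {0, 1}
  assert(Mid.equals({2, 3, 4}) && Front.equals({0, 1}));
}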
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v). usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v). minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v). maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
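The BinOp list above enumerates the combining operations an atomicrmw instruction can perform. Below is a minimal sketch of creating one through IRBuilder; the exact CreateAtomicRMW signature (with a MaybeAlign parameter) is my assumption for recent LLVM and is not taken from this file:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical helper: emit `atomicrmw add ptr, i32 1 seq_cst` at the
// builder's current insertion point. AtomicRMWInst::Add selects the
// *p = old + v behavior described above; getOrdering() on the result
// would report SequentiallyConsistent.
static AtomicRMWInst *emitAtomicIncrement(IRBuilder<> &B, Value *Ptr) {
  return B.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, B.getInt32(1),
                           MaybeAlign(),
                           AtomicOrdering::SequentiallyConsistent);
}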
This is an SDNode representing atomic operations.
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
size_type count() const
count - Returns the number of bits which are set.
Definition BitVector.h:181
bool any() const
any - Returns true if any bit is set.
Definition BitVector.h:189
bool none() const
none - Returns true if none of the bits are set.
Definition BitVector.h:207
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
LLVM_ABI SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI bool isConstant() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:765
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static LLVM_ABI Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:167
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:237
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
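A short sketch of the DenseMap operations listed above (hypothetical key/value choices, purely for illustration):

#include "llvm/ADT/DenseMap.h"
#include <cassert>
using namespace llvm;

static void denseMapExample() {
  DenseMap<unsigned, const char *> Names;

  // try_emplace constructs the value only if the key is not already present.
  auto [It, Inserted] = Names.try_emplace(1, "xmm1");
  assert(Inserted && It->second != nullptr);

  // insert() with an existing key changes nothing and reports false.
  bool InsertedAgain = Names.insert({1, "other"}).second;
  assert(!InsertedAgain && Names.size() == 1);

  // find() returns end() when the key is absent.
  assert(Names.find(2) == Names.end() && !Names.empty());
}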
Tagged union holding either a T or a Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:774
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition Function.h:903
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:727
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
LLVM_ABI bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition Globals.cpp:437
ThreadLocalMode getThreadLocalMode() const
Module * getParent()
Get the module that this global value is contained inside of...
This class is used to form a handle around another node that is persistent and is updated across invo...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
Class to represent integer types.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
LLVM_ABI MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
LLVM_ABI MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool bitsGE(MVT VT) const
Return true if this has no less bits than VT.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
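A hedged sketch of the MVT queries listed above, using v8f32 as an arbitrary example type (the header path for MVT has moved between LLVM releases, so treat the include as approximate):

#include "llvm/CodeGenTypes/MachineValueType.h" // MVT; path varies by LLVM version
#include <cassert>
using namespace llvm;

static void mvtExample() {
  // v8f32: eight 32-bit floats, i.e. a 256-bit (YMM-sized) vector.
  MVT V = MVT::getVectorVT(MVT::f32, 8);
  assert(V.isVector() && V.is256BitVector());
  assert(V.getVectorNumElements() == 8 && V.getScalarSizeInBits() == 32);

  // Same shape with integer elements: v8i32.
  MVT VI = V.changeVectorElementTypeToInteger();
  assert(VI.getVectorElementType() == MVT::i32);

  // Halve the element count: v4f32, a 128-bit (XMM-sized) vector.
  MVT Half = V.getHalfNumVectorElementsVT();
  assert(Half.is128BitVector());
}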
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
succ_reverse_iterator succ_rbegin()
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
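A brief, hypothetical illustration of the MachineFrameInfo calls above; MF is an assumed MachineFunction reference and the sizes, offsets and alignments are arbitrary.

// Sketch only: reserve a 16-byte spill slot and describe an incoming
// stack argument at SP+16.
MachineFrameInfo &MFI = MF.getFrameInfo();
int SpillFI = MFI.CreateStackObject(/*Size=*/16, Align(16), /*isSpillSlot=*/true);
int ArgFI = MFI.CreateFixedObject(/*Size=*/8, /*SPOffset=*/16, /*IsImmutable=*/true);
MFI.setObjectAlignment(SpillFI, Align(32)); // tighten the alignment later if needed
(void)ArgFI;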
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
void moveAdditionalCallInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New call site info.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
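The MachineFunction accessors above commonly feed getMachineMemOperand when lowering code attaches memory information to frame-index accesses; the fragment below is a sketch under the assumption that MF and a frame index FI already exist in the caller.

// Sketch only: describe a 64-bit store to stack slot FI.
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOStore, LLT::scalar(64), Align(8));
(void)MMO;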
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
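As a hedged example of the MachineInstrBuilder chaining above, the fragment below emits a 64-bit LEA through BuildMI; TII, MRI, MBB, MI and DL are assumed from surrounding custom-inserter code, and BaseReg/Disp are placeholder values.

// Sketch only: Dst = BaseReg + Disp, written as the standard five X86
// memory operands (base, scale, index, displacement, segment).
Register Dst = MRI.createVirtualRegister(&X86::GR64RegClass);
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), Dst)
    .addReg(BaseReg) // base
    .addImm(1)       // scale
    .addReg(0)       // no index
    .addImm(Disp)    // displacement
    .addReg(0);      // no segment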
Representation of each machine instruction.
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of a block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
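A small, hypothetical example of the MachineOperand/MachineRegisterInfo members above, assuming MI and MF come from the caller:

// Sketch only: if operand 0 is a virtual register, make a temporary of the
// same register class for a rewritten definition.
const MachineOperand &MO = MI.getOperand(0);
if (MO.isReg() && MO.getReg().isVirtual()) {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register Tmp = MRI.createVirtualRegister(MRI.getRegClass(MO.getReg()));
  (void)Tmp; // would become the destination of the replacement instruction
}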
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
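As a hedged sketch of the masked-memory accessors above (the helper name is invented and the transform is meant only as an illustration): an unindexed, non-truncating, non-compressing masked store whose mask is all ones can be rebuilt as a plain store.

static SDValue lowerAllOnesMaskedStore(SDValue Op, SelectionDAG &DAG) {
  auto *MS = cast<MaskedStoreSDNode>(Op.getNode());
  if (MS->isTruncatingStore() || MS->isCompressingStore() ||
      MS->getAddressingMode() != ISD::UNINDEXED ||
      !ISD::isConstantSplatVectorAllOnes(MS->getMask().getNode()))
    return SDValue(); // leave other forms alone
  return DAG.getStore(MS->getChain(), SDLoc(Op), MS->getValue(),
                      MS->getBasePtr(), MS->getMemOperand());
}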
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:353
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:303
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
SDNode * getGluedUser() const
If this node has a glue value with a user, return the user (there is at most one).
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
SDNodeFlags getFlags() const
TypeSize getValueSizeInBits(unsigned ResNo) const
Returns MVT::getSizeInBits(getValueType(ResNo)).
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static LLVM_ABI bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there is any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
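A minimal sketch of the SDValue/SDNode accessors above, roughly what the in-tree peekThroughOneUseBitcasts helper does (shown here only as an illustration):

static SDValue peekThroughOneUseBitcastsSketch(SDValue V) {
  // Walk through BITCAST nodes whose input has a single use.
  while (V.getOpcode() == ISD::BITCAST && V.getOperand(0).hasOneUse())
    V = V.getOperand(0);
  return V;
}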
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
virtual bool isTargetStrictFPOpcode(unsigned Opcode) const
Returns true if a node with the given target-specific opcode has strict floating-point semantics.
Help to insert SDNodeFlags automatically in transforming.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
LLVM_ABI SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value cast to the target's desired shift amount type.
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
LLVM_ABI SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI std::optional< unsigned > getValidShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has a uniform shift amount that is less than the element bit-width of the shi...
LLVM_ABI SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, SDNodeFlags Flags=SDNodeFlags())
LLVM_ABI std::optional< unsigned > getValidMinimumShiftAmount(SDValue V, const APInt &DemandedElts, unsigned Depth=0) const
If a SHL/SRA/SRL node V has shift amounts that are all less than the element bit-width of the shift n...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
LLVM_ABI SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static LLVM_ABI bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
static int getSplatMaskIndex(ArrayRef< int > Mask)
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
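The SelectionDAG builders listed above compose in the usual way; the following is a hedged sketch (helper name invented) that broadcasts lane 0 of a vector using getVectorShuffle and getUNDEF.

static SDValue splatLowElement(SelectionDAG &DAG, const SDLoc &DL, SDValue Vec) {
  MVT VT = Vec.getSimpleValueType();
  // An all-zero shuffle mask picks element 0 of the first operand for
  // every result lane.
  SmallVector<int, 16> Mask(VT.getVectorNumElements(), 0);
  return DAG.getVectorShuffle(VT, DL, Vec, DAG.getUNDEF(VT), Mask);
}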
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
size_type size() const
Definition SmallSet.h:170
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
static constexpr size_t npos
Definition StringRef.h:57
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition StringRef.h:172
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
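As an illustration of the StringRef/StringSwitch members above, the fragment below shows the usual name-to-register mapping pattern; RegName is an assumed StringRef and only two cases are shown.

// Sketch only: map a textual register name onto a physical register.
Register Reg = StringSwitch<Register>(RegName)
                   .Case("esp", X86::ESP)
                   .Case("rsp", X86::RSP)
                   .Default(Register());
if (!Reg.isValid())
  report_fatal_error(Twine("Invalid register name \"") + RegName + "\".");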
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
virtual bool hasAndNot(SDValue X) const
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.stacksave/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
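A hedged sketch of how a target's constructor typically uses the TargetLoweringBase setters above; the fragment assumes it runs inside a TargetLowering-derived constructor with a Subtarget member, and the specific types and actions are arbitrary examples rather than X86's real configuration.

// Sketch only: register a class, pick legalize actions, request combines.
addRegisterClass(MVT::i32, &X86::GR32RegClass);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Legal);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTargetDAGCombine({ISD::AND, ISD::OR});
setPrefLoopAlignment(Align(16));
computeRegisterProperties(Subtarget.getRegisterInfo());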
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
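A minimal sketch of the demanded-bits handshake the TargetLowering helpers above support, assuming DAG, DCI, TLI and Op come from a PerformDAGCombine-style caller and that only the low 32 bits of a 64-bit value are demanded (an arbitrary example).

// Sketch only: try to simplify Op given the demanded bits, then commit.
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                      !DCI.isBeforeLegalizeOps());
APInt DemandedBits = APInt::getLowBitsSet(64, 32);
KnownBits Known;
if (TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO))
  DCI.CommitTargetLoweringOpt(TLO);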
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSBinFormatCOFF() const
Tests whether the OS uses the COFF binary format.
Definition Triple.h:775
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:156
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
use_iterator use_begin()
Definition Value.h:364
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
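A hypothetical example of the Value/Use iteration members above, of the kind lowering code uses when classifying how an IR value is consumed (the predicate name is made up):

static bool allUsersAreStores(const Value *V) {
  // Reject values with no uses as well as values with a non-store user.
  if (V->use_empty())
    return false;
  for (const User *U : V->users())
    if (!isa<StoreInst>(U))
      return false;
  return true;
}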
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
void setAMXProgModel(AMXProgModelEnum Model)
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
bool hasBasePointer(const MachineFunction &MF) const
Register getPtrSizedFrameRegister(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
Register getPtrSizedStackRegister(const MachineFunction &MF) const
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
bool hasAnyFMA() const
bool hasSSE1() const
bool avoidMFence() const
Avoid use of mfence for fence seq_cst, and instead use lock or.
bool hasBitScanPassThrough() const
bool hasSSE42() const
const X86TargetLowering * getTargetLowering() const override
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
bool canUseCMOV() const
bool isTargetDarwin() const
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
const X86InstrInfo * getInstrInfo() const override
bool useAVX512Regs() const
bool hasSSE3() const
bool isCallingConvWin64(CallingConv::ID CC) const
bool hasAVX512() const
bool canExtendTo512DQ() const
bool hasSSE41() const
bool hasSSE2() const
bool hasSSSE3() const
bool hasInt256() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
unsigned getPreferVectorWidth() const
const X86FrameLowering * getFrameLowering() const override
bool useBWIRegs() const
bool hasAVX2() const
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue Ptr, SDValue Val, SDValue Mask) const override
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const override
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicitly zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isTargetCanonicalSelect(SDNode *N) const override
Return true if the given select/vselect should be considered canonical and not be transformed.
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
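The TargetLowering overrides listed above are typically consulted to gate a combine before it rewrites a node. A minimal sketch of that pattern, assuming the usual X86ISelLowering context; the helper name combineShlSketch is hypothetical and this is not code from this file:

static SDValue combineShlSketch(SDNode *N, const X86TargetLowering &TLI) {
  EVT VT = N->getValueType(0);
  // Bail out unless the target both wants this type for SHL and can select it.
  if (!TLI.isTypeDesirableForOp(ISD::SHL, VT) ||
      !TLI.isOperationLegalOrCustom(ISD::SHL, VT))
    return SDValue();
  return SDValue(); // the actual rewrite would go here
}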
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
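As a quick illustration of the insert()/contains() pair above, a hedged sketch (hypothetical helper, not part of this file) of the visited-set bookkeeping it usually serves:

static bool markVisited(SmallDenseSet<const SDNode *, 16> &Visited,
                        const SDNode *N) {
  // insert() returns {iterator, bool}; the bool is false when N was already seen.
  return Visited.insert(N).second;
}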
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:201
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define INT64_MIN
Definition DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition APInt.cpp:3009
@ COND_NE
Not equal.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ X86_ThisCall
Similar to X86_StdCall.
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
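A hedged sketch of how the 32-bit register-argument conventions above overlap; the helper name passesFirstArgInECX is hypothetical:

static bool passesFirstArgInECX(CallingConv::ID CC) {
  return CC == CallingConv::X86_FastCall || // first two integer args in ECX/EDX
         CC == CallingConv::X86_ThisCall;   // the 'this' pointer goes in ECX
}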
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:531
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition ISDOpcodes.h:140
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:464
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:898
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition ISDOpcodes.h:151
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:712
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:478
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:815
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:669
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:463
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition ISDOpcodes.h:130
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:887
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:477
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:174
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:707
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition ISDOpcodes.h:236
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:696
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:909
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:933
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:719
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:333
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isExtVecInRegOpcode(unsigned Opcode)
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
LLVM_ABI bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false, bool AllowTruncation=false)
Hook for matching ConstantSDNode predicate.
LLVM_ABI bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
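A hedged sketch of a guard built from the ISD predicates listed above (hypothetical helper, not code from this file):

static bool isSimpleNormalLoad(SDNode *N) {
  // Plain load only: non-extending, unindexed, and neither volatile nor atomic.
  return ISD::isNormalLoad(N) && cast<LoadSDNode>(N)->isSimple();
}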
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op > m_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
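A hedged sketch combining the IR-level matchers above to recognize (X & Y) == 0 with the operands in either order; the helper name matchAndIsZero is hypothetical:

static bool matchAndIsZero(Value *V, Value *&X, Value *&Y) {
  using namespace PatternMatch;
  return match(V, m_SpecificICmp(ICmpInst::ICMP_EQ,
                                 m_c_And(m_Value(X), m_Value(Y)), m_Zero()));
}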
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
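A hedged fragment showing how the libcall queries above are typically consulted before expanding a conversion; SrcVT and the i128 result type are illustrative assumptions:

RTLIB::Libcall LC = RTLIB::getFPTOSINT(SrcVT, MVT::i128);
if (LC == RTLIB::UNKNOWN_LIBCALL)
  report_fatal_error("conversion has no libcall"); // no expansion path available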
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Opcode_match m_Opc(unsigned Opcode)
BinaryOpc_match< LHS, RHS > m_Srl(const LHS &L, const RHS &R)
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< LHS, RHS, IDX > m_InsertSubvector(const LHS &Base, const RHS &Sub, const IDX &Idx)
UnaryOpc_match< Opnd > m_Abs(const Opnd &Op)
Or< Preds... > m_AnyOf(const Preds &...preds)
And< Preds... > m_AllOf(const Preds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
UnaryOpc_match< Opnd > m_AnyExt(const Opnd &Op)
auto m_Node(unsigned Opcode, const OpndPreds &...preds)
TernaryOpc_match< T0_P, T1_P, T2_P > m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
CondCode_match m_SpecificCondCode(ISD::CondCode CC)
Match a conditional code SDNode with a specific ISD::CondCode.
auto m_SpecificVectorElementVT(EVT RefVT, const Pattern &P)
Match a vector ValueType.
CondCode_match m_CondCode()
Match any conditional code SDNode.
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
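A hedged sketch composing the SelectionDAG matchers above into one query; the helper name is hypothetical:

static bool isSetCCOfAndWithConstant(SDNode *N, const SelectionDAG &DAG) {
  using namespace SDPatternMatch;
  // Matches setcc(and(...), C, <any condcode>) for any integer constant C.
  return sd_match(N, &DAG,
                  m_SetCC(m_Opc(ISD::AND), m_ConstInt(), m_CondCode()));
}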
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
Invariant opcodes: All instruction sets have these as their low opcodes.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
@ PTR32_UPTR
Definition X86.h:217
@ PTR64
Definition X86.h:218
@ PTR32_SPTR
Definition X86.h:216
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FMAX
Floating point max and min.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask, and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of a MMX vector and zero out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
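A hedged fragment showing how one of the target nodes listed above is usually materialized; DAG, DL, V1, and V2 are assumed to be in scope, and the 0x5 selector is illustrative:

// BLENDI takes the two source vectors plus an i8 immediate selector.
SDValue Blend = DAG.getNode(X86ISD::BLENDI, DL, MVT::v4f32, V1, V2,
                            DAG.getTargetConstant(0x5, DL, MVT::i8));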
Define some predicates that are used for node matching.
@ AddrNumOperands
Definition X86BaseInfo.h:36
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
int getRoundingModeX86(unsigned RM)
Convert LLVM rounding mode to X86 rounding mode.
int getCCMPCondFlagsFromCondCode(CondCode CC)
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
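A hedged fragment using the splat query above; Op is assumed to be an SDValue in scope:

APInt SplatVal;
if (X86::isConstantSplat(Op, SplatVal, /*AllowPartialUndefs=*/false) &&
    SplatVal.isSignMask()) {
  // Every element is the sign-bit mask, e.g. a candidate for an FNEG-style XOR.
}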
initializer< Ty > init(const Ty &Val)
constexpr double e
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:477
@ Length
Definition DWP.cpp:477
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition Utils.cpp:1606
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:352
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
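A hedged fragment decoding a 4-lane blend immediate with the helper above; lanes taken from the second source are encoded with an offset of NumElts:

SmallVector<int, 4> Mask;
DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
// Bits 0 and 2 of the immediate select the second source, so Mask should be
// { 4, 1, 6, 3 }.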
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
Definition MathExtras.h:284
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
auto unique(Range &&R, Predicate P)
Definition STLExtras.h:2076
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1588
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:331
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1777
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:339
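A hedged fragment showing the classic strength-reduction check built from the math helpers above:

uint64_t MulAmt = 64;
if (isPowerOf2_64(MulAmt)) {
  unsigned ShAmt = Log2_64(MulAmt); // 6: the multiply can become a left shift
  (void)ShAmt;
}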
LLVM_ABI bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool SrcIsMem)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, Register Reg)
Replace the address used in the instruction with the direct memory reference.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:333
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:222
bool isAlpha(char C)
Checks if character C is a valid letter as classified by the "C" locale.
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI SDValue peekThroughTruncates(SDValue V)
Return the non-truncated source operand of V if it exists.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
@ Default
-O2, -Os
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ Success
The lock was released successfully.
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
Definition ModRef.h:34
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1994
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
To bit_cast(const From &from) noexcept
Definition bit.h:90
void replace(R &&Range, const T &OldValue, const T &NewValue)
Provide wrappers to std::replace which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1860
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
Definition MCRegister.h:21
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
LLVM_ABI bool isNullConstantOrUndef(SDValue V)
Returns true if V is a constant integer zero or an UNDEF node.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
Definition STLExtras.h:1954
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction, starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI)
Check if physical register Reg is used after MBI.
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1835
constexpr unsigned BitWidth
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition STLExtras.h:1961
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, Register Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction; that is, a plain dereference of the given base register, with no scale, index, or displacement.
static uint32_t extractBits(uint64_t Val, uint32_t Hi, uint32_t Lo)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
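A brief illustrative sketch of commonAlignment; the values and the alignExample name are chosen here for illustration, not taken from this file.

#include "llvm/Support/Alignment.h"
#include <cassert>

void alignExample() {
  // A 16-byte-aligned base plus an offset of 8 is only guaranteed to be
  // 8-byte aligned.
  assert(llvm::commonAlignment(llvm::Align(16), 8) == llvm::Align(8));
  // Offsets that are multiples of the base alignment preserve it.
  assert(llvm::commonAlignment(llvm::Align(16), 32) == llvm::Align(16));
}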
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2108
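Several of the STLExtras range helpers listed above (none_of, count, count_if, find_if, is_contained, all_equal) follow the same pattern; a short illustrative sketch, with values and the rangeHelpers name chosen for this example only.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

void rangeHelpers() {
  llvm::SmallVector<int, 8> V = {2, 4, 4, 6};
  assert(llvm::none_of(V, [](int X) { return X & 1; }) && "no odd values");
  assert(llvm::count(V, 4) == 2);
  assert(llvm::count_if(V, [](int X) { return X > 3; }) == 3);
  assert(llvm::is_contained(V, 6));
  auto It = llvm::find_if(V, [](int X) { return X > 4; });
  assert(It != V.end() && *It == 6);
  assert(llvm::all_equal({7, 7, 7}));
}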
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_CAST_MMX
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
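A hedged sketch of decoding a pshufd immediate, assuming the declarations from the X86 shuffle-decode helpers are visible as they are in this file; the immediate 0x1B is chosen to show a full reverse.

llvm::SmallVector<int, 4> Mask;
// For pshufd, bits [2*i+1 : 2*i] of the immediate select the source element
// for destination element i, so 0x1B (binary 00 01 10 11) reverses a
// 4 x 32-bit vector.
llvm::DecodePSHUFMask(/*NumElts=*/4, /*ScalarBits=*/32, /*Imm=*/0x1B, Mask);
// Mask now holds {3, 2, 1, 0}.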
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition STLExtras.h:1582
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:316
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset], i.e., a base register plus a simple displacement, with no scale or index.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
constexpr T rotl(T V, int R)
Definition bit.h:355
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
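A brief sketch of the bit utilities referenced here (popcount, bit_floor, rotl); the values and the bitOps name are illustrative.

#include "llvm/ADT/bit.h"
#include <cassert>
#include <cstdint>

void bitOps() {
  assert(llvm::popcount(0xF0u) == 4 && "four bits set");
  assert(llvm::bit_floor(100u) == 64u && "largest power of two <= 100");
  // rotl is constexpr, so the rotation can be checked at compile time.
  static_assert(llvm::rotl(uint8_t{0x81}, 1) == uint8_t{0x03});
}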
@ SM_SentinelUndef
@ SM_SentinelZero
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
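A short sketch of getSplatIndex on a mask containing undef (-1) entries, assuming the declaration in llvm/Analysis/VectorUtils.h; the mask values are illustrative.

#include "llvm/Analysis/VectorUtils.h"

// Undef elements (-1) are ignored; every defined element selects index 3.
int SplatIdx = llvm::getSplatIndex({-1, 3, 3, -1}); // SplatIdx == 3
// A mask such as {0, 1} has no common index and would give -1.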
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
#define EQ(a, b)
Definition regexec.c:65
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition ValueTypes.h:94
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition ValueTypes.h:217
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:212
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is replaced by the given EltVT.
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
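A hedged sketch exercising several of the EVT queries listed above on a v4f32 type; the evtQueries function and the chosen type are illustrative only.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

void evtQueries(llvm::LLVMContext &Ctx) {
  llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4); // v4f32
  assert(VT.is128BitVector() && VT.isFloatingPoint());
  assert(VT.getVectorNumElements() == 4 && VT.getScalarSizeInBits() == 32);
  // Same shape with integer elements: v4i32.
  llvm::EVT IntVT = VT.changeVectorElementTypeToInteger();
  assert(IntVT.getVectorElementType() == llvm::MVT::i32);
  // Halving and doubling the element count: v2f32 and v8f32.
  assert(VT.getHalfNumVectorElementsVT(Ctx).getVectorNumElements() == 2);
  assert(VT.getDoubleNumVectorElementsVT(Ctx).getVectorNumElements() == 8);
}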
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
static LLVM_ABI KnownBits sadd_sat(const KnownBits &LHS, const KnownBits &RHS)
Compute knownbits resulting from llvm.sadd.sat(LHS, RHS)
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition KnownBits.h:186
static LLVM_ABI KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits from zero-extended multiply-hi.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned countMaxTrailingZeros() const
Returns the maximum number of trailing zero bits possible.
Definition KnownBits.h:274
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition KnownBits.h:289
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition KnownBits.h:86
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
bool isConstant() const
Returns true if we know the value of all bits.
Definition KnownBits.h:54
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:111
static LLVM_ABI KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition KnownBits.h:296
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition KnownBits.h:196
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
static LLVM_ABI KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition KnownBits.cpp:60
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition KnownBits.h:92
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
static LLVM_ABI std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition KnownBits.h:60
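A hedged sketch combining a few of the KnownBits operations listed above; the 8-bit values and the knownBitsDemo name are illustrative.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

void knownBitsDemo() {
  // Two fully known 8-bit values.
  llvm::KnownBits A = llvm::KnownBits::makeConstant(llvm::APInt(8, 12));
  llvm::KnownBits B = llvm::KnownBits::makeConstant(llvm::APInt(8, 4));
  assert(A.isConstant() && A.getConstant() == 12);

  // Adding two fully known values yields a fully known 16.
  llvm::KnownBits Sum = llvm::KnownBits::add(A, B);
  assert(Sum.isConstant() && Sum.getConstant() == 16);
  // 16 has the bit pattern 00010000, so at least four trailing zeros are known.
  assert(Sum.countMinTrailingZeros() == 4);

  // Intersecting with a completely unknown value keeps only what both agree
  // on, which here is nothing.
  llvm::KnownBits Unknown(8);
  assert(Sum.intersectWith(Unknown).isUnknown());
}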
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
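A hedged sketch of how the MachinePointerInfo factories above are typically combined when describing a stack-slot access; MF, FI, and the slotAtOffset name stand in for values the surrounding lowering code would already have.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"

// Describe a memory access that touches frame slot FI at byte offset 8.
llvm::MachinePointerInfo slotAtOffset(llvm::MachineFunction &MF, int FI) {
  llvm::MachinePointerInfo Base = llvm::MachinePointerInfo::getFixedStack(MF, FI);
  return Base.getWithOffset(8);
}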
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
bool hasNoSignedZeros() const
void setNoSignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
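A minimal sketch of the chained-setter pattern these members support, as used when lowering an operation to a libcall; every parameter of the hypothetical emitLibCall helper stands in for a value the surrounding lowering code already has.

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/CallingConv.h"
#include <utility>

// Build a CallLoweringInfo for a C-calling-convention libcall and lower it.
static std::pair<llvm::SDValue, llvm::SDValue>
emitLibCall(llvm::SelectionDAG &DAG, const llvm::TargetLowering &TLI,
            const llvm::SDLoc &dl, llvm::SDValue Chain, llvm::SDValue Callee,
            llvm::Type *RetTy, llvm::TargetLowering::ArgListTy &&Args) {
  llvm::TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(Chain)
      .setLibCallee(llvm::CallingConv::C, RetTy, Callee, std::move(Args));
  return TLI.LowerCallTo(CLI);
}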
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI bool recursivelyDeleteUnusedNodes(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
This structure is used to pass arguments to the makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its clients that want to combine.
X86AddressMode - This struct holds a generalized full x86 address mode.