#define DEBUG_TYPE "x86-isel"

    "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
    cl::desc(
        "Sets the preferable loop alignment for experiments (as log2 bytes) "
        "for innermost loops only. If specified, this option overrides "
        "alignment set by x86-experimental-pref-loop-alignment."),

    "x86-br-merging-base-cost", cl::init(2),
    cl::desc(
        "Sets the cost threshold for when multiple conditionals will be merged "
        "into one branch versus being split into multiple branches. Merging "
        "conditionals saves branches at the cost of additional instructions. "
        "This value sets the instruction cost limit, below which conditionals "
        "will be merged, and above which conditionals will be split. Set to -1 "
        "to never merge branches."),
93 "x86-br-merging-ccmp-bias",
cl::init(6),
94 cl::desc(
"Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
100 cl::desc(
"Replace narrow shifts with wider shifts."),
104 "x86-br-merging-likely-bias",
cl::init(0),
105 cl::desc(
"Increases 'x86-br-merging-base-cost' in cases that it is likely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "likely, then it is likely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to increase "
110 "the instruction cost threshold. Set to -1 to never merge likely "
115 "x86-br-merging-unlikely-bias",
cl::init(-1),
117 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
118 "that all conditionals will be executed. For example for merging "
119 "the conditionals (a == b && c > d), if its known that a == b is "
120 "unlikely, then it is unlikely that if the conditionals are split "
121 "both sides will be executed, so it may be desirable to decrease "
122 "the instruction cost threshold. Set to -1 to never merge unlikely "
127 "mul-constant-optimization",
cl::init(
true),
128 cl::desc(
"Replace 'mul x, Const' with more effective instructions like "
bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
if (Subtarget.isAtom())
else if (Subtarget.is64Bit())
if (Subtarget.hasSlowDivide32())
if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
if (Subtarget.canUseCMPXCHG16B())
else if (Subtarget.canUseCMPXCHG8B())
if (Subtarget.is64Bit())
for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
if (Subtarget.canUseCMOV()) {
if (Subtarget.is64Bit())
if (Subtarget.is64Bit())
if (Subtarget.is64Bit())
if (Subtarget.is64Bit())
if (!Subtarget.useSoftFloat()) {
if (!Subtarget.is64Bit() && Subtarget.hasX87()) {
if (Subtarget.hasSSE2()) {
for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
if (Subtarget.is64Bit()) {
if (Subtarget.hasAVX10_2()) {
for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
if (Subtarget.is64Bit()) {
if (!Subtarget.hasSSE2()) {
if (Subtarget.is64Bit()) {
} else if (!Subtarget.is64Bit())
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (Subtarget.is64Bit())
if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
if (!Subtarget.hasBMI()) {
if (Subtarget.is64Bit()) {
if (Subtarget.hasLZCNT()) {
for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
(!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
if (Subtarget.is64Bit())
if (Subtarget.hasPOPCNT()) {
if (!Subtarget.hasMOVBE())
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
if (Subtarget.hasSSEPrefetch())
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (!Subtarget.is64Bit())
if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
if (Subtarget.canUseCMPXCHG16B())
if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
    !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
if (Subtarget.isTargetPS())
bool Is64Bit = Subtarget.is64Bit();
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
                 : &X86::FR16RegClass);
                 : &X86::FR32RegClass);
                 : &X86::FR64RegClass);
for (auto VT : { MVT::f32, MVT::f64 }) {
setF16Action(MVT::f16, Promote);
} else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
           (UseX87 || Is64Bit)) {
for (auto VT : { MVT::f32, MVT::f64 }) {
if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
addLegalFPImmediate(APFloat(+0.0f));
addLegalFPImmediate(APFloat(+1.0f));
addLegalFPImmediate(APFloat(-0.0f));
addLegalFPImmediate(APFloat(-1.0f));
addLegalFPImmediate(APFloat(+0.0f));
addLegalFPImmediate(APFloat(+0.0));
addLegalFPImmediate(APFloat(+1.0));
addLegalFPImmediate(APFloat(-0.0));
addLegalFPImmediate(APFloat(-1.0));
addLegalFPImmediate(APFloat(+0.0));
addLegalFPImmediate(TmpFlt);
addLegalFPImmediate(TmpFlt);
addLegalFPImmediate(TmpFlt2);
addLegalFPImmediate(TmpFlt2);
if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
                 : &X86::VR128RegClass);
for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
                 MVT::v4f32, MVT::v8f32, MVT::v16f32,
                 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
                 : &X86::VR128RegClass);
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
                 : &X86::VR128RegClass);
                 : &X86::VR128RegClass);
                 : &X86::VR128RegClass);
                 : &X86::VR128RegClass);
                 : &X86::VR128RegClass);
                 : &X86::VR128RegClass);
for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
                 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
if (Subtarget.hasPCLMUL()) {
for (auto VT : {MVT::i64, MVT::v4i32, MVT::v2i64}) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
setF16Action(MVT::v8f16, Expand);
for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
if (!Subtarget.hasAVX512())
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
if (VT == MVT::v2i64)
  continue;
if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
                 : &X86::VR256RegClass);
                 : &X86::VR256RegClass);
                 : &X86::VR256RegClass);
                 : &X86::VR256RegClass);
                 : &X86::VR256RegClass);
                 : &X86::VR256RegClass);
                 : &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
if (!Subtarget.hasAVX512())
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
if (VT == MVT::v4i64)
  continue;
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                 MVT::v2f64, MVT::v4f64 }) {
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
setF16Action(MVT::v16f16, Expand);
if (Subtarget.hasPCLMUL()) {
for (auto VT : {MVT::v8i32, MVT::v4i64}) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
if (Subtarget.hasGFNI()) {
if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
    Subtarget.hasF16C()) {
for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
if (!Subtarget.hasDQI()) {
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
bool HasBWI = Subtarget.hasBWI();
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
if (Subtarget.hasDQI())
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
if (!Subtarget.hasVLX()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
if (Subtarget.hasDQI() || Subtarget.hasFP16())
if (Subtarget.hasDQI())
if (Subtarget.hasCDI()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : { MVT::v16i32, MVT::v8i64 })
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
                 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
setF16Action(MVT::v32f16, Expand);
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
if (Subtarget.hasVBMI2()) {
for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
if (Subtarget.hasPCLMUL()) {
for (auto VT : {MVT::v16i32, MVT::v8i64}) {
if (Subtarget.hasGFNI()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
for (MVT VT : {MVT::f16, MVT::f32, MVT::f64, MVT::v8f16, MVT::v4f32,
               MVT::v2f64, MVT::v16f16, MVT::v8f32, MVT::v4f64, MVT::v32f16,
               MVT::v16f32, MVT::v8f64})
if (Subtarget.hasDQI()) {
"Unexpected operation action!");
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
if (Subtarget.hasDQI()) {
if (Subtarget.hasCDI()) {
for (auto VT : {MVT::i256, MVT::i512}) {
if (VT == MVT::i512 && !Subtarget.useAVX512Regs())
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
if (Subtarget.hasVPOPCNTDQ()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64})
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
               MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
               MVT::v16i16, MVT::v8i8})
for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
if (Subtarget.hasVLX())
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
               MVT::v4f64, MVT::v2i64, MVT::v2f64})
if (Subtarget.hasVBMI2())
for (MVT VT : {MVT::v32i16, MVT::v64i8})
if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
for (auto VT : { MVT::v16i1, MVT::v32i1 })
for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
                MVT::v16f16, MVT::v8f16}) {
if (Subtarget.hasBITALG()) {
for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
auto setGroup = [&](MVT VT) {
if (Subtarget.useAVX512Regs()) {
setGroup(MVT::v32f16);
if (Subtarget.hasVLX()) {
setGroup(MVT::v8f16);
setGroup(MVT::v16f16);
if (!Subtarget.useSoftFloat() &&
    (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
                 : &X86::VR128RegClass);
addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
setF16Action(VT, Expand);
if (!Subtarget.hasBF16())
if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
    Subtarget.useAVX512Regs()) {
setF16Action(MVT::v32bf16, Expand);
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
if (Subtarget.hasBWI()) {
if (Subtarget.hasFP16()) {
if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
if (!Subtarget.is64Bit()) {
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
if (Subtarget.isTargetWin64()) {
if (Subtarget.is32Bit() &&
    (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
if (Subtarget.isOSWindows()) {
return Subtarget.isTargetMachO() && Subtarget.is64Bit();
return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
    !Subtarget.hasBWI())
bool AssumeSingleUse, bool IgnoreAlignment) {
if (!AssumeSingleUse && !Op.hasOneUse())
if (!IgnoreAlignment && !Subtarget.hasAVX() &&
    !Subtarget.hasSSEUnalignedMem() && Ld->getValueSizeInBits(0) == 128 &&
    Ld->getAlign() < Align(16))
bool AssumeSingleUse) {
assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
return !Ld->isVolatile() ||
if (!Op.hasOneUse())
if (Op.hasOneUse()) {
unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
if ((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
default:
  return false;
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
case X86ISD::MOVHLPS:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
case X86ISD::MOVDDUP:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::VBROADCAST:
case X86ISD::VPERMILPI:
case X86ISD::VPERMILPV:
case X86ISD::VPERM2X128:
case X86ISD::SHUF128:
case X86ISD::VPERMIL2:
case X86ISD::VPERMI:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
case X86ISD::VZEXT_MOVL:
case X86ISD::COMPRESS:
case X86ISD::EXPAND:
default:
  return false;
case X86ISD::PSHUFB:
case X86ISD::VPERMILPV:
case X86ISD::VPERMIL2:
case X86ISD::VPPERM:
case X86ISD::VPERMV:
case X86ISD::VPERMV3:
int ReturnAddrIndex = FuncInfo->getRAIndex();
if (ReturnAddrIndex == 0) {
unsigned SlotSize = RegInfo->getSlotSize();
bool HasSymbolicDisplacement) {
if (!HasSymbolicDisplacement)
return Offset < 16 * 1024 * 1024;
switch (SetCCOpcode) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
switch (SetCCOpcode) {
switch (SetCCOpcode) {
case Intrinsic::x86_aesenc128kl:
case Intrinsic::x86_aesdec128kl:
Info.ptrVal = I.getArgOperand(1);
Info.align = Align(1);
case Intrinsic::x86_aesenc256kl:
case Intrinsic::x86_aesdec256kl:
Info.ptrVal = I.getArgOperand(1);
Info.align = Align(1);
case Intrinsic::x86_aesencwide128kl:
case Intrinsic::x86_aesdecwide128kl:
Info.ptrVal = I.getArgOperand(0);
Info.align = Align(1);
case Intrinsic::x86_aesencwide256kl:
case Intrinsic::x86_aesdecwide256kl:
Info.ptrVal = I.getArgOperand(0);
Info.align = Align(1);
case Intrinsic::x86_cmpccxadd32:
case Intrinsic::x86_cmpccxadd64:
case Intrinsic::x86_atomic_bts:
case Intrinsic::x86_atomic_btc:
case Intrinsic::x86_atomic_btr: {
Info.ptrVal = I.getArgOperand(0);
unsigned Size = I.getType()->getScalarSizeInBits();
case Intrinsic::x86_atomic_bts_rm:
case Intrinsic::x86_atomic_btc_rm:
case Intrinsic::x86_atomic_btr_rm: {
Info.ptrVal = I.getArgOperand(0);
unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
case Intrinsic::x86_aadd32:
case Intrinsic::x86_aadd64:
case Intrinsic::x86_aand32:
case Intrinsic::x86_aand64:
case Intrinsic::x86_aor32:
case Intrinsic::x86_aor64:
case Intrinsic::x86_axor32:
case Intrinsic::x86_axor64:
case Intrinsic::x86_atomic_add_cc:
case Intrinsic::x86_atomic_sub_cc:
case Intrinsic::x86_atomic_or_cc:
case Intrinsic::x86_atomic_and_cc:
case Intrinsic::x86_atomic_xor_cc: {
Info.ptrVal = I.getArgOperand(0);
unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
switch (IntrData->Type) {
Info.ptrVal = I.getArgOperand(0);
ScalarVT = MVT::i16;
ScalarVT = MVT::i32;
Info.align = Align(1);
Info.ptrVal = nullptr;
Info.align = Align(1);
Info.ptrVal = nullptr;
Info.align = Align(1);
bool ForCodeSize) const {
for (const APFloat &FPImm : LegalFPImmediates)
  if (Imm.bitwiseIsEqual(FPImm))
std::optional<unsigned> ByteOffset) const {
auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
N = *N->user_begin();
if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
EVT VT = Load->getValueType(0);
!SDValue(Load, 0).hasOneUse()) {
bool FullWidthUse = false;
bool AllExtractStores = true;
if (Use.getResNo() != 0)
const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
return Inner->getOpcode() == ISD::STORE;
AllExtractStores = false;
FullWidthUse = true;
if (AllExtractStores)
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0 || BitSize > 64)
return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
if (VT.isVector() && Subtarget.hasAVX512())
unsigned TZeros = ShiftedMulC == 2 ? 0 : ShiftedMulC.countr_zero();
if ((ShiftedMulC - 1).isPowerOf2() || (ShiftedMulC + 1).isPowerOf2())
(EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
       (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
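// Worked example (illustrative): MulC == 3 gives (MulC - 1) == 2, a power of
// two, so 'x * 3' can become an LEA; MulC == 7 gives (MulC + 1) == 8, so
// 'x * 7' can become '(x << 3) - x'.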
unsigned Index) const {
return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
       Subtarget.hasBitScanPassThrough() ||
       (!Ty->isVectorTy() &&
        Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
       Subtarget.hasBitScanPassThrough();
return !Subtarget.hasSSE2() || VT == MVT::f80;
return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
       (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
return Subtarget.hasFastLZCNT();
return Y.getValueType().isScalarInteger();
EVT VT = Y.getValueType();
if (!Subtarget.hasBMI())
if (VT != MVT::i32 && VT != MVT::i64)
if (VT == MVT::v4i32)
return Subtarget.hasSSE2();
return X.getValueType().isScalarInteger();
unsigned OldShiftOpcode, unsigned NewShiftOpcode,
X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
if (X.getValueType().isScalarInteger())
if (Subtarget.hasAVX2())
EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
bool PreferRotate = false;
PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
PreferRotate = Subtarget.hasBMI2();
if (!PreferRotate) {
PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
assert(AndMask.has_value() && "Null andmask when querying about shift+and");
if (PreferRotate && MayTransformRotate)
if (PreferRotate || !MayTransformRotate || VT.isVector())
const Value *Rhs) const {
if (BaseCost >= 0 && Subtarget.hasCCMP())
if (BaseCost >= 0 && Opc == Instruction::And &&
if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
return {-1, -1, -1};
3816 N->getOperand(0).getOpcode() ==
ISD::SRL) ||
3818 N->getOperand(0).getOpcode() ==
ISD::SHL)) &&
3819 "Expected shift-shift mask");
3821 EVT VT =
N->getValueType(0);
3822 if ((Subtarget.hasFastVectorShiftMasks() && VT.
isVector()) ||
3823 (Subtarget.hasFastScalarShiftMasks() && !VT.
isVector())) {
3827 return N->getOperand(1) ==
N->getOperand(0).getOperand(1);
3833 EVT VT =
Y.getValueType();
3839 unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
3847 !Subtarget.isOSWindows())
3897 [CmpVal](
int M) { return isUndefOrEqual(M, CmpVal); });
3913 unsigned NumElts = Mask.size();
3919 unsigned NumElts = Mask.size();
3925 return (Val >=
Low && Val <
Hi);
3968 unsigned NumElts = Mask.size();
3979 unsigned Size,
int Low,
int Step = 1) {
3980 for (
unsigned i = Pos, e = Pos +
Size; i != e; ++i,
Low += Step)
3992 for (
unsigned i = Pos, e = Pos +
Size; i != e; ++i,
Low += Step)
4008 unsigned NumElts = Mask.size();
4027 WidenedMask.
assign(Mask.size() / 2, 0);
4028 for (
int i = 0,
Size = Mask.size(); i <
Size; i += 2) {
4030 int M1 = Mask[i + 1];
4041 WidenedMask[i / 2] =
M1 / 2;
4045 WidenedMask[i / 2] =
M0 / 2;
4062 WidenedMask[i / 2] =
M0 / 2;
4069 assert(WidenedMask.
size() == Mask.size() / 2 &&
4070 "Incorrect size of mask after widening the elements!");
4076 const APInt &Zeroable,
4083 assert(!Zeroable.
isZero() &&
"V2's non-undef elements are used?!");
4084 for (
int i = 0,
Size = Mask.size(); i !=
Size; ++i)
4100 unsigned NumSrcElts = Mask.size();
4101 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
4102 "Illegal shuffle scale factor");
4105 if (NumDstElts >= NumSrcElts) {
4106 int Scale = NumDstElts / NumSrcElts;
4114 while (ScaledMask.
size() > NumDstElts) {
4118 ScaledMask = std::move(WidenedMask);
4135 unsigned SrcSizeInBits,
unsigned DstSizeInBits) {
4136 assert(DstMask.
empty() &&
"Expected an empty shuffle mas");
4137 assert((DstSizeInBits % SrcSizeInBits) == 0 &&
"Illegal shuffle scale");
4138 unsigned Scale = DstSizeInBits / SrcSizeInBits;
4139 unsigned NumSrcElts = SrcMask.
size();
4141 for (
int &M : DstMask) {
4144 M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
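// Worked example (illustrative): with NumSrcElts == 4 and Scale == 2, a mask
// entry M == 5 (element 1 of the second input) becomes 1 + 1*2*4 == 9, i.e.
// the second input's elements are rebased from NumSrcElts to
// Scale * NumSrcElts.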
4158 const SDLoc &dl,
bool IsMask =
false) {
4163 MVT ConstVecVT = VT;
4172 for (
unsigned i = 0; i < NumElts; ++i) {
4173 bool IsUndef = Values[i] < 0 && IsMask;
4176 Ops.push_back(OpNode);
4190 "Unequal constant and undef arrays");
4194 MVT ConstVecVT = VT;
4204 for (
unsigned i = 0, e = Bits.size(); i != e; ++i) {
4209 const APInt &V = Bits[i];
4213 Ops.push_back(DAG.
getConstant(V.extractBits(32, 32), dl, EltVT));
4234 "Unexpected vector type");
4248 "Unexpected vector type");
4262 LHS.getValueType() !=
RHS.getValueType() ||
4263 LHS.getOperand(0) !=
RHS.getOperand(0))
4267 if (Src.getValueSizeInBits() != (
LHS.getValueSizeInBits() * 2))
4270 unsigned NumElts =
LHS.getValueType().getVectorNumElements();
4271 if ((
LHS.getConstantOperandAPInt(1) == 0 &&
4272 RHS.getConstantOperandAPInt(1) == NumElts) ||
4273 (AllowCommute &&
RHS.getConstantOperandAPInt(1) == 0 &&
4274 LHS.getConstantOperandAPInt(1) == NumElts))
4281 const SDLoc &dl,
unsigned vectorWidth) {
4284 unsigned ResultNumElts =
4289 "Illegal subvector extraction");
4292 unsigned ElemsPerChunk = vectorWidth / ElVT.
getSizeInBits();
4297 IdxVal &= ~(ElemsPerChunk - 1);
4302 Vec->
ops().slice(IdxVal, ElemsPerChunk));
4323 "Unexpected vector size!");
4336 unsigned vectorWidth) {
4337 assert((vectorWidth == 128 || vectorWidth == 256) &&
4338 "Unsupported vector width");
4350 IdxVal &= ~(ElemsPerChunk - 1);
4374 "Unsupported vector widening type");
4394 const SDLoc &dl,
unsigned WideSizeInBits) {
4397 "Unsupported vector widening type");
4401 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4409 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4410 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4420 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4428 assert(
Ops.empty() &&
"Expected an empty ops vector");
4431 Ops.append(
N->op_begin(),
N->op_end());
4438 const APInt &Idx =
N->getConstantOperandAPInt(2);
4439 EVT VT = Src.getValueType();
4440 EVT SubVT =
Sub.getValueType();
4444 if (Idx == 0 && Src.isUndef()) {
4452 Src.getOperand(1).getValueType() == SubVT &&
4476 if (Src.isUndef()) {
4486 EVT VT =
N->getValueType(0);
4488 uint64_t Idx =
N->getConstantOperandVal(1);
4495 (VT.
getSizeInBits() % SrcOps[0].getValueSizeInBits()) == 0 &&
4496 (Idx % SrcOps[0].getValueType().getVectorNumElements()) == 0) {
4497 unsigned SubIdx = Idx / SrcOps[0].getValueType().getVectorNumElements();
4498 unsigned NumSubs = VT.
getSizeInBits() / SrcOps[0].getValueSizeInBits();
4499 Ops.append(SrcOps.
begin() + SubIdx, SrcOps.
begin() + SubIdx + NumSubs);
4504 assert(
Ops.empty() &&
"Expected an empty ops vector");
4516 unsigned NumSubOps = SubOps.
size();
4517 unsigned HalfNumSubOps = NumSubOps / 2;
4518 assert((NumSubOps % 2) == 0 &&
"Unexpected number of subvectors");
4538 EVT VT =
Op.getValueType();
4541 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4542 "Can't split odd sized vector");
4546 assert((SubOps.
size() % 2) == 0 &&
"Can't split odd sized vector concat");
4547 unsigned HalfOps = SubOps.
size() / 2;
4553 return std::make_pair(
Lo,
Hi);
4560 return std::make_pair(
Lo,
Lo);
4563 return std::make_pair(
Lo,
Hi);
4568 unsigned NumOps =
Op.getNumOperands();
4569 EVT VT =
Op.getValueType();
4574 for (
unsigned I = 0;
I !=
NumOps; ++
I) {
4576 if (!
SrcOp.getValueType().isVector()) {
4586 DAG.
getNode(
Op.getOpcode(), dl, LoVT, LoOps),
4587 DAG.
getNode(
Op.getOpcode(), dl, HiVT, HiOps));
4596 [[maybe_unused]]
EVT VT =
Op.getValueType();
assert((Op.getOperand(0).getValueType().is256BitVector() ||
        Op.getOperand(0).getValueType().is512BitVector()) &&
assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
[[maybe_unused]] EVT VT = Op.getValueType();
assert(Op.getOperand(0).getValueType() == VT &&
       Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
template <typename F>
F Builder, bool CheckBWI = true,
bool AllowAVX512 = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
} else if (Subtarget.hasAVX2()) {
return Builder(DAG, DL, Ops);
for (unsigned i = 0; i != NumSubs; ++i) {
EVT OpVT = Op.getValueType();
unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                        HasAnyUndefs, OpEltSizeInBits) &&
    !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
MVT OpVT = Op.getSimpleValueType();
assert(OpVT == VT && "Vector type mismatch");
if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
unsigned IdxVal = Op.getConstantOperandVal(2);
if (IdxVal == 0 && Vec.isUndef())
MVT OpVT = Op.getSimpleValueType();
assert(IdxVal + SubVecNumElems <= NumElems &&
       "Unexpected index value in INSERT_SUBVECTOR");
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Undef, SubVec, ZeroIdx);
assert(IdxVal != 0 && "Unexpected index");
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
assert(IdxVal != 0 && "Unexpected index");
[](SDValue V) { return V.isUndef(); })) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
if (ShiftRight != 0)
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
if (IdxVal + SubVecNumElems == NumElems) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
if (SubVecNumElems * 2 == NumElems) {
Undef, Vec, ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
unsigned LowShift = NumElems - IdxVal;
unsigned HighShift = IdxVal + SubVecNumElems;
"Expected a 128/256/512-bit vector type");
return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
if (VT != SrcOp.getSimpleValueType())
if (ShiftAmt >= ElementType.getSizeInBits()) {
if (Opc == X86ISD::VSRAI)
ShiftAmt = ElementType.getSizeInBits() - 1;
(Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) &&
"Unknown target vector shift-by-constant node");
"Illegal vector splat index");
if (ShAmtIdx != 0) {
bool IsMasked = false;
ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
{ShAmt.getOperand(1), Mask}))) {
if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
                            ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
EVT InVT = In.getValueType();
"Expected VTs to be the same size!");
InVT = In.getValueType();
bool Lo, bool Unary) {
"Illegal vector type to unpack");
assert(Mask.empty() && "Expected an empty shuffle mask vector");
for (int i = 0; i < NumElts; ++i) {
unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
int Pos = (i % NumEltsInLane) / 2 + LaneStart;
Pos += (Unary ? 0 : NumElts * (i % 2));
Pos += (Lo ? 0 : NumEltsInLane / 2);
Mask.push_back(Pos);
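// Worked example (illustrative): v4i32 with Lo == true and Unary == false
// produces the unpacklo mask <0,4,1,5>, interleaving the low halves of both
// inputs.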
5161 assert(Mask.empty() &&
"Expected an empty shuffle mask vector");
5163 for (
int i = 0; i < NumElts; ++i) {
5165 Pos += (
Lo ? 0 : NumElts / 2);
5166 Mask.push_back(Pos);
5176 for (
int I = 0, NumElts = Mask.size();
I != NumElts; ++
I) {
5180 SDValue V = (M < NumElts) ? V1 : V2;
5183 Ops[
I] = V.getOperand(M % NumElts);
5212 bool PackHiHalf =
false) {
5213 MVT OpVT =
LHS.getSimpleValueType();
5215 bool UsePackUS = Subtarget.
hasSSE41() || EltSizeInBits == 8;
5216 assert(OpVT ==
RHS.getSimpleValueType() &&
5219 "Unexpected PACK operand types");
5220 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
5221 "Unexpected PACK result type");
5224 if (EltSizeInBits == 32) {
5226 int Offset = PackHiHalf ? 1 : 0;
5228 for (
int I = 0;
I != NumElts;
I += 4) {
5286 for (
int i = 0; i != NumElems; ++i)
5288 MaskVec[i] = (i == Idx) ? NumElems : i;
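// Worked example (illustrative): NumElems == 4 with Idx == 2 yields the mask
// <0,1,4,3>, replacing element 2 with element 0 of the second operand
// (typically a zero vector).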
if (Ptr.getOpcode() == X86ISD::Wrapper ||
assert(LD && "Unexpected null LoadSDNode");
EVT CondVT = Cond.getValueType();
return N->getOpcode() == ISD::VSELECT && Subtarget.hasAVX512() &&
bool AllowWholeUndefs = true,
bool AllowPartialUndefs = false) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
EVT VT = Op.getValueType();
unsigned NumElts = SizeInBits / EltSizeInBits;
if ((SizeInBits % EltSizeInBits) != 0)
unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
       "Constant bit sizes don't match");
bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
if (NumSrcElts == NumElts) {
UndefElts = UndefSrcElts;
EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
APInt UndefBits(SizeInBits, 0);
APInt MaskBits(SizeInBits, 0);
for (unsigned i = 0; i != NumSrcElts; ++i) {
unsigned BitOffset = i * SrcEltSizeInBits;
if (UndefSrcElts[i])
  UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
MaskBits.insertBits(SrcEltBits[i], BitOffset);
UndefElts = APInt(NumElts, 0);
for (unsigned i = 0; i != NumElts; ++i) {
unsigned BitOffset = i * EltSizeInBits;
if (!AllowWholeUndefs)
if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
unsigned UndefBitIndex) {
Undefs.setBit(UndefBitIndex);
Mask = CInt->getValue();
Mask = CFP->getValueAPF().bitcastToAPInt();
Type *Ty = CDS->getType();
Type *EltTy = CDS->getElementType();
if (!IsInteger && !IsFP)
for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
return CastBitData(UndefSrcElts, SrcEltBits);
return CastBitData(UndefSrcElts, SrcEltBits);
APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
return CastBitData(UndefSrcElts, SrcEltBits);
if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
return CastBitData(UndefSrcElts, SrcEltBits);
if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
if ((SizeInBits % SrcEltSizeInBits) != 0)
APInt UndefSrcElts(NumSrcElts, 0);
for (unsigned i = 0; i != NumSrcElts; ++i)
return CastBitData(UndefSrcElts, SrcEltBits);
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
SDValue Ptr = MemIntr->getBasePtr();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
if (UndefSrcElts[0])
  UndefSrcElts.setBits(0, NumSrcElts);
if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
  SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
return CastBitData(UndefSrcElts, SrcEltBits);
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
SDValue Ptr = MemIntr->getBasePtr();
unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
    (SizeInBits % SubVecSizeInBits) != 0)
unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
APInt UndefSubElts(NumSubElts, 0);
APInt(CstEltSizeInBits, 0));
for (unsigned i = 0; i != NumSubElts; ++i) {
for (unsigned j = 1; j != NumSubVecs; ++j)
  SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
return CastBitData(UndefSubElts, SubEltBits);
if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
APInt UndefSrcElts(NumSrcElts, 0);
const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
return CastBitData(UndefSrcElts, SrcEltBits);
bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
APInt UndefSrcElts, UndefSubElts;
UndefSubElts, EltSubBits,
AllowWholeUndefs && AllowUndefs,
AllowPartialUndefs && AllowUndefs) &&
UndefSrcElts, EltSrcBits,
AllowWholeUndefs && AllowUndefs,
AllowPartialUndefs && AllowUndefs)) {
unsigned BaseIdx = Op.getConstantOperandVal(2);
UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
  EltSrcBits[BaseIdx + i] = EltSubBits[i];
return CastBitData(UndefSrcElts, EltSrcBits);
EltBits, AllowWholeUndefs,
AllowPartialUndefs)) {
EVT SrcVT = Op.getOperand(0).getValueType();
unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
unsigned BaseIdx = BaseOfs / EltSizeInBits;
(BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
if ((BaseIdx + NumSubElts) != NumSrcElts)
  EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
APInt UndefElts0, UndefElts1;
UndefElts0, EltBits0, AllowWholeUndefs,
AllowPartialUndefs))
UndefElts1, EltBits1, AllowWholeUndefs,
AllowPartialUndefs))
for (int i = 0; i != (int)NumElts; ++i) {
} else if (M < (int)NumElts) {
if (UndefElts1[M - NumElts])
EltBits.push_back(EltBits1[M - NumElts]);
Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
true, AllowPartialUndefs)) {
int SplatIndex = -1;
for (int i = 0, e = EltBits.size(); i != e; ++i) {
if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
if (0 <= SplatIndex) {
SplatVal = EltBits[SplatIndex];
case ::llvm::RoundingMode::TowardPositive:
  return X86::rmUpward;
unsigned MaskEltSizeInBits,
for (const APInt &Elt : EltBits)
bool IsPow2OrUndef = true;
for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
  IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
return IsPow2OrUndef;
EVT VT = V.getValueType();
return V.getOperand(0);
(isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
if (V.getOpcode() == X86ISD::PCMPGT &&
V.getOperand(0).hasOneUse()) {
V.getScalarValueSizeInBits(), UndefElts,
bool MinSigned = false;
for (APInt &Elt : EltBits) {
MinSigned |= Elt.isMinSignedValue();
MVT VT = V.getSimpleValueType();
return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
for (SDValue &CatOp : CatOps) {
CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
bool Unary, unsigned NumStages = 1) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
unsigned Offset = Unary ? 0 : NumElts;
unsigned Repetitions = 1u << (NumStages - 1);
assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
  Mask.push_back(Elt + (Lane * NumEltsPerLane));
for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
  Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
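// Illustrative result (assuming a per-stage element step of 2): a one-stage
// binary pack of a v16i8 lane builds <0,2,...,14,16,18,...,30>, i.e. the even
// elements of both inputs, viewing the PACK node as a truncating shuffle.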
int NumInnerElts = NumElts / 2;
int NumEltsPerLane = NumElts / NumLanes;
int NumInnerEltsPerLane = NumInnerElts / NumLanes;
for (int Lane = 0; Lane != NumLanes; ++Lane) {
for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
int OuterIdx = (Lane * NumEltsPerLane) + Elt;
int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
if (DemandedElts[OuterIdx])
  DemandedLHS.setBit(InnerIdx);
if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
  DemandedRHS.setBit(InnerIdx);
DemandedLHS, DemandedRHS);
DemandedLHS |= DemandedLHS << 1;
DemandedRHS |= DemandedRHS << 1;
MVT VT = N.getSimpleValueType();
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
bool IsFakeUnary = false;
switch (N.getOpcode()) {
case X86ISD::BLENDI:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::INSERTPS:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::EXTRQI:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
int BitLen = N.getConstantOperandVal(1);
int BitIdx = N.getConstantOperandVal(2);
case X86ISD::INSERTQI:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
int BitLen = N.getConstantOperandVal(2);
int BitIdx = N.getConstantOperandVal(3);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::UNPCKH:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::UNPCKL:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::MOVHLPS:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::MOVLHPS:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::VALIGN:
"Only 32-bit and 64-bit elements are supported!");
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
Ops.push_back(N.getOperand(1));
Ops.push_back(N.getOperand(0));
case X86ISD::PALIGNR:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
Ops.push_back(N.getOperand(1));
Ops.push_back(N.getOperand(0));
case X86ISD::VSHLDQ:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::VSRLDQ:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::PSHUFHW:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::PSHUFLW:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
case X86ISD::VZEXT_MOVL:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
case X86ISD::VBROADCAST:
if (N.getOperand(0).getValueType() == VT) {
case X86ISD::VPERMILPV: {
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
SDValue MaskNode = N.getOperand(1);
case X86ISD::PSHUFB: {
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
SDValue MaskNode = N.getOperand(1);
case X86ISD::VPERMI:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
case X86ISD::VPERM2X128:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::SHUF128:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
case X86ISD::MOVSLDUP:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
case X86ISD::MOVSHDUP:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
case X86ISD::MOVDDUP:
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
case X86ISD::VPERMIL2: {
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
SDValue MaskNode = N.getOperand(2);
SDValue CtrlNode = N.getOperand(3);
unsigned CtrlImm = CtrlOp->getZExtValue();
case X86ISD::VPPERM: {
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
SDValue MaskNode = N.getOperand(2);
case X86ISD::VPERMV: {
assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
Ops.push_back(N.getOperand(1));
SDValue MaskNode = N.getOperand(0);
case X86ISD::VPERMV3: {
assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(2));
SDValue MaskNode = N.getOperand(1);
case X86ISD::COMPRESS: {
SDValue PassThru = N.getOperand(1);
"Illegal compression mask");
for (unsigned I = 0; I != NumElems; ++I) {
while (Mask.size() != NumElems) {
Mask.push_back(NumElems + Mask.size());
Ops.push_back(CmpVec);
Ops.push_back(PassThru);
case X86ISD::EXPAND: {
SDValue PassThru = N.getOperand(1);
"Illegal expansion mask");
unsigned ExpIndex = 0;
for (unsigned I = 0; I != NumElems; ++I) {
Mask.push_back(I + NumElems);
Mask.push_back(ExpIndex++);
Ops.push_back(ExpVec);
Ops.push_back(PassThru);
if (!AllowSentinelZero && isAnyZero(Mask))
if (M >= (int)Mask.size())
Ops.push_back(N.getOperand(0));
if (!IsUnary || IsFakeUnary)
  Ops.push_back(N.getOperand(1));
int Size = Mask.size();
int ScalarSizeInBits = VectorSizeInBits / Size;
assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
for (int i = 0; i < Size; ++i) {
if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
if ((Size % V.getNumOperands()) == 0) {
int Scale = Size / V->getNumOperands();
APInt Val = Cst->getAPIntValue();
Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
APInt Val = Cst->getValueAPF().bitcastToAPInt();
Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
if ((V.getNumOperands() % Size) == 0) {
int Scale = V->getNumOperands() / Size;
bool AllUndef = true;
bool AllZero = true;
for (int j = 0; j < Scale; ++j) {
SDValue Op = V.getOperand((M * Scale) + j);
AllUndef &= Op.isUndef();
MVT VT = N.getSimpleValueType();
int Size = Mask.size();
"Illegal split of shuffle value type");
APInt UndefSrcElts[2];
bool IsSrcConstant[2] = {
SrcEltBits[0], true,
SrcEltBits[1], true,
for (int i = 0; i < Size; ++i) {
unsigned SrcIdx = M / Size;
(Size % V.getValueType().getVectorNumElements()) == 0) {
int Scale = Size / V.getValueType().getVectorNumElements();
int Idx = M / Scale;
int Idx = V.getConstantOperandVal(2);
int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
if (M < Idx || (Idx + NumSubElts) <= M)
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
else if (SrcEltBits[SrcIdx][M] == 0)
"Different mask size from vector size!");
const APInt &KnownUndef,
const APInt &KnownZero,
bool ResolveKnownZeros = true) {
unsigned NumElts = Mask.size();
KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
for (unsigned i = 0; i != NumElts; ++i) {
else if (ResolveKnownZeros && KnownZero[i])
unsigned NumElts = Mask.size();
for (unsigned i = 0; i != NumElts; ++i) {
EVT CondVT = Cond.getValueType();
for (int i = 0; i != (int)NumElts; ++i) {
if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
    (IsBLENDV && EltBits[i].isNonNegative()))
bool ResolveKnownElts);
bool ResolveKnownElts) {
MVT VT = N.getSimpleValueType();
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
unsigned NumSizeInBytes = NumSizeInBits / 8;
unsigned NumBytesPerElt = NumBitsPerElt / 8;
unsigned Opcode = N.getOpcode();
Mask.append(ShuffleMask.begin(), ShuffleMask.end());
Ops.push_back(N.getOperand(0));
Ops.push_back(N.getOperand(1));
case X86ISD::ANDNP: {
bool IsAndN = (X86ISD::ANDNP == Opcode);
uint64_t ZeroMask = IsAndN ? 255 : 0;
assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6548 for (
int i = 0, e = (
int)EltBits.
size(); i != e; ++i) {
6549 const APInt &ByteBits = EltBits[i];
6550 if (ByteBits != 0 && ByteBits != 255)
6554 Ops.push_back(IsAndN ? N1 : N0);
6575 size_t MaskSize = std::max(SrcMask0.
size(), SrcMask1.
size());
6579 for (
int i = 0; i != (int)MaskSize; ++i) {
6589 Mask.push_back(i + MaskSize);
6590 else if (MaskSize == NumElts && !DemandedElts[i])
6595 Ops.push_back(
N.getOperand(0));
6596 Ops.push_back(
N.getOperand(1));
6601 unsigned NumSubElts =
N.getOperand(0).getValueType().getVectorNumElements();
6602 if (NumBitsPerElt == 64) {
6603 for (
unsigned I = 0,
E =
N.getNumOperands();
I !=
E; ++
I) {
6604 for (
unsigned M = 0; M != NumSubElts; ++M)
6605 Mask.push_back((
I * NumElts) + M);
6606 Ops.push_back(
N.getOperand(
I));
6615 EVT SubVT =
Sub.getValueType();
6617 uint64_t InsertIdx =
N.getConstantOperandVal(2);
6619 if (DemandedElts.
extractBits(NumSubElts, InsertIdx) == 0) {
6620 Mask.resize(NumElts);
6621 std::iota(Mask.begin(), Mask.end(), 0);
6627 if (
Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6629 Src.getOperand(0).isUndef() &&
6630 Src.getOperand(1).getValueType() == SubVT &&
6631 Src.getConstantOperandVal(2) == 0 &&
6632 (NumBitsPerElt == 64 || Src.getOperand(1) ==
Sub) &&
6634 Mask.resize(NumElts);
6635 std::iota(Mask.begin(), Mask.begin() + NumSubElts, 0);
6636 std::iota(Mask.begin() + NumSubElts, Mask.end(), NumElts);
6637 Ops.push_back(Src.getOperand(1));
6641 if (!
N->isOnlyUserOf(
Sub.getNode()))
6656 unsigned NumSubSrcSrcElts =
6658 unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts);
6659 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 &&
6660 "Subvector valuetype mismatch");
6661 InsertIdx *= (MaxElts / NumElts);
6662 ExtractIdx *= (MaxElts / NumSubSrcSrcElts);
6663 NumSubElts *= (MaxElts / NumElts);
6664 bool SrcIsUndef = Src.isUndef();
6665 for (
int i = 0; i != (int)MaxElts; ++i)
6667 for (
int i = 0; i != (int)NumSubElts; ++i)
6668 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6671 Ops.push_back(SubSrcSrc);
6678 Depth + 1, ResolveKnownElts))
6688 if (SubMask.
size() != NumSubElts) {
6689 assert(((SubMask.
size() % NumSubElts) == 0 ||
6690 (NumSubElts % SubMask.
size()) == 0) &&
6691 "Illegal submask scale");
6692 if ((NumSubElts % SubMask.
size()) == 0) {
6693 int Scale = NumSubElts / SubMask.
size();
6696 SubMask = ScaledSubMask;
6698 int Scale = SubMask.
size() / NumSubElts;
6699 NumSubElts = SubMask.
size();
6709 for (
int i = 0; i != (int)NumElts; ++i)
6711 for (
int i = 0; i != (int)NumSubElts; ++i) {
6714 int InputIdx = M / NumSubElts;
6715 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6717 Mask[i + InsertIdx] = M;
case X86ISD::PINSRB:
case X86ISD::PINSRW:
unsigned DstIdx = 0;
    N.getConstantOperandAPInt(2).uge(NumElts))
DstIdx = N.getConstantOperandVal(2);
Ops.push_back(N.getOperand(0));
for (unsigned i = 0; i != NumElts; ++i)
if ((MinBitsPerElt % 8) != 0)
unsigned DstByte = DstIdx * NumBytesPerElt;
Ops.push_back(SrcVec);
Ops.push_back(SrcVec);
Ops.push_back(N.getOperand(0));
for (int i = 0; i != (int)NumSizeInBytes; ++i)
  Mask.push_back(NumSizeInBytes + i);
unsigned MinBytesPerElts = MinBitsPerElt / 8;
MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
for (unsigned i = 0; i != MinBytesPerElts; ++i)
  Mask[DstByte + i] = SrcByte + i;
for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
case X86ISD::PACKSS:
case X86ISD::PACKUS: {
  "Unexpected input value type");
  APInt EltsLHS, EltsRHS;
  bool Offset0 = false, Offset1 = false;
  if (Opcode == X86ISD::PACKSS) {
  bool IsUnary = (N0 == N1);
  if (Offset0 || Offset1) {
    if ((Offset0 && isInRange(M, 0, NumElts)) ||
        (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
case X86ISD::BLENDV: {
  Ops.push_back(N.getOperand(1));
  Ops.push_back(N.getOperand(2));
case X86ISD::VTRUNC: {
  EVT SrcVT = Src.getValueType();
  unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
  assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
  for (unsigned i = 0; i != NumSrcElts; ++i)
    Mask.push_back(i * Scale);
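  // VTRUNC keeps the low NumBitsPerElt bits of each wider source element,
  // i.e. it selects every Scale'th sub-element: mask = {0, Scale, 2*Scale, ...}.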
for (unsigned I = 0; I != NumElts; ++I)
  if (DemandedElts[I] && !UndefElts[I] &&
      (EltBits[I].urem(8) != 0 || EltBits[I].uge(NumBitsPerElt)))
Ops.push_back(N.getOperand(0));
for (unsigned I = 0; I != NumElts; ++I) {
  if (!DemandedElts[I] || UndefElts[I])
  unsigned ByteShift = EltBits[I].getZExtValue() / 8;
  unsigned Lo = I * NumBytesPerElt;
  unsigned Hi = Lo + NumBytesPerElt;
  std::iota(Mask.begin() + Lo + ByteShift, Mask.begin() + Hi, Lo);
  std::iota(Mask.begin() + Lo, Mask.begin() + Hi - ByteShift,
case X86ISD::VSRLI: {
  uint64_t ShiftVal = N.getConstantOperandVal(1);
  if (NumBitsPerElt <= ShiftVal) {
  if ((ShiftVal % 8) != 0)
  Ops.push_back(N.getOperand(0));
  if (X86ISD::VSHLI == Opcode) {
    for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
      for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
        Mask[i + j] = i + j - ByteShift;
    for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
      for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
        Mask[i + j - ByteShift] = i + j;
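  // A whole-byte VSHLI/VSRLI is a byte shuffle within each element: for a
  // left shift, result byte j takes source byte j - ByteShift and the low
  // ByteShift bytes stay zero (unset mask entries); a right shift mirrors
  // this at the high end of the element.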
for (unsigned I = 0; I != NumElts; ++I)
  if (DemandedElts[I] && !UndefElts[I] &&
      (EltBits[I].urem(NumBitsPerElt) % 8) != 0)
Ops.push_back(N.getOperand(0));
for (unsigned I = 0; I != NumElts; ++I) {
  if (!DemandedElts[I] || UndefElts[I]) {
  int Offset = EltBits[I].urem(NumBitsPerElt) / 8;
  int BaseIdx = I * NumBytesPerElt;
  for (int J = 0; J != (int)NumBytesPerElt; ++J) {
    Mask.push_back(BaseIdx + ((Offset + J) % NumBytesPerElt));
case X86ISD::VROTLI:
case X86ISD::VROTRI: {
  uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
  if ((RotateVal % 8) != 0)
  Ops.push_back(N.getOperand(0));
  int Offset = RotateVal / 8;
  for (int i = 0; i != (int)NumElts; ++i) {
    int BaseIdx = i * NumBytesPerElt;
    for (int j = 0; j != (int)NumBytesPerElt; ++j) {
      Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
case X86ISD::VBROADCAST: {
  if (!Src.getSimpleValueType().isVector()) {
    Src.getOperand(0).getValueType().getScalarType() !=
    Src = Src.getOperand(0);
  Mask.append(NumElts, 0);
EVT SrcVT = Src.getValueType();
    (NumBitsPerSrcElt % 8) != 0)
APInt DemandedSrcElts =
assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
for (unsigned I = 0; I != NumElts; ++I)
  Mask.append(Scale, I);
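// For the extension cases the faux shuffle mask simply repeats each source
// index Scale times (Mask.append(Scale, I)), one entry per widened slot:
// {0,0,...,1,1,...}.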
EVT SrcVT = Src.getValueType();
int MaskWidth = Mask.size();
for (int i = 0, e = Inputs.size(); i < e; ++i) {
  int lo = UsedInputs.size() * MaskWidth;
  int hi = lo + MaskWidth;
  if ((lo <= M) && (M < hi))
  if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
  bool IsRepeat = false;
  for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
    if (UsedInputs[j] != Inputs[i])
    M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
Inputs = std::move(UsedInputs);
bool ResolveKnownElts) {
EVT VT = Op.getValueType();
if (ResolveKnownElts)
    ResolveKnownElts)) {
bool ResolveKnownElts) {
APInt KnownUndef, KnownZero;
    KnownZero, DAG, Depth, ResolveKnownElts);
bool ResolveKnownElts = true) {
EVT VT = Op.getValueType();
unsigned NumElts = Op.getValueType().getVectorNumElements();
assert((Opcode == X86ISD::VBROADCAST_LOAD ||
        Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
       "Unknown broadcast load type");
    Opcode, DL, Tys, Ops, MemVT,
EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
int Elt = SV->getMaskElt(Index);
int Elt = ShuffleMask[Index];
assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
uint64_t SubIdx = Op.getConstantOperandVal(2);
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
EVT SubVT = Op.getOperand(0).getValueType();
uint64_t SubIdx = Index / NumSubElts;
uint64_t SubElt = Index % NumSubElts;
uint64_t SrcIdx = Op.getConstantOperandVal(1);
EVT SrcVT = Src.getValueType();
if (Op.getConstantOperandAPInt(2) == Index)
  return Op.getOperand(1);
return (Index == 0) ? Op.getOperand(0)
return Op.getOperand(Index);
const APInt &NonZeroMask,
unsigned NumNonZero, unsigned NumZero,
MVT VT = Op.getSimpleValueType();
    ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
    "Illegal vector insertion");
for (unsigned i = 0; i < NumElts; ++i) {
  bool IsNonZero = NonZeroMask[i];
  if (NumZero || 0 != i)
  assert(0 == i && "Expected insertion into zero-index");
const APInt &NonZeroMask,
unsigned NumNonZero, unsigned NumZero,
if (NumNonZero > 8 && !Subtarget.hasSSE41())
for (unsigned I = 0; I != 4; ++I) {
  if (!NonZeroMask[I])
assert(V && "Failed to fold v16i8 vector to zero");
V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
  bool ThisIsNonZero = NonZeroMask[i];
  bool NextIsNonZero = NonZeroMask[i + 1];
  if (!ThisIsNonZero && !NextIsNonZero)
  if (ThisIsNonZero) {
    if (NumZero || NextIsNonZero)
  if (NextIsNonZero) {
    if (i == 0 && NumZero)
  if (i != 0 || NumZero)
const APInt &NonZeroMask,
unsigned NumNonZero, unsigned NumZero,
if (NumNonZero > 4 && !Subtarget.hasSSE41())
if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
    Op.getOperand(0) == Op.getOperand(2) &&
    Op.getOperand(1) == Op.getOperand(3) &&
    Op.getOperand(0) != Op.getOperand(1)) {
  MVT VT = Op.getSimpleValueType();
std::bitset<4> Zeroable, Undefs;
for (int i = 0; i < 4; ++i) {
assert(Zeroable.size() - Zeroable.count() > 1 &&
       "We expect at least two non-zero elements!");
unsigned FirstNonZeroIdx;
for (unsigned i = 0; i < 4; ++i) {
  if (!FirstNonZero.getNode()) {
    FirstNonZeroIdx = i;
assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
unsigned EltMaskIdx, EltIdx;
for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
  if (Zeroable[EltIdx]) {
    Mask[EltIdx] = EltIdx + 4;
  Elt = Op->getOperand(EltIdx);
  if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
  Mask[EltIdx] = EltIdx;
SDValue VZeroOrUndef = (Zeroable == Undefs)
if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
bool CanFold = true;
for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
assert(V1.getNode() && "Expected at least two non-zero elements!");
unsigned ZMask = Zeroable.to_ulong();
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
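// The INSERTPS immediate packs three fields: bits [7:6] select the source
// element (EltMaskIdx), bits [5:4] the destination slot (EltIdx), and
// bits [3:0] the set of destination elements to zero (ZMask).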
MVT ShVT = MVT::v16i8;
unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
assert(NumBits % 8 == 0 && "Only support byte sized shifts");
SDValue Ptr = LD->getBasePtr();
EVT PVT = LD->getValueType(0);
if (PVT != MVT::i32 && PVT != MVT::f32)
FI = FINode->getIndex();
SDValue Chain = LD->getChain();
if (!InferredAlign || *InferredAlign < RequiredAlign) {
int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
int EltNo = (Offset - StartOffset) >> 2;
LD->getPointerInfo().getWithOffset(StartOffset));
if (!BaseLd->isSimple())
uint64_t Amt = AmtC->getZExtValue();
ByteOffset += Amt / 8;
if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
  uint64_t Idx = IdxC->getZExtValue();
  ByteOffset += Idx * (SrcSizeInBits / 8);
bool IsAfterLegalize,
unsigned Depth = 0) {
unsigned NumElems = Elts.size();
int LastLoadedElt = -1;
for (unsigned i = 0; i < NumElems; ++i) {
  if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
  unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
  if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
"Incomplete element masks");
if (UndefMask.popcount() == NumElems)
"Register/Memory size mismatch");
assert(LDBase && "Did not find base load for merging consecutive loads");
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
if (ByteOffsets[FirstLoadedElt] != 0)
int64_t ByteOffset = ByteOffsets[EltIdx];
if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
  int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
  return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
          Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
int Stride = EltIdx - FirstLoadedElt;
unsigned BaseMemSizeInBits = Base->getMemoryVT().getSizeInBits();
if (((Stride * BaseSizeInBits) % BaseMemSizeInBits) == 0 &&
    (BaseMemSizeInBits % BaseSizeInBits) == 0) {
  unsigned Scale = BaseMemSizeInBits / BaseSizeInBits;
bool IsConsecutiveLoad = true;
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
  if (!CheckConsecutiveLoad(LDBase, i)) {
    IsConsecutiveLoad = false;
    IsConsecutiveLoadWithZeros = false;
  } else if (ZeroMask[i]) {
    IsConsecutiveLoad = false;
"Cannot merge volatile or atomic loads.");
for (auto *LD : Loads)
if (FirstLoadedElt == 0 &&
    (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
    (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
  return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
  return CreateLoad(VT, LDBase);
if (!IsAfterLegalize && VT.isVector()) {
  if ((NumMaskElts % NumElems) == 0) {
    unsigned Scale = NumMaskElts / NumElems;
    for (unsigned i = 0; i < NumElems; ++i) {
      int Offset = ZeroMask[i] ? NumMaskElts : 0;
      for (unsigned j = 0; j != Scale; ++j)
        ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
    SDValue V = CreateLoad(VT, LDBase);
unsigned HalfNumElems = NumElems / 2;
    DAG, Subtarget, IsAfterLegalize, Depth + 1);
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
    ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
     LoadSizeInBits == 64) &&
if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
for (auto *LD : Loads)
for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
  unsigned RepeatSize = SubElems * BaseSizeInBits;
  unsigned ScalarSize = std::min(RepeatSize, 64u);
  if (!Subtarget.hasAVX2() && ScalarSize < 32)
  if (RepeatSize > ScalarSize && SubElems == 1)
  for (unsigned i = 0; i != NumElems && Match; ++i) {
    if (RepeatedLoads[i % SubElems].isUndef())
      RepeatedLoads[i % SubElems] = Elt;
    Match &= (RepeatedLoads[i % SubElems] == Elt);
  Match &= !RepeatedLoads.front().isUndef();
  Match &= !RepeatedLoads.back().isUndef();
  if (RepeatSize > ScalarSize)
      RepeatSize / ScalarSize);
      RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize,
  SDValue Broadcast = RepeatLoad;
  if (RepeatSize > ScalarSize) {
    DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
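    // If the build vector is just one short run of loads repeated across
    // the whole vector, it is cheaper to materialize the repeated chunk
    // once and splat it with X86ISD::VBROADCAST than to emit every
    // element load separately.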
    VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
bool IsAfterLegalize) {
auto getConstantScalar = [&](const APInt &Val) -> Constant * {
  if (ScalarSize == 16)
  if (ScalarSize == 32)
  assert(ScalarSize == 64 && "Unsupported floating point scalar size");
for (unsigned I = 0, E = Bits.size(); I != E; ++I)
    : getConstantScalar(Bits[I]));
auto getConstantScalar = [&](const APInt &Val) -> Constant * {
  if (ScalarSize == 16)
  if (ScalarSize == 32)
  assert(ScalarSize == 64 && "Unsupported floating point scalar size");
if (ScalarSize == SplatBitSize)
  return getConstantScalar(SplatValue);
unsigned NumElm = SplatBitSize / ScalarSize;
for (unsigned I = 0; I != NumElm; ++I) {
  ConstantVec.push_back(getConstantScalar(Val));
for (auto *U : N->users()) {
  unsigned Opc = U->getOpcode();
  if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
  if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
if (N->hasOneUse()) {
  if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
unsigned SizeInBits = V.getValueSizeInBits();
if ((SizeInBits == 512 && Subtarget.hasAVX512()) ||
    (SizeInBits >= 128 && Subtarget.hasVLX())) {
  if (V.hasOneUse() && V->user_begin()->getOpcode() == ISD::VSELECT &&
      V->user_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
"Unsupported vector type for broadcast.");
assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
if (Sequence.size() == 1)
if (!Sequence.empty() && Subtarget.hasCDI()) {
  unsigned SeqLen = Sequence.size();
  bool UpperZeroOrUndef =
  if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
      : Op0.getOperand(0).getOperand(0);
  if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||
      (EltType == MVT::i32 && MaskVT == MVT::v16i1)) {
    SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
unsigned NumUndefElts = UndefElements.count();
if (!Ld || (NumElts - NumUndefElts) <= 1) {
  unsigned SplatBitSize;
  if (SplatBitSize == 32 || SplatBitSize == 64 ||
      (SplatBitSize < 32 && Subtarget.hasAVX2())) {
  if (SplatBitSize > 64) {
      Ops, VVT, MPI, Alignment,
if (!Ld || NumElts - NumUndefElts != 1)
if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
bool ConstSplatVal =
if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
  if (ScalarSize == 32 ||
      (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
      (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
      (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
    C = CI->getConstantIntValue();
    C = CF->getConstantFPValue();
    assert(C && "Invalid constant type");
    (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
  return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
    (Subtarget.hasVLX() && ScalarSize == 64)) {
  SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
      LN->getMemoryVT(), LN->getMemOperand());
    (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
  SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
      LN->getMemoryVT(), LN->getMemOperand());
if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
  return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
ExtractedFromVec = ShuffleVec;
MVT VT = Op.getSimpleValueType();
unsigned NumElems = Op.getNumOperands();
for (unsigned i = 0; i != NumElems; ++i) {
  unsigned Opc = Op.getOperand(i).getOpcode();
if (InsertIndices.size() > 1)
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
  VecIn1 = ExtractedFromVec;
else if (VecIn1 != ExtractedFromVec) {
    VecIn2 = ExtractedFromVec;
  else if (VecIn2 != ExtractedFromVec)
if (ExtractedFromVec == VecIn1)
else if (ExtractedFromVec == VecIn2)
  Mask[i] = Idx + NumElems;
for (unsigned Idx : InsertIndices)
MVT VT = Op.getSimpleValueType();
MVT SVT = Subtarget.hasFP16() ? MVT::f16 : MVT::i16;
for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
MVT VT = Op.getSimpleValueType();
"Unexpected type in LowerBUILD_VECTORvXi1!");
bool IsSplat = true;
bool HasConstElts = false;
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
  Immediate |= (InC->getZExtValue() & 0x1) << idx;
  HasConstElts = true;
  else if (In != Op.getOperand(SplatIdx))
assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
for (unsigned InsertIdx : NonConstIdx) {
    Op.getOperand(InsertIdx),
case X86ISD::PACKSS:
case X86ISD::PACKUS:
unsigned BaseIdx, unsigned LastIdx,
EVT VT = N->getValueType(0);
assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
"Invalid Vector in input!");
bool CanFold = true;
unsigned ExpectedVExtractIdx = BaseIdx;
unsigned NumElts = LastIdx - BaseIdx;
for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
  if (Op->isUndef()) {
    if (i * 2 == NumElts)
      ExpectedVExtractIdx = BaseIdx;
    ExpectedVExtractIdx += 2;
  CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
  if (i * 2 < NumElts) {
  if (i * 2 == NumElts)
    ExpectedVExtractIdx = BaseIdx;
  if (I0 == ExpectedVExtractIdx)
  else if (IsCommutable && I1 == ExpectedVExtractIdx) {
  ExpectedVExtractIdx += 2;
unsigned X86Opcode, bool Mode,
bool isUndefLO, bool isUndefHI) {
"Invalid nodes in input!");
if (!isUndefLO && !V0->isUndef())
  LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
if (!isUndefHI && !V1->isUndef())
  HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
  HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
unsigned &NumExtracts, bool &IsSubAdd,
bool &HasAllowContract) {
HasAllowContract = NumElts != 0;
unsigned Opc[2] = {0, 0};
for (unsigned i = 0, e = NumElts; i != e; ++i) {
  unsigned Opcode = Op.getOpcode();
  if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
  Opc[i % 2] = Opcode;
  HasAllowContract &= Op->getFlags().hasAllowContract();
unsigned ExpectedUses,
bool AllowSubAddOrAddSubContract) {
(AllowSubAddOrAddSubContract && Opnd0->getFlags().hasAllowContract());
unsigned NumExtracts;
bool HasAllowContract;
if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd,
    HasAllowContract)) {
  unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
Mask.push_back(I + E + 1);
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
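// X86ISD::ADDSUB subtracts in the even lanes and adds in the odd lanes,
// which is why the matcher above tracks one opcode per index parity; with
// FMA available and contraction allowed, the same pattern maps to
// FMADDSUB/FMSUBADD instead.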
unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
for (unsigned i = 0; i != Num128BitChunks; ++i) {
  for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
GenericOpcode = Op.getOpcode();
switch (GenericOpcode) {
case ISD::ADD:  HOpcode = X86ISD::HADD;  break;
case ISD::SUB:  HOpcode = X86ISD::HSUB;  break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default: return false;
if (j < NumEltsIn64Bits) {
SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
unsigned ExpectedIndex = i * NumEltsIn128Bits +
                         (j % NumEltsIn64Bits) * 2;
if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
for (unsigned i = 0; i != NumElts; ++i)
unsigned HalfNumElts = NumElts / 2;
return DAG.getNode(HOpcode, DL, VT, V0, V1);
unsigned NumNonUndefs =
if (NumNonUndefs < 2)
if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
    ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
    ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
    ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
unsigned Half = NumElts / 2;
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;
for (unsigned i = 0, e = Half; i != e; ++i)
for (unsigned i = Half, e = NumElts; i != e; ++i)
if (VT == MVT::v8i32 || VT == MVT::v16i16) {
  bool CanFold = true;
  X86Opcode = X86ISD::HADD;
  X86Opcode = X86ISD::HSUB;
  if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
  bool isUndefLO = NumUndefsLO == Half;
  bool isUndefHI = NumUndefsHI == Half;
if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
    VT == MVT::v16i16) {
  X86Opcode = X86ISD::HADD;
  X86Opcode = X86ISD::HSUB;
  X86Opcode = X86ISD::FHADD;
  X86Opcode = X86ISD::FHSUB;
  if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
  bool isUndefLO = NumUndefsLO == Half;
  bool isUndefHI = NumUndefsHI == Half;
      isUndefLO, isUndefHI);
MVT VT = Op->getSimpleValueType(0);
unsigned Opcode = Op->getOperand(0).getOpcode();
for (unsigned i = 1; i < NumElems; ++i)
  if (Opcode != Op->getOperand(i).getOpcode())
bool IsShift = false;
if (Op->getSplatValue())
bool RHSAllConst = true;
if (Op1.getValueSizeInBits() != ElemSize)
if (any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
  return V.getValueSizeInBits() == ElemSize;
"Element size mismatch");
if (!LHS && !RHS && !RHSAllConst)
if (VT != MVT::v4f64)
UniqueOps.insert(Op);
if (UniqueOps.size() != 2u)
UniqueOps.erase(Op0);
SDValue Op1 = *UniqueOps.begin();
for (auto I = 0u; I < NumElems; ++I) {
  Mask[I] = Op == Op0 ? I : I + NumElems;
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
unsigned WideBits = 2 * EltBits;
for (unsigned I = 0; I != NumElts; I += 2) {
  X.getValueType().bitsGE(WideSVT)) {
  if (X.getValueType().bitsGT(WideSVT))
assert(WideOps.size() == (NumElts / 2) && "Failed to widen build vector");
MVT VT = Op.getSimpleValueType();
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
"Illegal variable permute mask size");
SDLoc(IndicesVec), SizeInBits);
IndicesVT, IndicesVec);
Subtarget, DAG, SDLoc(IndicesVec));
for (uint64_t i = 0; i != Scale; ++i) {
  IndexScale |= Scale << (i * NumDstBits);
  IndexOffset |= i << (i * NumDstBits);
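// Widening the index vector: multiplying each index by Scale and adding
// the within-group offset turns one index per wide element into Scale
// indices per narrow element, e.g. with Scale == 2 index k expands to
// the pair {2k, 2k+1}.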
unsigned Opcode = 0;
  Opcode = X86ISD::PSHUFB;
if (Subtarget.hasVLX() && Subtarget.hasBWI())
  Opcode = X86ISD::VPERMV;
  Opcode = X86ISD::PSHUFB;
  ShuffleVT = MVT::v16i8;
if (Subtarget.hasAVX()) {
  Opcode = X86ISD::VPERMILPV;
  ShuffleVT = MVT::v4f32;
  Opcode = X86ISD::PSHUFB;
  ShuffleVT = MVT::v16i8;
if (Subtarget.hasAVX()) {
  Opcode = X86ISD::VPERMILPV;
  ShuffleVT = MVT::v2f64;
if (Subtarget.hasVLX() && Subtarget.hasVBMI())
  Opcode = X86ISD::VPERMV;
else if (Subtarget.hasXOP()) {
  DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
  DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
} else if (Subtarget.hasAVX()) {
if (Subtarget.hasVLX() && Subtarget.hasBWI())
  Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
  IndicesVec = ScaleIndices(IndicesVec, 2);
  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
  DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
  Opcode = X86ISD::VPERMV;
else if (Subtarget.hasAVX()) {
  {0, 1, 2, 3, 0, 1, 2, 3});
  {4, 5, 6, 7, 4, 5, 6, 7});
  if (Subtarget.hasXOP())
    VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
  DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
  DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
if (!Subtarget.hasVLX()) {
  SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
  IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
                              DAG, SDLoc(IndicesVec));
  Opcode = X86ISD::VPERMV;
} else if (Subtarget.hasAVX()) {
  if (Subtarget.hasXOP())
    VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
  DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
  DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
if (Subtarget.hasVBMI())
  Opcode = X86ISD::VPERMV;
if (Subtarget.hasBWI())
  Opcode = X86ISD::VPERMV;
  Opcode = X86ISD::VPERMV;
"Illegal variable permute shuffle type");
IndicesVec = ScaleIndices(IndicesVec, Scale);
IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
SDValue Res = Opcode == X86ISD::VPERMV
                  ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
                  : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
auto PeekThroughFreeze = [](SDValue N) {
  return N->getOperand(0);
for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
  SDValue Op = PeekThroughFreeze(V.getOperand(Idx));
    SrcVec = Op.getOperand(0);
  else if (SrcVec != Op.getOperand(0))
  SDValue ExtractedIndex = Op->getOperand(1);
    ExtractedIndex = ExtractedIndex.getOperand(0);
  else if (IndicesVec != ExtractedIndex.getOperand(0))
  if (!PermIdx || PermIdx->getAPIntValue() != Idx)
MVT VT = V.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
    (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
bool IsAllConstants = true;
bool OneUseFrozenUndefs = true;
SmallSet<SDValue, 8> Values;
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
  OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
  FrozenUndefMask.setBit(i);
  IsAllConstants = false;
if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
    NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
  SmallVector<int, 16> BlendMask(NumElems, -1);
  for (unsigned i = 0; i < NumElems; ++i) {
    if (!FrozenUndefMask[i])
      Elts[i] = Op.getOperand(i);
      BlendMask[i] += NumElems;
unsigned UpperElems = NumElems / 2;
APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
if (NumUpperUndefsOrZeros >= UpperElems) {
    NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
    UpperElems = NumElems - (NumElems / 4);
  bool UndefUpper = UndefMask.countl_one() >= UpperElems;
  return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
return HorizontalOp;
unsigned NumZero = ZeroMask.popcount();
unsigned NumNonZero = NonZeroMask.popcount();
if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
    FrozenUndefMask.isZero() &&
  Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
  for (unsigned i = 0; i != NumElems; ++i) {
    ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
    ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
  "Expected one variable element in this vector");
  SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
  if (InsertC < NumEltsInLow128Bits)
  assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
  SmallVector<int, 8> ShuffleMask;
  for (unsigned i = 0; i != NumElts; ++i)
    ShuffleMask.push_back(i == InsertC ? NumElts : i);
if (NumNonZero == 1) {
  if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
      EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
      (EltVT == MVT::i16 && Subtarget.hasFP16())) {
    "Expected an SSE value type!");
  if (EltVT == MVT::i16 || EltVT == MVT::i8) {
  if (NumElems == 2 && Idx == 1 &&
      VT, Op.getOperand(1)),
      NumBits/2, DAG, *this, dl);
if (EVTBits == 32) {
if (Values.size() == 1) {
  if (EVTBits == 32) {
    if (Op.getNode()->isOnlyUserOf(Item.getNode()))
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
  for (unsigned i = 2; i != NumElems; ++i)
    if (Ops[i % 2] != Op.getOperand(i))
  if (CanSplat(Op, NumElems, Ops)) {
    HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
if (EVTBits == 64) {
  if (NumNonZero == 1) {
      Op.getOperand(Idx));
if (EVTBits == 8 && NumElems == 16)
    NumZero, DAG, Subtarget))
if (EltVT == MVT::i16 && NumElems == 8)
    NumZero, DAG, Subtarget))
if (EVTBits == 32 && NumElems == 4)
if (NumElems == 4 && NumZero > 0) {
  for (unsigned i = 0; i < 4; ++i) {
    bool isZero = !NonZeroMask[i];
  for (unsigned i = 0; i < 2; ++i) {
    Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
    Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
  static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
  static_cast<int>(Reverse2 ? NumElems : NumElems+1)
assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
  if (!Op.getOperand(0).isUndef())
  for (unsigned i = 1; i < NumElems; ++i) {
    if (Op.getOperand(i).isUndef()) continue;
for (unsigned i = 0; i < NumElems; ++i) {
  if (!Op.getOperand(i).isUndef())
for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
  SmallVector<int, 16> Mask;
  for (unsigned i = 0; i != Scale; ++i)
  for (unsigned i = 0; i != Scale; ++i)
    Mask.push_back(NumElems + i);
  for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
MVT ResVT = Op.getSimpleValueType();
"Value type must be 256-/512-bit wide");
unsigned NumOperands = Op.getNumOperands();
unsigned NumFreezeUndef = 0;
unsigned NumZero = 0;
unsigned NumNonZero = 0;
unsigned NonZeros = 0;
for (unsigned i = 0; i != NumOperands; ++i) {
  assert(i < sizeof(NonZeros) * CHAR_BIT);
  NonZeros |= 1 << i;
if (NumNonZero > 2) {
  Ops.slice(0, NumOperands/2));
  Ops.slice(NumOperands/2));
  U, getZeroVector(U.getSimpleValueType(), Subtarget, DAG, dl));
MVT SubVT = Op.getOperand(0).getSimpleValueType();
for (unsigned i = 0; i != NumOperands; ++i) {
  if ((NonZeros & (1 << i)) == 0)
MVT ResVT = Op.getSimpleValueType();
unsigned NumOperands = Op.getNumOperands();
"Unexpected number of operands in CONCAT_VECTORS");
for (unsigned i = 0; i != NumOperands; ++i) {
  assert(i < sizeof(NonZeros) * CHAR_BIT);
if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
    Log2_64(NonZeros) != NumOperands - 1) {
  unsigned Idx = Log2_64(NonZeros);
  Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
unsigned Idx = Log2_64(NonZeros);
if (NumOperands > 2) {
  Ops.slice(0, NumOperands / 2));
  Ops.slice(NumOperands / 2));
MVT VT = Op.getSimpleValueType();
    (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
int Idx, int ExpectedIdx) {
assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
       ExpectedIdx < MaskSize && "Out of range element index");
if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
EVT VT = Op.getValueType();
if (Idx == ExpectedIdx && Op == ExpectedOp)
switch (Op.getOpcode()) {
  return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
EVT SrcVT = Src.getValueType();
return (Idx % Scale) == (ExpectedIdx % Scale) &&
       Idx / Scale, ExpectedIdx / Scale);
for (unsigned I = 0; I != Scale; ++I)
    (ExpectedIdx * Scale) + I))
return Op == ExpectedOp &&
       SVN->getMaskElt(Idx) == SVN->getMaskElt(ExpectedIdx);
case X86ISD::VBROADCAST:
case X86ISD::VBROADCAST_LOAD:
  return Op == ExpectedOp;
case X86ISD::SUBV_BROADCAST_LOAD:
  if (Op == ExpectedOp) {
    unsigned NumMemElts = MemOp->getMemoryVT().getVectorNumElements();
    return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts);
case X86ISD::VPERMI: {
  if (Op == ExpectedOp) {
    Mask[ExpectedIdx]);
case X86ISD::FHADD:
case X86ISD::FHSUB:
case X86ISD::PACKSS:
case X86ISD::PACKUS:
  if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
    int NumEltsPerLane = NumElts / NumLanes;
    int NumHalfEltsPerLane = NumEltsPerLane / 2;
    bool SameLane = (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
        (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
    return SameLane && SameElt;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
  assert(Mask[i] >= -1 && "Out of bound mask element!");
  if (Mask[i] >= 0 && Mask[i] != i)
unsigned ScalarSizeInBits,
assert(LaneSizeInBits && ScalarSizeInBits &&
       (LaneSizeInBits % ScalarSizeInBits) == 0 &&
       "Illegal shuffle lane size");
int LaneSize = LaneSizeInBits / ScalarSizeInBits;
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
  if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
unsigned ScalarSizeInBits,
assert(LaneSizeInBits && ScalarSizeInBits &&
       (LaneSizeInBits % ScalarSizeInBits) == 0 &&
       "Illegal shuffle lane size");
int NumElts = Mask.size();
int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
int NumLanes = NumElts / NumEltsPerLane;
if (NumLanes > 1) {
  for (int i = 0; i != NumLanes; ++i) {
    for (int j = 0; j != NumEltsPerLane; ++j) {
      int M = Mask[(i * NumEltsPerLane) + j];
      int Lane = (M % NumElts) / NumEltsPerLane;
      if (SrcLane >= 0 && SrcLane != Lane)
RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
  if ((Mask[i] % Size) / LaneSize != i / LaneSize)
  int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                              : Mask[i] % LaneSize + LaneSize;
  if (RepeatedMask[i % LaneSize] < 0)
    RepeatedMask[i % LaneSize] = LocalM;
  else if (RepeatedMask[i % LaneSize] != LocalM)
unsigned EltSizeInBits,
int LaneSize = LaneSizeInBits / EltSizeInBits;
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
  if ((Mask[i] % Size) / LaneSize != i / LaneSize)
  int LaneM = Mask[i] / Size;
  int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
  RepeatedMask[i % LaneSize] = LocalM;
  else if (RepeatedMask[i % LaneSize] != LocalM)
Mask, RepeatedMask);
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
for (int i = 0; i < Size; ++i) {
  assert(Mask[i] >= -1 && "Out of bound mask element!");
  int MaskIdx = Mask[i];
  int ExpectedIdx = ExpectedMask[i];
  if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
    SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
    MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
    ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
int Size = Mask.size();
if (Size != (int)ExpectedMask.size())
"Illegal target shuffle mask");
if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
           !V1.getValueType().isVector()))
if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
           !V2.getValueType().isVector()))
for (int i = 0; i < Size; ++i) {
  int MaskIdx = Mask[i];
  int ExpectedIdx = ExpectedMask[i];
  if (ExpectedIdx < 0)
  SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
  int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
  APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
  ZeroMask.setBit(BitIdx);
  if (MaskIdx >= 0) {
    SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
    MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
    ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
if (VT != MVT::v8i32 && VT != MVT::v8f32)
return IsUnpackwdMask;
for (unsigned i = 0; i != 4; ++i) {
assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
unsigned HalfSize = Mask.size() / 2;
for (unsigned i = 0; i != HalfSize; ++i) {
  if (Mask[i] != Mask[i + HalfSize])
assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
int FirstElt = Mask[FirstIndex];
if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
  return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
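// Each mask element occupies two bits of the PSHUFD/SHUFPS-style immediate
// (element i at bits [2i+1:2i]); undef slots default to the identity index.
// E.g. mask {2,3,0,1} encodes as 2 | (3 << 2) | (0 << 4) | (1 << 6) == 0x4E.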
assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
       "Unexpected SHUFPD mask size");
assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
       "Unexpected SHUFPD mask elements");
int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
       "All undef shuffle mask");
int FirstElt = Mask[FirstIndex];
if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
    count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
  for (unsigned I = 0, E = Mask.size(); I != E; ++I)
    Imm |= FirstElt << I;
for (unsigned I = 0, E = Mask.size(); I != E; ++I)
  Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
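// SHUFPD uses one immediate bit per f64 element, choosing the low (0) or
// high (1) element within the corresponding 128-bit lane; undef slots fall
// back to the identity bit (I & 1).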
bool &IsZeroSideLeft) {
int NextElement = -1;
for (int i = 0, e = Mask.size(); i < e; i++) {
  assert(Mask[i] >= -1 && "Out of bound mask element!");
  if (NextElement < 0) {
    NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
    IsZeroSideLeft = NextElement != 0;
  if (NextElement != Mask[i])
unsigned Depth = 0);
int Size = Mask.size();
for (int i = 0; i < NumBytes; ++i) {
  int M = Mask[i / NumEltBytes];
  PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
  if (Zeroable[i / NumEltBytes]) {
    PSHUFBMask[i] = ZeroMask;
  if (V && V != SrcV)
  if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
  M = M * NumEltBytes + (i % NumEltBytes);
assert(V && "Failed to find a source input");
const APInt &Zeroable,
bool IsLeftZeroSide = true;
unsigned VEXPANDMask = (~Zeroable).getZExtValue();
assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
       "Unexpected number of vector elements");
    Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
unsigned &UnpackOpcode, bool IsUnary,
bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
for (int i = 0; i != NumElts; i += 2) {
  int M1 = TargetMask[i + 0];
  int M2 = TargetMask[i + 1];
assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
       "Zeroable shuffle detected");
    (IsUnary ? V1 : V2))) {
  UnpackOpcode = X86ISD::UNPCKL;
  V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
  V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    (IsUnary ? V1 : V2))) {
  UnpackOpcode = X86ISD::UNPCKH;
  V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
  V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
if (IsUnary && (Zero1 || Zero2)) {
  if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
  bool MatchLo = true, MatchHi = true;
  for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
    int M = TargetMask[i];
    if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
    MatchLo &= (M == Unpckl[i]);
    MatchHi &= (M == Unpckh[i]);
  if (MatchLo || MatchHi) {
    UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
UnpackOpcode = X86ISD::UNPCKL;
UnpackOpcode = X86ISD::UNPCKH;
  return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
  return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
  return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
  return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
unsigned UnpackOpcode;
  UnpackOpcode = X86ISD::UNPCKL;
  UnpackOpcode = X86ISD::UNPCKH;
DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
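// UNPCKL/UNPCKH interleave the low/high halves of each 128-bit lane of the
// two inputs, e.g. v4i32 UNPCKL(A, B) = {A0, B0, A1, B1}.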
unsigned NumElts = Mask.size();
unsigned MaxScale = 64 / EltSizeInBits;
for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
  unsigned SrcEltBits = EltSizeInBits * Scale;
  if (SrcEltBits < 32 && !Subtarget.hasBWI())
  unsigned NumSrcElts = NumElts / Scale;
  unsigned UpperElts = NumElts - NumSrcElts;
  if ((NumSrcElts * EltSizeInBits) >= 128) {
MVT SrcVT = Src.getSimpleValueType();
if (NumSrcElts == NumDstElts)
if (NumSrcElts > NumDstElts) {
if ((NumSrcElts * DstEltSizeInBits) >= 128) {
if (DstVT != TruncVT)
const APInt &Zeroable,
assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
unsigned MaxScale = 64 / EltSizeInBits;
for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
  unsigned SrcEltBits = EltSizeInBits * Scale;
  unsigned NumSrcElts = NumElts / Scale;
  unsigned UpperElts = NumElts - NumSrcElts;
  Src.getScalarValueSizeInBits() == SrcEltBits) {
  Src = Src.getOperand(0);
} else if (Subtarget.hasVLX()) {
if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
const APInt &Zeroable,
"Unexpected VTRUNC type");
unsigned MaxScale = 64 / EltSizeInBits;
for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
  unsigned SrcEltBits = EltSizeInBits * Scale;
  if (SrcEltBits < 32 && !Subtarget.hasBWI())
  unsigned NumHalfSrcElts = NumElts / Scale;
  unsigned NumSrcElts = 2 * NumHalfSrcElts;
  unsigned UpperElts = NumElts - NumSrcElts;
  if (UpperElts > 0 &&
    X86ISD::VSRLI, DL, SrcVT, Src,
bool IsSingleInput) {
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
"We should only be called with masks with a power-of-2 size!");
int Offset = MatchEven ? 0 : 1;
bool ViableForN[3] = {true, true, true};
for (int i = 0, e = Mask.size(); i < e; ++i) {
  bool IsAnyViable = false;
  for (unsigned j = 0; j != std::size(ViableForN); ++j)
    if (ViableForN[j]) {
      IsAnyViable = true;
      ViableForN[j] = false;
for (unsigned j = 0; j != std::size(ViableForN); ++j)
unsigned MaxStages = 1) {
assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
       "Illegal maximum compaction");
unsigned NumSrcBits = PackVT.getScalarSizeInBits();
unsigned NumPackedBits = NumSrcBits - BitSize;
unsigned NumBits2 = N2.getScalarValueSizeInBits();
if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
    (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
if (Subtarget.hasSSE41() || BitSize == 8) {
  PackOpcode = X86ISD::PACKUS;
if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
    (N2.isUndef() || IsZero2 || IsAllOnes2 ||
  PackOpcode = X86ISD::PACKSS;
for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
  if (MatchPACK(V1, V2, PackVT))
  if (MatchPACK(V1, V1, PackVT))
unsigned PackOpcode;
unsigned MaxStages = Log2_32(64 / EltBits);
    Subtarget, MaxStages))
unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
unsigned MaxPackBits = 16;
if (CurrentEltBits > 16 &&
    (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
for (unsigned i = 0; i != NumStages; ++i) {
  unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
  unsigned NumSrcElts = SizeBits / SrcEltBits;
  CurrentEltBits /= 2;
"Failed to lower compaction shuffle");
const APInt &Zeroable,
if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
  if (Mask[i] % Size != i)
    V = Mask[i] < Size ? V1 : V2;
  else if (V != (Mask[i] < Size ? V1 : V2))
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
  if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
  MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
const APInt &Zeroable, bool &ForceV1Zero,
bool &ForceV2Zero, uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
bool V2IsZeroOrUndef =
ForceV1Zero = false, ForceV2Zero = false;
assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
int NumElts = Mask.size();
int NumEltsPerLane = NumElts / NumLanes;
assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
bool ForceWholeLaneMasks =
for (int Lane = 0; Lane != NumLanes; ++Lane) {
  bool LaneV1InUse = false;
  bool LaneV2InUse = false;
  for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
    int Elt = (Lane * NumEltsPerLane) + LaneElt;
    if (M == Elt || (0 <= M && M < NumElts &&
      LaneV1InUse = true;
    if (M == (Elt + NumElts) ||
      LaneBlendMask |= 1ull << LaneElt;
      Mask[Elt] = Elt + NumElts;
      LaneV2InUse = true;
    if (Zeroable[Elt]) {
      if (V1IsZeroOrUndef) {
        ForceV1Zero = true;
        LaneV1InUse = true;
      if (V2IsZeroOrUndef) {
        ForceV2Zero = true;
        LaneBlendMask |= 1ull << LaneElt;
        Mask[Elt] = Elt + NumElts;
        LaneV2InUse = true;
  if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
    LaneBlendMask = (1ull << NumEltsPerLane) - 1;
  BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
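// The resulting blend immediate holds one bit per element: set selects V2,
// clear selects V1. Zeroable elements are folded in by forcing a known
// zero/undef input to all-zeros and steering the blend at it.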
const APInt &Zeroable,
bool ForceV1Zero = false, ForceV2Zero = false;
assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
case MVT::v16i16: {
  assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
  assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
  for (int i = 0; i < 8; ++i)
    if (RepeatedMask[i] >= 8)
      BlendMask |= 1ull << i;
  return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
  uint64_t LoMask = BlendMask & 0xFF;
  uint64_t HiMask = (BlendMask >> 8) & 0xFF;
  if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
      MVT::v16i16, DL, Lo, Hi,
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
if (Subtarget.hasVLX())
for (int i = 0, Size = Mask.size(); i < Size; ++i)
  for (int j = 0; j < Scale; ++j)
bool ImmBlends = false) {
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
  assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
  if (BlendMask[Mask[i] % Size] < 0)
    BlendMask[Mask[i] % Size] = Mask[i];
  else if (BlendMask[Mask[i] % Size] != Mask[i])
  PermuteMask[i] = Mask[i] % Size;
int NumElts = Mask.size();
int NumLaneElts = NumElts / NumLanes;
int NumHalfLaneElts = NumLaneElts / 2;
bool MatchLo = true, MatchHi = true;
for (int Elt = 0; Elt != NumElts; ++Elt) {
  if (M < NumElts && (Op.isUndef() || Op == V1))
  else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
  bool MatchLoAnyLane = false, MatchHiAnyLane = false;
  for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
    int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
    if (MatchLoAnyLane || MatchHiAnyLane) {
      assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
             "Failed to match UNPCKLO/UNPCKHI");
  MatchLo &= MatchLoAnyLane;
  MatchHi &= MatchHiAnyLane;
  if (!MatchLo && !MatchHi)
assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
for (int Elt = 0; Elt != NumElts; ++Elt) {
  bool IsFirstOp = M < NumElts;
      NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
  if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
    PermuteMask[Elt] = BaseMaskElt;
  else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
    PermuteMask[Elt] = BaseMaskElt + 1;
  assert(PermuteMask[Elt] != -1 &&
         "Input mask element is defined but failed to assign permute mask");
unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
int Size = Mask.size();
assert(Mask.size() >= 2 && "Single element masks are invalid.");
bool UnpackLo = NumLoInputs >= NumHiInputs;
auto TryUnpack = [&](int ScalarSize, int Scale) {
  for (int i = 0; i < Size; ++i) {
    int UnpackIdx = i / Scale;
    if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
    VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
  VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                  UnpackVT, V1, V2));
for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
  if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
if (NumLoInputs == 0 || NumHiInputs == 0) {
  assert((NumLoInputs > 0 || NumHiInputs > 0) &&
         "We have to have *some* inputs!");
  int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
    2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
  DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
int NumEltsPerLane = NumElts / NumLanes;
bool Blend1 = true;
bool Blend2 = true;
std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
  for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
    int M = Mask[Lane + Elt];
    Blend1 &= (M == (Lane + Elt));
    assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
    M = M % NumEltsPerLane;
    Range1.first = std::min(Range1.first, M);
    Range1.second = std::max(Range1.second, M);
    Blend2 &= (M == (Lane + Elt));
    assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
    M = M % NumEltsPerLane;
    Range2.first = std::min(Range2.first, M);
    Range2.second = std::max(Range2.second, M);
if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
    !(0 <= Range2.first && Range2.second < NumEltsPerLane))
for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
  for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
    int M = Mask[Lane + Elt];
    PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
    PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
if (Range2.second < Range1.first)
  return RotateAndPermute(V1, V2, Range1.first, 0);
if (Range1.second < Range2.first)
  return RotateAndPermute(V2, V1, Range2.first, NumElts);
size_t NumUndefs = 0;
std::optional<int> UniqueElt;
for (int Elt : Mask) {
  if (UniqueElt.has_value() && UniqueElt.value() != Elt)
return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
int NumElts = Mask.size();
int NumEltsPerLane = NumElts / NumLanes;
bool IsAlternating = true;
bool V1Zero = true, V2Zero = true;
for (int i = 0; i < NumElts; ++i) {
  if (M >= 0 && M < NumElts) {
    V1Zero &= Zeroable[i];
    IsAlternating &= (i & 1) == 0;
  } else if (M >= NumElts) {
    V2Mask[i] = M - NumElts;
    FinalMask[i] = i + NumElts;
    V2Zero &= Zeroable[i];
    IsAlternating &= (i & 1) == 1;
auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
  unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
  if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
  "Expected to demand only the 0'th element.");
  int &InputMaskElt = I.value();
  if (InputMaskElt >= 0)
    InputMaskElt = I.index();
canonicalizeBroadcastableInput(V1, V1Mask);
canonicalizeBroadcastableInput(V2, V2Mask);
    DL, VT, V1, V2, Mask, Subtarget, DAG))
    DL, VT, V1, V2, Mask, Subtarget, DAG))
    DL, VT, V1, V2, Mask, Subtarget, DAG))
V1Mask.assign(NumElts, -1);
V2Mask.assign(NumElts, -1);
FinalMask.assign(NumElts, -1);
for (int i = 0; i != NumElts; i += NumEltsPerLane)
  for (int j = 0; j != NumEltsPerLane; ++j) {
    int M = Mask[i + j];
    if (M >= 0 && M < NumElts) {
      V1Mask[i + (j / 2)] = M;
      FinalMask[i + j] = i + (j / 2);
    } else if (M >= NumElts) {
      V2Mask[i + (j / 2)] = M - NumElts;
      FinalMask[i + j] = i + (j / 2) + NumElts;
assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
int MaxSubElts = 64 / EltSizeInBits;
unsigned RotateAmt, NumSubElts;
    MaxSubElts, NumSubElts, RotateAmt))
unsigned NumElts = Mask.size();
if (!IsLegal && Subtarget.hasSSE3())
if ((RotateAmt % 16) == 0)
unsigned ShlAmt = RotateAmt;
int NumElts = Mask.size();
for (int i = 0; i < NumElts; ++i) {
  "Unexpected mask index.");
  int StartIdx = i - (M % NumElts);
  int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    Rotation = CandidateRotation;
  else if (Rotation != CandidateRotation)
  SDValue MaskV = M < NumElts ? V1 : V2;
  else if (TargetV != MaskV)
assert(Rotation != 0 && "Failed to locate a viable rotation!");
assert((Lo || Hi) && "Failed to find a rotated input vector!");
int NumElts = RepeatedMask.size();
int Scale = 16 / NumElts;
return Rotation * Scale;
if (ByteRotation <= 0)
"512-bit PALIGNR requires BWI instructions");
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
       "Can shuffle at most 16 bytes in a 128-bit vector!");
assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!");
int LoByteShift = 16 - ByteRotation;
int HiByteShift = ByteRotation;
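// Without SSSE3's PALIGNR the byte rotate is emulated on SSE2 as
// (Lo << LoByteShift) | (Hi >> HiByteShift), stitching the two halves of
// the rotated pair back together with whole-register byte shifts.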
                                     const APInt &Zeroable,
         "Only 32-bit and 64-bit elements are supported!");
         "VLX required for 128/256-bit vectors");

  unsigned NumElts = Mask.size();
  assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
  if (!ZeroLo && !ZeroHi)

    SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
    int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
    return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,

    SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
    int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
    return DAG.getNode(X86ISD::VALIGN, DL, VT,
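// When the mask has a run of zeroable elements at either end, the shuffle can
// be done as full-vector byte shifts: PSLLDQ/PSRLDQ shift the whole XMM
// register by a byte immediate and fill with zeros, so shifting left then
// right (or vice versa) keeps the Len surviving elements while zeroing the
// ZeroLo positions below them and the ZeroHi positions above them.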
                                     const APInt &Zeroable,

  if (!ZeroLo && !ZeroHi)

  unsigned NumElts = Mask.size();
  unsigned Len = NumElts - (ZeroLo + ZeroHi);

  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;

    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
  } else if (ZeroHi == 0) {
    unsigned Shift = Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
  } else if (!Subtarget.hasSSSE3()) {
    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
    Shift += Mask[ZeroLo] % NumElts;
    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
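// Match the shuffle against element-wise shifts: CheckZeros verifies that the
// positions uncovered by shifting each Scale-wide group are zeroable, and
// MatchShift checks that the surviving elements form a contiguous run. A
// group wider than 64 bits cannot use VPSLLI/VPSRLI (which top out at qword
// elements), so the match falls back to the whole-vector byte shifts
// VSHLDQ/VSRLDQ with the amount rescaled from element bits to bytes.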
                                int MaskOffset, const APInt &Zeroable,
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
    Scale = ByteShift ? Scale / 2 : Scale;

  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
                                     const APInt &Zeroable,
  int Size = Mask.size();

                         Mask, 0, Zeroable, Subtarget);
  if (ShiftAmt < 0) {
                           Mask, Size, Zeroable, Subtarget);

  if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
         "Illegal integer vector type");
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
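// SSE4A EXTRQ/INSERTQ operate on bit fields within the low 64 bits of a
// vector: EXTRQI pulls Len bits starting at Idx into the low bits of the
// destination quadword and zero-fills the rest of that quadword. The matcher
// below therefore only accepts masks where every kept element comes from one
// source's low half and everything outside the field is zeroable.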
  int Size = Mask.size();
  int HalfSize = Size / 2;

  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
  assert(Len > 0 && "Zeroable shuffle mask");

  for (int i = 0; i != Len; ++i) {
    if (i > M || M >= HalfSize)
    if (Idx < 0 || (Src == V && Idx == (M - i))) {

  if (!Src || Idx < 0)
  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");

  int Size = Mask.size();
  int HalfSize = Size / 2;

  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      int Len = Hi - Idx;
      } else if ((!Base || (Base == V1)) &&
      } else if ((!Base || (Base == V2)) &&

  return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
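// Widen the shuffle into a zero-extension: when every Scale'th mask element
// reads consecutive source elements and the slots in between are zeroable,
// the whole shuffle is a PMOVZX-style extend. Pre-SSE4.1 the same effect is
// built as a ladder of UNPCKL/UNPCKH against a zero (or undef, for any-ext)
// vector, doubling the element width each step - hence the trailing
// "} while (Scale > 1);" loop in the code below.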
                                 unsigned ExtOpc, SDValue InputV,
  assert(Scale > 1 && "Need a scale to extend.");
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
         "Extension offset must be in the first lane or start an upper lane.");

  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);

  auto ShuffleOffset = [&](SDValue V) {
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;

                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);

  if (AnyExt && EltBits == 32) {
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,

  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,

  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    int LoIdx = Offset * EltBits;
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
    int HiIdx = (Offset + 1) * EltBits;
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,

  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      if ((i % Scale == 0 && SafeOffset(Idx))) {
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,

  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    Offset -= AlignToUnpack;

    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
  } while (Scale > 1);
  int NumLanes = Bits / 128;
  int NumEltsPerLane = NumElements / NumLanes;
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  bool AnyExt = true;
  for (int i = 0; i < NumElements; ++i) {
    if (i % Scale != 0) {
    SDValue V = M < NumElements ? V1 : V2;
    M = M % NumElements;
      Offset = M - (i / Scale);
    } else if (InputV != V)
          (Offset % NumEltsPerLane) == 0))
    if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
    if ((M % NumElements) != (Offset + (i / Scale)))

  if (Offset != 0 && Matches < 2)
                                 InputV, Mask, Subtarget, DAG);

  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");

  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
  MVT VT = V.getSimpleValueType();

  MVT NewVT = V.getSimpleValueType();

  return V.hasOneUse() &&

template <typename T>
  T EltVT = VT.getScalarType();
  return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
         (EltVT == MVT::f16 && !Subtarget.hasFP16());
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();

  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;

  if (!IsV1Zeroable) {
    V1Mask[V2Index] = -1;

  if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
    if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
    if (!IsV1Zeroable) {
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
    if (!IsV1Zeroable) {
      assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");

    unsigned MovOpc = 0;
    if (EltVT == MVT::f16)
      MovOpc = X86ISD::MOVSH;
    else if (EltVT == MVT::f32)
      MovOpc = X86ISD::MOVSS;
    else if (EltVT == MVT::f64)
      MovOpc = X86ISD::MOVSD;
    return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);

  if (V2Index != 0) {
    V2Shuffle[V2Index] = 0;
        X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
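// If the scalar being splatted lives inside a wider element (e.g. we want to
// broadcast byte 5 of an i32 vector), first truncate: recompute the broadcast
// index in units of the wider element, extract that element, and shift right
// when the desired bits are not at offset 0. A plain VBROADCAST of the
// narrowed scalar then finishes the splat.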
13407 "We can only lower integer broadcasts with AVX2!");
13413 assert(V0VT.
isVector() &&
"Unexpected non-vector vector-sized value!");
13423 if (V0EltSize <= EltSize)
13426 assert(((V0EltSize % EltSize) == 0) &&
13427 "Scalar type sizes must all be powers of 2 on x86!");
13430 const unsigned Scale = V0EltSize / EltSize;
13431 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
13443 if (
const int OffsetIdx = BroadcastIdx % Scale)
13447 return DAG.
getNode(X86ISD::VBROADCAST,
DL, VT,
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)

                           int BroadcastableElement = 0) {
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
        Mask[i] % Size != BroadcastableElement)
         "VPERM* family of shuffles requires 32-bit or 64-bit elements");

  if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
  else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)

  if (NumElts == 4 &&
  NewMask.append(NumElts, -1);

  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||

  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
                        ? X86ISD::MOVDDUP
                        : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
  if (BroadcastIdx < 0) {
  assert(BroadcastIdx < (int)Mask.size() &&
         "We only expect to be called with a sorted mask where the broadcast "
  int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });

  int BitOffset = BroadcastIdx * NumEltBits;
  switch (V.getOpcode()) {
      V = V.getOperand(0);

      int OpBitWidth = V.getOperand(0).getValueSizeInBits();
      int OpIdx = BitOffset / OpBitWidth;
      V = V.getOperand(OpIdx);
      BitOffset %= OpBitWidth;

      unsigned EltBitWidth = V.getScalarValueSizeInBits();
      unsigned Idx = V.getConstantOperandVal(1);
      unsigned BeginOffset = Idx * EltBitWidth;
      BitOffset += BeginOffset;
      V = V.getOperand(0);

      int Idx = (int)V.getConstantOperandVal(2);
      int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
      int BeginOffset = Idx * EltBitWidth;
      int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
      if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
        BitOffset -= BeginOffset;

  assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
  BroadcastIdx = BitOffset / NumEltBits;

  bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
            DL, VT, V, BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

    V = V.getOperand(BroadcastIdx);

    SDValue BaseAddr = Ld->getBasePtr();
    assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");

    if (Opcode == X86ISD::VBROADCAST) {
          X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
    assert(SVT == MVT::f64 && "Unexpected VT!");
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
  } else if (!BroadcastFromReg) {
  } else if (BitOffset != 0) {
    if (VT == MVT::v4f64 || VT == MVT::v4i64)

    if (BitOffset < 128 && NumActiveElts > 1 &&
        V.getScalarValueSizeInBits() == NumEltBits) {
      assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
             "Unexpected bit-offset");
      ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
      if ((BitOffset % 128) != 0)
      assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
             "Unexpected bit-offset");
      assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
             "Unexpected vector size");
      unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();

  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
    if (Subtarget.hasAVX()) {
      V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);

  if (!V.getValueType().isVector()) {
    assert(V.getScalarValueSizeInBits() == NumEltBits &&
           "Unexpected scalar size");

  if (V.getValueSizeInBits() > 128)

  unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
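// INSERTPS (SSE4.1) copies one dword from the second source into any dword
// slot of the first and can additionally zero arbitrary slots, all controlled
// by its immediate: bits [7:6] select the source element, bits [5:4] the
// destination slot, and bits [3:0] form the zero mask. matchShuffleAsInsertPS
// below searches for a mask that fits this one-element-plus-zeroing shape,
// trying both operand orders.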
                                   unsigned &InsertPSMask,
                                   const APInt &Zeroable,
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  unsigned ZMask = 0;
  int VADstIndex = -1;
  int VBDstIndex = -1;
  bool VAUsedInPlace = false;

  for (int i = 0; i < 4; ++i) {
    if (i == CandidateMask[i]) {
      VAUsedInPlace = true;

    if (VADstIndex >= 0 || VBDstIndex >= 0)
    if (CandidateMask[i] < 4) {

  if (VADstIndex < 0 && VBDstIndex < 0)

  unsigned VBSrcIndex = 0;
  if (VADstIndex >= 0) {
    VBSrcIndex = CandidateMask[VADstIndex];
    VBDstIndex = VADstIndex;
    VBSrcIndex = CandidateMask[VBDstIndex] - 4;

  if (!VAUsedInPlace)

  InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");

  if (matchAsInsertPS(V1, V2, Mask))
  if (matchAsInsertPS(V2, V1, CommutedMask))

  unsigned InsertPSMask = 0;
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
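// v2f64 shuffles are small enough to enumerate. A single-input mask needs
// only a 2-bit immediate (bit i picks the low or high double for lane i):
// VPERMILPI on AVX, otherwise SHUFPD with the input used for both operands.
// Two-input masks prefer a blend or MOVSD (which takes the low element from
// the second operand and the high element from the first) before settling on
// a two-operand SHUFPD.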
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
                                  Mask, Subtarget, DAG))

    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
    if (Subtarget.hasAVX()) {
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
        X86ISD::SHUFP, DL, MVT::v2f64,

  assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
          DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))

  int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                        Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
          DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
        X86ISD::MOVSD, DL, MVT::v2f64, V2,
                                          Zeroable, Subtarget, DAG))

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
                                  Mask, Subtarget, DAG))

    int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
                          Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
                          Mask[1] < 0 ? -1 : (Mask[1] * 2),
                          Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,

  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))

  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))

  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
                                          Zeroable, Subtarget, DAG))

  if (Subtarget.hasVLX())
                                            Zeroable, Subtarget, DAG))

  if (IsBlendSupported)
                                              Zeroable, Subtarget, DAG);
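// SHUFPS builds its result from two dwords of the first operand (the low
// half of the destination) and two dwords of the second (the high half), so
// a mask mixing the inputs in any other proportion needs setup: with one V2
// element the code pre-combines V2 into V1 via an intermediate SHUFPS, with
// two it pairs same-source elements together, and with three it swaps the
// roles of the operands (LowV/HighV track which input feeds each half).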
  SDValue LowV = V1, HighV = V2;
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      NewMask[V2Index] -= 4;

      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
      NewMask[V1Index] = 2;
      NewMask[V2Index] = 0;
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,

      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
  } else if (NumV2Elements == 3) {

  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
                                          Zeroable, Subtarget, DAG))

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
                                                    Mask, Subtarget, DAG))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);

    if (Subtarget.hasAVX()) {
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);

    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {

  if (NumV2Elements == 1 && Mask[0] >= 4)
                                            Zeroable, Subtarget, DAG))

    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
                                          Zeroable, Subtarget, DAG))

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (Subtarget.preferLowerShuffleAsShift()) {
                                        Subtarget, DAG, true))
    if (NumV2Elements == 0)

  if (NumV2Elements == 0) {
    if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
                                                    Mask, Subtarget, DAG))

    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
      Mask = UnpackLoMask;
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,

  if (NumV2Elements == 1)
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))

  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
                                          Zeroable, Subtarget, DAG))
                                                Zeroable, Subtarget, DAG))

  if (Subtarget.hasVLX())
    if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))

  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,

  if (IsBlendSupported)
                                              Zeroable, Subtarget, DAG);
                                                Mask, Subtarget, DAG))
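// The general v8i16 single-input strategy (roughly): PSHUFLW freely permutes
// the four low words while leaving the high words alone, PSHUFHW does the
// mirror image, and PSHUFD moves 32-bit word pairs between the halves. The
// code below first fixes up words already in the correct half, then migrates
// the remaining inputs across halves so the whole mask is realized with a
// short PSHUFLW/PSHUFHW/PSHUFD sequence.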
  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");

    return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
    for (int i = 0; i != 4; ++i)
      HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
    return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,

  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  int NumHToL = LoInputs.size() - NumLToL;
  int NumHToH = HiInputs.size() - NumLToH;

    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,

  if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
    int PSHUFDMask[4] = {-1, -1, -1, -1};
    int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

    for (int DWord = 0; DWord != 4; ++DWord) {
      int M0 = Mask[2 * DWord + 0];
      int M1 = Mask[2 * DWord + 1];
      if (M0 < 0 && M1 < 0)

      bool Match = false;
      for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
        auto &DWordPair = DWordPairs[j];
        DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
        DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
        PSHUFDMask[DWord] = DOffset + j;
      PSHUFDMask[DWord] = DOffset + DWordPairs.size();

    if (DWordPairs.size() <= 2) {
      DWordPairs.resize(2, std::make_pair(-1, -1));
      int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
                              DWordPairs[1].first, DWordPairs[1].second};
      std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
      PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
      PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
      if ((NumHToL + NumHToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
      if ((NumLToL + NumLToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
                          int AOffset, int BOffset) {
           "Must call this with A having 3 or 1 inputs from the A half.");
           "Must call this with B having 1 or 3 inputs from the B half.");
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
    bool ThreeAInputs = AToAInputs.size() == 3;

    int ADWord = 0, BDWord = 0;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;
    OneInputDWord = (OneInput / 2) ^ 1;

    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
      int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
          int FixIdx = PinnedIdx ^ 1;
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,

          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
            else if (M >= 0 && M == FixFreeIdx)

        if (NumFlippedBToBInputs != 0) {
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);

    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;

    for (int &M : Mask)
      if (M >= 0 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;

  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  auto fixInPlaceInputs =
    if (InPlaceInputs.empty())
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;

    if (IncomingInputs.empty()) {
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;

    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;

  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
  auto moveInputsToRightHalf = [&PSHUFDMask](
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);

    if (IncomingInputs.empty())

    if (ExistingInputs.empty()) {
      for (int Input : IncomingInputs) {
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;

        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                 "Previous placement doesn't match!");

      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        IncomingInputs[0] = InputFixed;
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;

          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;

        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
          M = FreeDWord * 2 + Input % 2;

  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,

    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,

         "Failed to lift all the high half inputs to the low mask!");
  assert(none_of(HiMask, [](int M) { return M >= 0 && M < 4; }) &&
         "Failed to lift all the low half inputs to the high mask!");

    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
    for (int &M : HiMask)
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
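// PSHUFB is a byte-granularity table lookup: each destination byte selects a
// source byte by index, and an index with the high bit set (0x80) produces
// zero. Building one control vector per input and ORing the two shuffled
// results therefore implements any non-lane-crossing two-input shuffle;
// V1InUse/V2InUse let callers drop an input whose control vector only ever
// selects zero.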
14909 "Lane crossing shuffle masks not supported");
14912 int Size = Mask.size();
14913 int Scale = NumBytes /
Size;
14920 for (
int i = 0; i < NumBytes; ++i) {
14921 int M = Mask[i / Scale];
14925 const int ZeroMask = 0x80;
14926 int V1Idx = M <
Size ? M * Scale + i % Scale : ZeroMask;
14927 int V2Idx = M <
Size ? ZeroMask : (M -
Size) * Scale + i % Scale;
14928 if (Zeroable[i / Scale])
14929 V1Idx = V2Idx = ZeroMask;
14933 V1InUse |= (ZeroMask != V1Idx);
14934 V2InUse |= (ZeroMask != V2Idx);
14947 if (V1InUse && V2InUse)
14950 V = V1InUse ? V1 : V2;
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
                                          Zeroable, Subtarget, DAG))

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
                                        Subtarget, DAG, false))
                                                    Mask, Subtarget, DAG))
         "All single-input shuffles should be canonicalized to be V1-input "

  if (Subtarget.hasSSE4A())

  if (NumV2Inputs == 1)
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))

  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
                                          Zeroable, Subtarget, DAG))
                                                Zeroable, Subtarget, DAG))
                                             Zeroable, Subtarget, DAG))

  if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
      !Subtarget.hasVLX()) {
    unsigned PackOpc = 0;
    if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
      V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
      PackOpc = X86ISD::PACKUS;
    } else if (Subtarget.hasSSE41()) {
      for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
      PackOpc = X86ISD::PACKUS;
    } else if (!Subtarget.hasSSSE3()) {
      V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
      V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
      V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
      V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
      PackOpc = X86ISD::PACKSS;

    if (NumEvenDrops == 2) {
      Result = DAG.getBitcast(MVT::v4i32, Result);
      Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);

  if (NumOddDrops == 1) {
    bool HasSSE41 = Subtarget.hasSSE41();
    V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
    V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
    return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
                       MVT::v8i16, V1, V2);

                                                Mask, Subtarget, DAG))

  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
                                        Zeroable, DAG, V1InUse, V2InUse);

                                              Zeroable, Subtarget, DAG);
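// lowerShuffleWithPERMV emits the AVX-512 variable permutes: VPERMV selects
// every element of a single source through an index vector, while VPERMV3
// indexes into the 2*NumElts-element concatenation of two sources. That is
// why, when the shuffle is first widened to a legal type below, mask entries
// referring to V2 are biased up by (Scale - 1) * NumElts - V2 starts at the
// widened NumElts boundary in the concatenation.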
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });

  if (Subtarget.hasFP16()) {
    if (NumV2Elements == 0) {
                                                    Mask, Subtarget, DAG))
    if (NumV2Elements == 1 && Mask[0] >= 8)
              DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))

  MVT ShuffleVT = VT;
    for (int &M : AdjustedMask)
      M += (Scale - 1) * NumElts;

    Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
    Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);

  if (VT != ShuffleVT)
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
                                          Zeroable, Subtarget, DAG))

  if (Subtarget.hasSSE4A())

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  if (NumV2Elements == 0) {
                                                    Mask, Subtarget, DAG))

    for (int i = 0; i < 16; i += 2)
      if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])

    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;

      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          while (j < je && PreDupI16Shuffle[j] >= 0)
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
          DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      bool EvenInUse = false, OddInUse = false;
      for (int i = 0; i < 16; i += 2) {
        EvenInUse |= (Mask[i + 0] >= 0);
        OddInUse |= (Mask[i + 1] >= 0);
        if (EvenInUse && OddInUse)
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
                       OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
          DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    if (SDValue V = tryToWidenViaDuplication())

                                          Zeroable, Subtarget, DAG))
                                                Zeroable, Subtarget, DAG))

  bool IsSingleInput = V2.isUndef();

  if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
    bool V1InUse = false;
    bool V2InUse = false;
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    if (V1InUse && V2InUse) {
                                              Zeroable, Subtarget, DAG))
              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))

      if (Subtarget.hasVBMI())

      if (Subtarget.hasXOP()) {
        return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))

  if (NumV2Elements == 1)
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))

  if (NumEvenDrops) {
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
    if (!IsSingleInput)
                         IsSingleInput ? V1 : V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);

  if (NumOddDrops == 1) {
    V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
    if (!IsSingleInput)
      V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
    return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
                       IsSingleInput ? V1 : V2);

  if (NumV2Elements > 0)
                                              Zeroable, Subtarget, DAG);
  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    for (int &M : LoBlendMask)
    for (int &M : HiBlendMask)

        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
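// Generic 256/512-bit fallback: split both operands into half-width vectors,
// lower each half of the mask independently, and concatenate the results.
// The helpers below first classify which halves of each input a half-mask
// touches; a half-mask that needs the high piece of an input may force a
// pre-shuffle of that input, and SimpleOnly callers reject such cases.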
                                      const APInt &Zeroable,
  if (VT == MVT::v8bf16) {
         "Only for 256-bit or wider vector shuffles!");
  if (VT == MVT::v8f32) {

  int SplitNumElements = NumElements / 2;

  auto SplitVector = [&](SDValue V) {
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
                                   bool &UseHiV1, bool &UseLoV2,
    UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
      } else if (M >= 0) {
        if (M >= SplitNumElements)

  auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
    return !(UseHiV1 || UseHiV2);

    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        V1BlendMask[i] = M;

    bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
    GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
    assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");

    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
    if (!UseLoV2 && !UseHiV2)
    if (!UseLoV1 && !UseHiV1)

    if (UseLoV1 && UseHiV1) {
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    if (UseLoV2 && UseHiV2) {
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);

  if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
                                             const APInt &Zeroable,
  assert(!V2.isUndef() &&
         "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if ((M - Size) != V2BroadcastIdx &&
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx &&

  if (DoBothBroadcast())

  int LaneSize = Size / LaneCount;
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)

  if (SplatOrSplitV1 && SplatOrSplitV2)
  assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");

  int LHSMask[4] = {-1, -1, -1, -1};
  int RHSMask[4] = {-1, -1, -1, -1};
  int SHUFPDMask[4] = {-1, -1, -1, -1};

  for (int i = 0; i != 4; ++i) {
    int LaneBase = i & ~1;
    auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPDMask[i] = M & 1;
  int NumEltsPerLane = NumElts / NumLanes;

  auto getSublanePermute = [&](int NumSublanes) -> SDValue {
    int NumSublanesPerLane = NumSublanes / NumLanes;
    int NumEltsPerSublane = NumElts / NumSublanes;

    for (int i = 0; i != NumElts; ++i) {
      int SrcSublane = M / NumEltsPerSublane;
      int DstLane = i / NumEltsPerLane;

      bool Found = false;
      int DstSubStart = DstLane * NumSublanesPerLane;
      int DstSubEnd = DstSubStart + NumSublanesPerLane;
      for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
        if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))

        CrossLaneMaskLarge[DstSublane] = SrcSublane;
        int DstSublaneOffset = DstSublane * NumEltsPerSublane;
        InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
        DemandedCrossLane.setBit(InLaneMask[i]);

    if (!CanUseSublanes) {
      int NumIdentityLanes = 0;
      bool OnlyShuffleLowestLane = true;
      for (int i = 0; i != NumLanes; ++i) {
        int LaneOffset = i * NumEltsPerLane;
                                    i * NumEltsPerLane))
          NumIdentityLanes++;
        else if (CrossLaneMask[LaneOffset] != 0)
          OnlyShuffleLowestLane = false;
      if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))

    for (int i = 0; i != NumElts; ++i)
      if (!DemandedCrossLane[i])

    if (CrossLaneMask == Mask || InLaneMask == Mask)

  if (SDValue V = getSublanePermute(NumLanes))

  if (!CanUseSublanes)

  if (SDValue V = getSublanePermute(NumLanes * 2))

  if (!Subtarget.hasFastVariableCrossLaneShuffle())

  return getSublanePermute(NumLanes * 4);
  int Size = Mask.size();
  InLaneMask.assign(Mask.begin(), Mask.end());
  for (int i = 0; i < Size; ++i) {
    int &M = InLaneMask[i];
    if (((M % Size) / LaneSize) != (i / LaneSize))
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;

  int Size = Mask.size();
  int LaneSize = Size / 2;

  if (VT == MVT::v4f64 &&
      !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))

    bool LaneCrossing[2] = {false, false};
    for (int i = 0; i < Size; ++i)
      if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
    AllLanes = LaneCrossing[0] && LaneCrossing[1];

    bool LaneUsed[2] = {false, false};
    for (int i = 0; i < Size; ++i)
        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
    AllLanes = LaneUsed[0] && LaneUsed[1];

         "This last part of this routine only works on single input shuffles");
         "In-lane shuffle mask expected");
                                 const APInt &Zeroable,
                                         VT, MemVT, Ld, Ofs, DAG))

  bool IsLowZero = (Zeroable & 0x3) == 0x3;
  bool IsHighZero = (Zeroable & 0xc) == 0xc;

  if (WidenedMask[0] == 0 && IsHighZero) {

  if (!IsLowZero && !IsHighZero) {
    if (Subtarget.hasVLX()) {
      if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
                            ((WidenedMask[1] % 2) << 1);
        return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,

  assert((WidenedMask[0] >= 0 || IsLowZero) &&
         (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");

  unsigned PermMask = 0;
  PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
  PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

  if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
  if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
  int NumElts = Mask.size();

  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Srcs[2] = {-1, -1};
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      int LaneSrc = M / NumLaneElts;
      if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
      else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
      Srcs[Src] = LaneSrc;
      InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
    LaneSrcs[Lane][0] = Srcs[0];
    LaneSrcs[Lane][1] = Srcs[1];

    assert(M1.size() == M2.size() && "Unexpected mask size");
    for (int i = 0, e = M1.size(); i != e; ++i)
      if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])

    assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
    for (int i = 0, e = MergedMask.size(); i != e; ++i) {
      assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
             "Unexpected mask element");

    if (MatchMasks(InLaneMask, RepeatMask)) {
      MergeMasks(InLaneMask, RepeatMask);
      std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
      if (MatchMasks(InLaneMask, RepeatMask)) {
        MergeMasks(InLaneMask, RepeatMask);

  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    if (LaneSrcs[Lane][0] >= 0)
    for (int i = 0; i != NumLaneElts; ++i) {
      int M = Mask[(Lane * NumLaneElts) + i];
      if (RepeatMask[i] < 0)
        RepeatMask[i] = M % NumLaneElts;
      if (RepeatMask[i] < NumElts) {
        if (RepeatMask[i] != M % NumLaneElts)
        LaneSrcs[Lane][0] = M / NumLaneElts;
        if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
        LaneSrcs[Lane][1] = M / NumLaneElts;
    if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)

  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][0];
    for (int i = 0; i != NumLaneElts; ++i) {
      M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;

  for (int Lane = 0; Lane != NumLanes; ++Lane) {
    int Src = LaneSrcs[Lane][1];
    for (int i = 0; i != NumLaneElts; ++i) {
      M = Src * NumLaneElts + i;
      NewMask[Lane * NumLaneElts + i] = M;

  for (int i = 0; i != NumElts; ++i) {
    NewMask[i] = RepeatMask[i % NumLaneElts];
    if (NewMask[i] < 0)
    NewMask[i] += (i / NumLaneElts) * NumLaneElts;
                                int &HalfIdx1, int &HalfIdx2) {
  assert((Mask.size() == HalfMask.size() * 2) &&
         "Expected input mask to be twice as long as output");

  if (UndefLower == UndefUpper)

  unsigned HalfNumElts = HalfMask.size();
  unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + MaskIndexOffset];
    int HalfIdx = M / HalfNumElts;
    int HalfElt = M % HalfNumElts;
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;

                                     int HalfIdx2, bool UndefLower,
  auto getHalfVector = [&](int HalfIdx) {
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;

  SDValue Half1 = getHalfVector(HalfIdx1);
  SDValue Half2 = getHalfVector(HalfIdx2);
  unsigned Offset = UndefLower ? HalfNumElts : 0;

         "Expected 256-bit or 512-bit vector");
         "Completely undef shuffle mask should have been simplified already");

  int HalfIdx1, HalfIdx2;
  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  unsigned NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  unsigned NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
  assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");

  if (NumUpperHalves == 0)

  if (NumUpperHalves == 1) {
    if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
         Subtarget.hasFastVariableCrossLaneShuffle()))
    if (EltWidth == 64 && V2.isUndef())
    if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)

  assert(NumUpperHalves == 2 && "Half vector count went wrong");

  if (NumUpperHalves == 0) {
    if (Subtarget.hasAVX2() && EltWidth == 64)
  int NumLaneElts = NumElts / NumLanes;

  for (unsigned BroadcastSize : {16, 32, 64}) {
    for (int i = 0; i != NumElts; i += NumBroadcastElts)
      for (int j = 0; j != NumBroadcastElts; ++j) {
        int M = Mask[i + j];
        int &R = RepeatMask[j];
        if (0 != ((M % NumElts) / NumLaneElts))
        if (0 <= R && R != M)

    if (!FindRepeatingBroadcastMask(RepeatMask))

    for (int i = 0; i != NumElts; i += NumBroadcastElts)
      for (int j = 0; j != NumBroadcastElts; ++j)
        BroadcastMask[i + j] = j;

    if (BroadcastMask == Mask)
  auto ShuffleSubLanes = [&](int SubLaneScale) {
    int NumSubLanes = NumLanes * SubLaneScale;
    int NumSubLaneElts = NumLaneElts / SubLaneScale;
    int TopSrcSubLane = -1;

    for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
        int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
        int Lane = (M % NumElts) / NumLaneElts;
        if ((0 <= SrcLane) && (SrcLane != Lane))
        int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
        SubLaneMask[Elt] = LocalM;

      for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
        for (int i = 0; i != NumSubLaneElts; ++i) {
          if (M1[i] < 0 || M2[i] < 0)
          if (M1[i] != M2[i])

        auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
        if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))

        for (int i = 0; i != NumSubLaneElts; ++i) {
          int M = SubLaneMask[i];
          assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
                 "Unexpected mask element");
          RepeatedSubLaneMask[i] = M;

        int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
        TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
        Dst2SrcSubLanes[DstSubLane] = SrcSubLane;

      if (Dst2SrcSubLanes[DstSubLane] < 0)

    assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
           "Unexpected source lane");

    for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
      int Lane = SubLane / SubLaneScale;
      auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
      for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
        int M = RepeatedSubLaneMask[Elt];
        int Idx = (SubLane * NumSubLaneElts) + Elt;
        RepeatedMask[Idx] = M + (Lane * NumLaneElts);

    for (int i = 0; i != NumElts; i += NumSubLaneElts) {
      int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
      if (SrcSubLane < 0)
      for (int j = 0; j != NumSubLaneElts; ++j)
        SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);

    if (RepeatedMask == Mask || SubLaneMask == Mask)

  int MinSubLaneScale = 1, MaxSubLaneScale = 1;
    MinSubLaneScale = 2;
        (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
  if (Subtarget.hasBWI() && VT == MVT::v64i8)
    MinSubLaneScale = MaxSubLaneScale = 4;

  for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
    if (SDValue Shuffle = ShuffleSubLanes(Scale))
                                 bool &ForceV1Zero, bool &ForceV2Zero,
                                 const APInt &Zeroable) {
         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
         "Unexpected data type for VSHUFPD");
         "Illegal shuffle mask");

  bool ZeroLane[2] = {true, true};
  for (int i = 0; i < NumElts; ++i)
    ZeroLane[i & 1] &= Zeroable[i];

  bool IsSHUFPD = true;
  bool IsCommutable = true;
  for (int i = 0; i < NumElts; ++i) {
    int Val = (i & 6) + NumElts * (i & 1);
    int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
    if (Mask[i] < Val || Mask[i] > Val + 1)
    if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
      IsCommutable = false;
    SHUFPDMask[i] = Mask[i] % 2;

  if (!IsSHUFPD && !IsCommutable)
  if (!IsSHUFPD && IsCommutable)

  ForceV1Zero = ZeroLane[0];
  ForceV2Zero = ZeroLane[1];
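// Once matchShuffleWithSHUFPD succeeds, the lowering is a single SHUFPD whose
// immediate holds one bit per destination element (0 = even, 1 = odd double
// of the corresponding source lane); ForceV1Zero/ForceV2Zero indicate that an
// input must first be replaced by a zero vector for the match to hold.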
                                         const APInt &Zeroable,
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
         "Unexpected data type for VSHUFPD");

  unsigned Immediate = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,

                                    const APInt &Zeroable,
  assert(VT == MVT::v32i8 && "Unexpected type!");

  if (Zeroable.countl_one() < (Mask.size() - 8))

  V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
  V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
                  {0, 1, 2, 3, 16, 17, 18, 19,
                   4, 5, 6, 7, 20, 21, 22, 23});
  if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&

  auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
    size_t Size = Mask.size();
    assert(Size % 2 == 0 && "Expected even mask size");
    for (unsigned I = 0; I < Size; I += 2) {
      if (Mask[I] != (int)(Begin0 + I / 2) ||
          Mask[I + 1] != (int)(Begin1 + I / 2))

  size_t FirstQtr = NumElts / 2;
  size_t ThirdQtr = NumElts + NumElts / 2;
  bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
  bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
  if (!IsFirstHalf && !IsSecondHalf)

  if (Shuffles.size() != 2)

  if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
      IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
    FirstHalf = Shuffles[0];
    SecondHalf = Shuffles[1];
  } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
             IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
    FirstHalf = Shuffles[1];
    SecondHalf = Shuffles[0];
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
                                                  Mask, Subtarget, DAG))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

    unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                            ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
    return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
                                             Mask, DAG, Subtarget))
                                          Zeroable, Subtarget, DAG))
                                                Zeroable, Subtarget, DAG))

      !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
       !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))

  if (V1IsInPlace || V2IsInPlace)
                                                Zeroable, Subtarget, DAG);
          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))

  if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))

  if (Subtarget.hasVLX())
                                              Zeroable, Subtarget, DAG))
                                                Zeroable, Subtarget, DAG);
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
                                          Zeroable, Subtarget, DAG))

  if (Subtarget.preferLowerShuffleAsShift())
                                        Subtarget, DAG, true))

        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,

  if (Subtarget.hasVLX()) {
                                            Zeroable, Subtarget, DAG))
                                              Zeroable, Subtarget, DAG))

  if (V1IsInPlace || V2IsInPlace)
                                                Zeroable, Subtarget, DAG);
          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))

  if (!V1IsInPlace && !V2IsInPlace)
            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
                                              Zeroable, Subtarget, DAG);
  // v8f32 shuffle lowering (excerpt).
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  // ...(Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // If the mask repeats across the two 128-bit lanes, reuse the 4-element
  // idioms.
  assert(RepeatedMask.size() == 4 &&
         "Repeated masks must be half the mask width!");
  if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
    return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
  if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
    return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
  // ...
  return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                     getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  // ...(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
  // Lane-crossing single-input shuffles: variable permutes.
  return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
  // ...
  return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
  // ...(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
  if (Subtarget.hasVLX())
    // ...(Zeroable, Subtarget, DAG))
  // Final fallback:
  //   return ...(Zeroable, Subtarget, DAG);
  // v8i32 shuffle lowering (excerpt).
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
  // ...(Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  if (Subtarget.preferLowerShuffleAsShift()) {
    // ...(Subtarget, DAG, /*BitwiseOnly=*/true))
    if (NumV2Elements == 0)
      // ...
  }

  // If the mask repeats in each 128-bit lane, a single PSHUFD immediate
  // covers both lanes.
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
    // ...
  }

  if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
    // ...
  if (Subtarget.hasVLX()) {
    // ...(Zeroable, Subtarget, DAG))
    // ...(Zeroable, Subtarget, DAG))
  }
  // ...(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
  // Lane-crossing single-input shuffles: variable permute.
  return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  // ...(CastV1, CastV2, DAG);
  // ...(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
  // Final fallback:
  //   return ...(Zeroable, Subtarget, DAG);
  // v16i16 shuffle lowering (excerpt; only call tails survive here).
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
  // ...(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // ...(Subtarget, DAG, false))
  // ...(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
  // ...(DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
  // ...(DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
  // ...(Zeroable, Subtarget, DAG))
  if (Subtarget.hasBWI())
    // ...
  // ...(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
  // ...(DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
  // v32i8 shuffle lowering (excerpt; only call tails survive here).
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
  // ...(Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // ...(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
  // ...(DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
  // ...(Zeroable, Subtarget, DAG))
  if (Subtarget.hasVBMI())
    // ...
  // ...(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
  // ...(DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
  if (Subtarget.hasVLX())
    // ...(Mask, Zeroable, DAG))
  // Generic 256-bit shuffle dispatch (excerpt).
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  // A single V2 element landing in lane 0 can be lowered as an insertion.
  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    // ...(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
  // ...
  if (ElementBits < 32) {
    // ... (sub-dword elements are handled via integer types)
  }
  // ...
  if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
    // ... (half-precision shuffles are performed on the integer type)
  }
17798 "Unexpected element type size for 128bit shuffle.");
17808 assert(Widened128Mask.
size() == 4 &&
"Shuffle widening mismatch");
17811 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
17812 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
17813 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
17824 bool OnlyUsesV1 =
isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
17826 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
17836 bool IsInsert =
true;
17838 for (
int i = 0; i < 4; ++i) {
17839 assert(Widened128Mask[i] >= -1 &&
"Illegal shuffle sentinel value");
17840 if (Widened128Mask[i] < 0)
17844 if (Widened128Mask[i] < 4) {
17845 if (Widened128Mask[i] != i) {
17851 if (V2Index >= 0 || Widened128Mask[i] != 4) {
17858 if (IsInsert && V2Index >= 0) {
17871 Widened128Mask.
clear();
17877 int PermMask[4] = {-1, -1, -1, -1};
17879 for (
int i = 0; i < 4; ++i) {
17880 assert(Widened128Mask[i] >= -1 &&
"Illegal shuffle sentinel value");
17881 if (Widened128Mask[i] < 0)
17884 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17891 PermMask[i] = Widened128Mask[i] % 4;
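// Editorial sketch: each PermMask entry picks one 128-bit quarter of its
// source (hence the % 4). E.g. Widened128Mask = {0, 1, 4, 5} keeps V1's low
// 256 bits and appends V2's low 256 bits, giving PermMask = {0, 1, 0, 1} with
// the upper chunks sourced from V2 — the shape VSHUFF64x2/VSHUFI64x2 encodes
// with two immediate bits per chunk.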
  // v8f64 shuffle lowering (excerpt).
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    // Non-lane-crossing shuffles: one VPERMILPD immediate bit per element.
    unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                            ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                            ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                            ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
    return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                       DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
    // A mask that repeats across the 256-bit halves can use VPERMPD:
    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                       getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  }
  // ...(V2, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // v16f32 shuffle lowering (excerpt).
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // If the mask repeats in each 128-bit lane, use the 4-element idioms.
  assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
  if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
    return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
  if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
    return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
  // ...
  return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
                     getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  // ...(Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // ...(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
  // ...(DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
  // Lane-crossing single-input shuffles: variable permute.
  return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
  // ...(Zeroable, Subtarget, DAG))
  // v8i64 shuffle lowering (excerpt).
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (Subtarget.preferLowerShuffleAsShift())
    // ...(Subtarget, DAG, /*BitwiseOnly=*/true))
  // A mask that widens to a v16i32 PSHUFD pattern can use the in-lane dword
  // shuffle on a bitcast input:
  //   DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, ...)
  // A mask repeating across the 256-bit halves can use VPERMQ:
  return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
                     getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  // ...(V2, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  if (Subtarget.hasBWI())
    // ...
  // ...(Zeroable, Subtarget, DAG))
  // v16i32 shuffle lowering (excerpt).
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
  // ...(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
  if (Subtarget.preferLowerShuffleAsShift()) {
    // ...(Subtarget, DAG, /*BitwiseOnly=*/true))
    if (NumV2Elements == 0)
      // ...
  }

  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
    // ...
  }
  // ...(Subtarget, DAG, false))
  if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
    // ...
  // ...(Zeroable, Subtarget, DAG))
  if (Subtarget.hasBWI())
    // ...
  // ...(CastV1, CastV2, DAG);
  // ...(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // v32i16 shuffle lowering (excerpt; only call tails survive here).
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
  // ...(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
  // ...(Subtarget, DAG, false))
  // ...(RepeatedMask, Subtarget, DAG);
  // ...(Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // ...(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
  // v64i8 shuffle lowering (excerpt).
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
  // ...(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  // ...(Zeroable, Subtarget, DAG))
  if (!Subtarget.hasVBMI())
    // ...(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
  // ...(DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
  // ...(Zeroable, Subtarget, DAG))
  // ...(Mask, Subtarget, DAG))
  if (Subtarget.hasVBMI())
    // ...
  // Fall back to a PSHUFB-style blend of the two inputs.
  bool V1InUse, V2InUse;
  // ...(DAG, V1InUse, V2InUse);
  // ...(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
  if (Subtarget.hasVBMI())
    // ...
// Generic 512-bit shuffle dispatch (excerpt).
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  MVT VT, SDValue V1, SDValue V2,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/ basic ISA!");

  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    // ...(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
  // ...
  // Without BWI, v32i16/v64i8 must be split into two 256-bit halves.
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
    // ...
  }
  if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
    if (!Subtarget.hasBWI())
      // ...
  }
  // Helper (excerpt): find the single displacement ShiftAmt such that every
  // defined mask element satisfies Mask[i] == i + ShiftAmt.
  int NumElts = Mask.size();
  int ShiftAmt = -1;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    // assert(... && "Unexpected mask index.");
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;
      continue;
    }
    if (ShiftAmt != M - i)
      return -1;
  }
  assert(ShiftAmt >= 0 && "All undef?");
  return ShiftAmt;
// Determine whether a vXi1 shuffle can be implemented as a mask-register
// shift; returns the shift amount and sets Opcode, or returns -1.
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
                                    int MaskOffset, const APInt &Zeroable) {
  int Size = Mask.size();

  // The shifted-in positions must be zeroable.
  auto CheckZeros = [&](int Shift, bool Left) {
    for (int j = 0; j < Shift; ++j)
      if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
        return false;
    return true;
  };

  // The surviving positions must form a contiguous run of the source.
  auto MatchShift = [&](int Shift, bool Left) {
    unsigned Pos = Left ? Shift : 0;
    unsigned Low = Left ? 0 : Shift;
    unsigned Len = Size - Shift;
    return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
  };

  for (int Shift = 1; Shift != Size; ++Shift)
    for (bool Left : {true, false})
      if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
        Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
        return Shift;
      }

  return -1;
}
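// Editorial example: a KSHIFTL by 2 moves mask bit j to j + 2 and shifts
// zeros into the bottom, so for v8i1 a shuffle mask <Z, Z, 0, 1, 2, 3, 4, 5>
// passes CheckZeros(2, /*Left=*/true) (positions 0-1 zeroable) and
// MatchShift(2, true) (positions 2-7 hold the sequential run 0-5).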
// vXi1 (mask register) shuffle lowering (excerpt).
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                MVT VT, SDValue V1, SDValue V2,
                                const APInt &Zeroable,
                                const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");

  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
  // ...
  // Recognise a leading identity run from a single source followed only by
  // undef/zeroable elements.
  int Src = -1;
  int SubvecElts = 0;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] >= 0) {
      if (Src < 0)
        Src = Mask[i] / NumElts;
      if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
        break;
    }
    ++SubvecElts;
  }
  assert(SubvecElts != NumElts && "Identity shuffle?");

  // Extract the leading subvector and zero-extend it to the full width.
  if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
    assert(Src >= 0 && "Expected a source!");
    // ...
  }

  // Otherwise try a mask-register shift.
  if (ShiftAmt >= 0) {
    // Narrow mask types are shifted inside the widest legal mask register;
    // a right shift must first move the bits up to the wide register's top.
    if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
      Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
                        DAG.getTargetConstant(WideElts - NumElts, DL,
                                              MVT::i8));
      ShiftAmt += WideElts - NumElts;
    }
    Res = DAG.getNode(Opcode, DL, WideVT, Res,
                      DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
    // ...
  }
  // ...
  // A blend with zero is an AND with a constant mask; the predicate per
  // element is:
  //   return Zeroable[M.index()] || (M.value() == (int)M.index());
  if (IsBlendWithZero) {
    const unsigned Width = std::max<unsigned>(NumElts, 8u);
    APInt MaskValue = (~Zeroable).zextOrTrunc(Width);
    // ...
  }

  // Otherwise extend the i1 elements to a legal integer vector, shuffle
  // there, and truncate back.
  MVT ExtVT;
  switch (VT.SimpleTy) {
  // ...
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
    break;
  // ...
  case MVT::v64i1:
    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
    ExtVT = MVT::v64i8;
    break;
  }
  // ...
  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
      (Subtarget.hasDQI() && (NumElems < 32)))
    // ...
// Decide whether commuting the shuffle operands yields the canonical form.
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
  int NumElements = Mask.size();

  int NumV1Elements = 0, NumV2Elements = 0;
  for (int M : Mask)
    if (M < 0)
      continue;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;

  // Commute so that more elements come from V1 than from V2; the rest of the
  // lowering then only has to match the V1-heavy form.
  if (NumV2Elements > NumV1Elements)
    return true;

  assert(NumV1Elements > 0 && "No V1 indices");

  if (NumV2Elements == 0)
    return false;

  if (NumV1Elements == NumV2Elements) {
    // Tie-break 1: minimize V2 uses in the low half.
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : Mask.slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
      return true;
    if (LowV2Elements == LowV1Elements) {
      // Tie-break 2: keep the sum of V1 indices at or below V2's.
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= NumElements)
          SumV2Indices += i;
        else if (Mask[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices)
        return true;
      if (SumV2Indices == SumV1Indices) {
        // Tie-break 3: fewer odd indices for V1 than for V2.
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = Mask.size(); i < Size; ++i)
          if (Mask[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (Mask[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return true;
      }
    }
  }

  return false;
}
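// Editorial note: the cascade above is a deterministic tie-break chain:
// commute when V2 supplies more elements; on equal counts, when V2 dominates
// the low half; then when V2's index sum is lower; finally when V2 has fewer
// odd indices. This yields one canonical operand order so matching only has
// to handle the V1-heavy form of each pattern.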
  // Decide whether the shuffle can instead be folded into an AVX512 masked
  // operation on its input (excerpt).
  if (!V.getValueType().isSimple())
    return false;

  MVT VT = V.getSimpleValueType().getScalarType();
  if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
    return false;

  // Sub-512-bit i8/i16 vectors would additionally need VLX.
  if ((VT == MVT::i16 || VT == MVT::i8) &&
      V.getSimpleValueType().getSizeInBits() < 512)
    return false;

  auto HasMaskOperation = [&](SDValue V) {
    switch (V->getOpcode()) {
    default:
      return false;
    // ... (opcodes with masked forms fall through to the checks below)
    }
    // ...
    if (!V->hasOneUse())
      return false;

    return true;
  };

  if (HasMaskOperation(V))
    return true;
  // Top-level VECTOR_SHUFFLE lowering (excerpt).
  MVT VT = Op.getSimpleValueType();
  // ...
  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
         "Can't lower MMX shuffles");

  bool V1IsUndef = V1.isUndef();
  bool V2IsUndef = V2.isUndef();
  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);
  // ...
  // Canonicalize mask entries pointing into an undef V2 to undef, so later
  // matching only has to inspect the mask.
  if (V2IsUndef &&
      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
    SmallVector<int, 8> NewMask(OrigMask);
    for (int &M : NewMask)
      if (M >= NumElements)
        M = -1;
    return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
  }

  // Check for illegal shuffle mask element index values.
  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
  (void)MaskUpperLimit;
  assert(all_of(OrigMask,
                [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
         "Out of bounds shuffle index");

  // Elements known undef or zero are collected into "Zeroable" for the
  // benefit of the per-type lowering routines.
  APInt KnownUndef, KnownZero;
  // ...
  APInt Zeroable = KnownUndef | KnownZero;
  // ...
  // Try to widen the shuffle to use elements twice as wide.
  int NewNumElts = NumElements / 2;
  // ...
  bool UsedZeroVector = false;
  // assert: "V2's non-undef elements are used?!"
  for (int i = 0; i != NewNumElts; ++i)
    if (WidenedMask[i] == SM_SentinelZero) {
      WidenedMask[i] = i + NewNumElts;
      UsedZeroVector = true;
    }
  // ...
  if (UsedZeroVector)
    V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
  // ...
  assert(NumElements == (int)Mask.size() &&
         "canonicalizeShuffleMaskWithHorizOp "
         "shouldn't alter the shuffle mask size");

  // Turn undemanded lanes of constant operands into undef so the constants
  // fold away.
  auto CanonicalizeConstant = [VT, &DL, &DAG](SDValue V) {
    // ... (rebuild V with undef in the never-read lanes:
    //      if (Undefs.any() && ...) ...)
    return V;
  };
  V1 = CanonicalizeConstant(V1);
  V2 = CanonicalizeConstant(V2);
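// Editorial note: when the widened mask needs zeros, the code reuses the
// now-unreferenced V2 operand as the zero source — each SM_SentinelZero entry
// is redirected at V2's lanes (i + NewNumElts) and V2 is only materialized as
// an actual zero vector if at least one entry required it.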
18971 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18986 if (NumVecBits != 128 && NumVecBits != 256)
18989 if (NumElementBits == 32 || NumElementBits == 64) {
18990 unsigned NumLargeElements = 512 / NumElementBits;
18998 Subtarget, DAG,
DL);
19002 Subtarget, DAG,
DL);
19010 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
19011 VecVT == MVT::v16i16) {
19016 Passthru = Passthru.
isUndef()
  MVT VT = Op.getSimpleValueType();
  // ...

  // VSELECT lowering via variable blends (excerpt).
  MVT VT = Op.getSimpleValueType();
  // ...
  MVT CondVT = Cond.getSimpleValueType();
  unsigned CondEltSize = Cond.getScalarValueSizeInBits();

  // A vXi1 condition is the AVX-512 mask-select form.
  if (CondEltSize == 1)
    // ...
  // Variable blends need SSE4.1.
  if (!Subtarget.hasSSE41())
    // ...
  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
    // ...
  // Match the condition element width to the value element width first.
  if (CondEltSize != EltSize) {
    // ...
  }
  // 256-bit selects on sub-32-bit elements need AVX2 (or XOP).
  if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
      !Subtarget.hasXOP()) {
    // ...
    if (FreeCond && (FreeLHS || FreeRHS))
      // ...
  }
  // ...
  if (Subtarget.hasAVX2())
    // ...
  switch (VT.SimpleTy) {
  // ...
  case MVT::v16f16: {
    // ...
  }
  }
  // Scalar extract via SSE4.1 (excerpt).
  MVT VT = Op.getSimpleValueType();
  // ...
  // PEXTRB produces its result in a 32-bit GPR; users truncate as needed.
  SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
                                Op.getOperand(1));
  // ...
  if (VT == MVT::f32) {
    // Only treat an f32 extract as an integer extract when its sole user
    // actually wants the raw bits:
    if (!Op.hasOneUse())
      // ...
    // ... (|| User->getValueType(0) != MVT::i32)
  }
  // ...
  if (VT == MVT::i32 || VT == MVT::i64)
    // ...

  // Extracting one bit from a vXi1 mask vector (excerpt).
  MVT EltVT = Op.getSimpleValueType();
  // assert: "Unexpected vector type in ExtractBitFromMaskVector"
  // ...
  if (NumElts == 1) {
    // ...
  }
  unsigned IdxVal = IdxC->getZExtValue();

  // Compute which lanes an extract's users actually demand (excerpt).
  MVT VT = N->getSimpleValueType(0);
  // ...
  switch (User->getOpcode()) {
  case X86ISD::PEXTRB:
  case X86ISD::PEXTRW:
    if (!isa<ConstantSDNode>(User->getOperand(1)))
      return DemandedElts; // Variable index: assume all lanes are needed.
    DemandedElts.setBit(User->getConstantOperandVal(1));
    break;
  // ...
  }
  if (!User->getValueType(0).isSimple() ||
      !User->getValueType(0).isVector()) {
    // ...
    return DemandedElts;
  }
  // ...
  return DemandedElts;
  // ...
  return DemandedElts;
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  // ...
  unsigned IdxVal = IdxC->getZExtValue();
  // ...
  // Extracts from 256/512-bit vectors first narrow to the 128-bit chunk that
  // holds the element; the index is then taken modulo the chunk size.
  IdxVal &= ElemsPerChunk - 1;
  // ...
  MVT VT = Op.getSimpleValueType();

  if (VT == MVT::i16) {
    // ...
    if (Subtarget.hasFP16())
      // ...
    // PEXTRW produces its result in a 32-bit GPR.
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
                                  Op.getOperand(1));
    // ...
  }

  if (Subtarget.hasSSE41())
    // ...

  if (VT == MVT::i8) {
    // If only nearby bytes are demanded, extract a whole dword/word with
    // PEXTRD/PEXTRW and shift the wanted byte into place.
    int DWordIdx = IdxVal / 4;
    if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
      // ...
      int ShiftVal = (IdxVal % 4) * 8;
      // ...
    }

    int WordIdx = IdxVal / 2;
    if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
      // ...
      int ShiftVal = (IdxVal % 2) * 8;
      // ...
    }
  }
  // ...
  Mask[0] = static_cast<int>(IdxVal);
  // ...
  int Mask[2] = { 1, -1 };
  // ...
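// Editorial example: extracting byte 3 of a v16i8 when only bytes 0-3 are
// demanded becomes a PEXTRD of dword 0 followed by a logical right shift of
// (3 % 4) * 8 == 24 bits and a truncate — avoiding a dedicated PEXTRB.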
  // INSERT_VECTOR_ELT lowering (excerpt).
  MVT VT = Op.getSimpleValueType();
  // ...
  if (EltVT == MVT::i1)
    // ... (mask-vector insertion has its own path)
  if (EltVT == MVT::bf16) {
    // ...
  }

  // Variable index: only lower when some form of element masking exists.
  if (!(Subtarget.hasBWI() ||
        (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
        (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
    // ...
  // Select between the original lanes and a splat of the new scalar, keyed
  // on lane index == insertion index.
  for (unsigned I = 0; I != NumElts; ++I)
    // ...
  return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
                         ISD::SETEQ);

  if (N2C->getAPIntValue().uge(NumElts))
    // ... (out-of-range constant index)
  uint64_t IdxVal = N2C->getZExtValue();

  if (IsZeroElt || IsAllOnesElt) {
    // Inserting 0 / -1 can be done with constant-vector blends.
    if (IsAllOnesElt &&
        ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
         ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
      // ... (OR with a one-hot all-ones constant vector)
      CstVectorElts[IdxVal] = OnesCst;
      // ...
    }
    // With SSE4.1 a constant insertion is a single shuffle blend.
    if (Subtarget.hasSSE41() && /* ... */) {
      SmallVector<int, 8> BlendMask;
      for (unsigned i = 0; i != NumElts; ++i)
        BlendMask.push_back(i == IdxVal ? i + NumElts : i);
      // ...
    }
  }
  // ...
  if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
      (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
    // Insertion into lane 0 of a 256-bit vector: BLENDI against a splat.
    return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
                       DAG.getTargetConstant(1, dl, MVT::i8));
  }

  unsigned NumEltsIn128 = 128 / EltSizeInBits;
  assert(isPowerOf2_32(NumEltsIn128) &&
         "Vectors will always have power-of-two number of elements.");

  // Insertions into the upper half of a wide vector can be a blend when the
  // target supports it.
  if (IdxVal >= NumEltsIn128 &&
      ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
       (Subtarget.hasAVX() && (EltSizeInBits >= 32) && /* ... */))) {
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
    // ...
  }
  // Otherwise operate on the 128-bit chunk containing the element.
  unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
  // ...
  if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
      EltVT == MVT::f16 || EltVT == MVT::i64) {
    // ...
  }
  if (EltVT == MVT::i16 || EltVT == MVT::i8) {
    // ...
  }

  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }
    // ...
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // When the scalar is already in a vector register, a single-lane blend:
      //   return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, ...);
      // Otherwise INSERTPS places the scalar directly; bits [5:4] of the
      // immediate are the destination lane.
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
                         DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
    }
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      // ...
  }
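// Editorial example: inserting a scalar into lane 5 of a v8i16 emits
// PINSRW xmm, r32, 5 — the N2 operand is the lane-index immediate — and the
// same shape covers PINSRB for v16i8 on SSE4.1.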
  // FLDEXP lowering via AVX-512 SCALEF (excerpt): ldexp(X, Exp) == X * 2^Exp,
  // which is what vscalef computes directly.
  MVT XTy = X.getSimpleValueType();
  // ...
  if (!Subtarget.hasFP16())
    // ...
  // ...(128 / X.getSimpleValueType().getSizeInBits());
  // ...
  return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
  // ...
  if (Subtarget.hasFP16()) {
    if (Subtarget.hasVLX())
      return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
    // ...
  }
  // ...(X.getSimpleValueType().changeTypeToInteger());
  if (Subtarget.hasFP16())
    return DAG.getNode(X86ISD::SCALEF, DL, XTy, X, Exp);
  // SCALAR_TO_VECTOR lowering (excerpt).
  MVT OpVT = Op.getSimpleValueType();
  // ...
  // assert: "Expected an SSE type!"
  // ...
  if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
    // ...

  // vXi1 subvector insert/extract need custom lowering (excerpt).
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
  // ...
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 extract_subvectors need custom lowering");
  // ...
  uint64_t IdxVal = Op.getConstantOperandVal(1);
unsigned X86TargetLowering::getGlobalWrapperKind(
    const GlobalValue *GV, const unsigned char OpFlags) const {
  // References to absolute symbols are never PC-relative.
  if (GV && GV->isAbsoluteSymbolRef())
    return X86ISD::Wrapper;

  // Certain operand flags are RIP-relative under the RIP-relative PIC model.
  if (Subtarget.isPICStyleRIPRel() &&
      (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
       OpFlags == X86II::MO_DLLIMPORT))
    return X86ISD::WrapperRIP;

  // GOTPCREL references always use RIP.
  if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
    return X86ISD::WrapperRIP;

  return X86ISD::Wrapper;
}

// Constant pool and jump table lowering (excerpt): classify the reference,
// build the target node, and wrap it.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
  // ...
  Result =
      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
  // ...
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
  // ...
  EVT PtrVT = Op.getValueType();
  // ...
  Result =
      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
  // ...
  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);

// Block address lowering (excerpt).
  unsigned char OpFlags =
      Subtarget.classifyBlockAddressReference();
  // ...
  EVT PtrVT = Op.getValueType();
  // ...
  Result =
      DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                                 bool ForCall,
                                                 bool *IsImpCall) const {
  // Unpack the global address or external symbol.
  const GlobalValue *GV = nullptr;
  const char *ExternalSym = nullptr;
  if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op))
    GV = G->getGlobal();
  else if (const auto *ES = dyn_cast<ExternalSymbolSDNode>(Op))
    ExternalSym = ES->getSymbol();

  // Calculate the flags that drive the addressing form.
  unsigned char OpFlags;
  if (ForCall)
    OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
  else
    OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
  // ...
  EVT PtrVT = Op.getValueType();
  // ...
  int64_t GlobalOffset = 0;
  // ...
  // A call that needs neither a load nor a PIC base can use the bare target
  // node directly.
  if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
    return Result;
  // ...
  if (/* DLL-imported callee && */
      Mod.getModuleFlag("import-call-optimization")) {
    assert(ForCall && "Should only enable import call optimization if we are "
                      "lowering a call");
    // ...
  }

  Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
                          const EVT PtrVT, unsigned ReturnReg,
                          unsigned char OperandFlags,
                          bool LoadGlobalBaseReg = false,
                          bool LocalDynamic = false) {
  // ...
  // With TLSDESC, the TLS base address is computed once per function; reuse
  // the existing call sequence rather than emitting a second one.
  if (LocalDynamic && UseTLSDESC) {
    // ... (walk from the cached CopyFromReg back to its call sequence; each
    //      step asserts "Unexpected TLSDESC DAG")
    auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
    // ...
    Ret = SDValue(CopyFromRegOp, 0);
  }
  // ...
  unsigned CallType = UseTLSDESC     ? X86ISD::TLSDESC
                      : LocalDynamic ? X86ISD::TLSBASEADDR
                                     : X86ISD::TLSADDR;
  // ...
  if (LoadGlobalBaseReg) {
    // The global base register is passed as a glued implicit input.
    // ...
    Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
  } else {
    Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
  }
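// Editorial note: the three node kinds map to the classic TLS call shapes —
// TLSADDR for general dynamic (__tls_get_addr per symbol), TLSBASEADDR for
// local dynamic (one base call with per-symbol offsets folded in afterwards),
// and TLSDESC for the descriptor ABI, where the resolver is reached through
// an indirect call slot.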
// General/local-dynamic TLS lowering (excerpt).
static SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                             SelectionDAG &DAG,
                                             const EVT PtrVT, bool Is64Bit,
                                             bool Is64BitLP64) {
  // ...
  // The __tls_get_addr result comes back in RAX/EAX.
  unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
  // ...
}

// Initial-exec / local-exec address computation (excerpt).
  unsigned char OperandFlags = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  // ...
  WrapperKind = X86ISD::WrapperRIP;
  // ...

// LowerGlobalTLSAddress (excerpt).
  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = Op.getValueType();

  if (Subtarget.isTargetELF()) {
    // ...
    if (Subtarget.is64Bit()) {
      if (Subtarget.isTarget64BitLP64())
        // ...
    }
    // ...(Subtarget.isTarget64BitLP64());
    // ...(PositionIndependent);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin uses the TLVP ABI: load the descriptor and call through its
    // first word.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = 0;
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32) {
      OpFlag = X86II::MO_TLVP_PIC_BASE;
      WrapperKind = X86ISD::Wrapper;
    } else {
      OpFlag = X86II::MO_TLVP;
      WrapperKind = X86ISD::WrapperRIP;
    }
    // ...
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    // ...
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    // ...
  }

  if (Subtarget.isOSWindows()) {
    // Windows TLS: ThreadPointer -> ThreadLocalStoragePointer -> module slot
    // (_tls_index) -> variable offset.
    SDValue TlsArray = Subtarget.is64Bit()
                           // ...
                           : (Subtarget.isTargetWindowsGNU()
                                  // ...
                              );
    // ...
    DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
    // ...
    res = ThreadPointer;
    // ...
    if (Subtarget.is64Bit())
      // ...(MachinePointerInfo(), MVT::i32);
    IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
    // ...
    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
    // ...
  }

  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
    // ...
  }
20346 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
20387 "Unexpected opcode!");
20388 bool IsStrict =
Op->isStrictFPOpcode();
20389 unsigned OpNo = IsStrict ? 1 : 0;
20391 MVT SrcVT = Src.getSimpleValueType();
20392 MVT VT =
Op.getSimpleValueType();
20394 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
20395 (VT != MVT::f32 && VT != MVT::f64))
20401 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
20408 {Op.getOperand(0), InVec});
20428 "Unexpected opcode!");
20429 bool IsStrict =
Op->isStrictFPOpcode();
20430 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
20431 MVT SrcVT = Src.getSimpleValueType();
20432 MVT VT =
Op.getSimpleValueType();
20434 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
20439 assert(Subtarget.hasFP16() &&
"Expected FP16");
20443 SDValue CvtVec = DAG.
getNode(
Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
20444 {Op.getOperand(0), InVec});
// useVectorCast-style predicate (excerpt): can this scalar int -> FP cast be
// performed with a 128-bit vector instruction?
  switch (Opcode) {
  case ISD::SINT_TO_FP:
    if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
      return false;
    // CVTDQ2PS, or (V)CVTDQ2PD with AVX.
    return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
  case ISD::UINT_TO_FP:
    if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
      return false;
    // VCVTUDQ2PS / VCVTUDQ2PD.
    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
  default:
    return false;
  }
  // ...
  if (FromVT != Vec128VT)
    // ...

// FP -> int -> FP round trips can stay in the SSE domain as a packed cvt
// pair, e.g. cvttps2dq + cvtdq2ps (excerpt).
  MVT SrcVT = X.getSimpleValueType();
  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
    return SDValue();

  // Both directions must exist for this element combination.
  if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
      (IntVT != MVT::i32 && IntVT != MVT::i64))
    return SDValue();

  unsigned ToIntOpcode =
      SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
  unsigned ToFPOpcode =
      IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;

  unsigned Width = 128;
  if (Subtarget.hasVLX() && Subtarget.hasDQI()) {
    // Native 128-bit forms exist for every case.
  } else if (IsUnsigned || IntVT == MVT::i64) {
    // Only the 512-bit AVX512F forms cover these; widen.
    // ...
  }

  MVT VecSrcVT, VecIntVT, VecVT;
  unsigned NumElts, SrcElts, VTElts;
  if (Width == 512) {
    NumElts = std::min(Width / IntSize, Width / SrcSize);
    // ...
  } else {
    NumElts = Width / IntSize;
    SrcElts = Width / SrcSize;
    VTElts = Width / VTSize;
  }
  // vXi64 -> FP lowering (excerpt).
  bool IsStrict = Op->isStrictFPOpcode();
  MVT VT = Op->getSimpleValueType(0);
  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);

  if (Subtarget.hasDQI()) {
    // AVX512DQ without VLX: widen to the 512-bit VCVTQQ2PS/PD form.
    assert(!Subtarget.hasVLX() && "Unexpected features");
    assert((Src.getSimpleValueType() == MVT::v2i64 ||
            Src.getSimpleValueType() == MVT::v4i64) &&
           "Unsupported custom type");
    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
           "Unexpected VT!");
    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
    // ...
    if (IsStrict)
      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
                        {Op->getOperand(0), Src});
    else
      Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
    // ...
  }

  // Unsigned vXi64 -> v4f32 without DQI: convert each element with the
  // scalar *signed* cvt; large inputs are pre-halved and the converted
  // result doubled afterwards.
  if (VT != MVT::v4f32 || IsSigned)
    return SDValue();
  // ...
  for (int i = 0; i != 4; ++i) {
    // ... (per-element (STRICT_)SINT_TO_FP of {Op.getOperand(0), Elt})
    Chains[i] = SignCvts[i].getValue(1);
  }
  // ... (the doubling is the fadd {Chain, SignCvt, SignCvt})
// Promotion of int -> f16 conversions (excerpt): convert at f32 and round
// the result back down.
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  // ...
  MVT VT = Op.getSimpleValueType();
  // ...
  //   DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
  //   DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);

// isLegalConversion-style predicate (excerpt): does a native conversion
// instruction exist for this combination?
  if (FloatVT.getScalarType() != MVT::f16 || Subtarget.hasVLX()) {
    if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
      return true; // cvttps2dq
    if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
      return true;
    if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
      return true;
    if (VT == MVT::v16i32)
      return true; // AVX512F
    if (VT == MVT::v8i64 && FloatVT == MVT::v8f16 && Subtarget.hasFP16())
      return true;
    if (VT == MVT::v8i64 && Subtarget.hasDQI())
      return true;
    if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
        (VT == MVT::v2i64 || VT == MVT::v4i64))
      return true;
  }
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  // ...
  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
    return LowerWin64_INT128_TO_FP(Op, DAG);
  // ...
  if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
    // Widen v2i32 to v4i32 and convert with (STRICT_)CVTSI2P (cvtdq2pd):
    //   ...(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other}, ...)
    //   return DAG.getNode(X86ISD::CVTSI2P, dl, VT, ...);
  }

  if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
    // ...

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");
  // ...
  // i32 and (on 64-bit targets) i64 convert directly via cvtsi2ss/sd.
  if (SrcVT == MVT::i32 && UseSSEReg)
    // ...
  if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
    // ...
  // i16 is promoted to i32 first.
  if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
    // ...
  }
  if (VT == MVT::f128 || !Subtarget.hasX87())
    // ... (libcall)

  // Everything else goes through memory and x87 FILD.
  SDValue ValueToStore = Src;
  if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
    // On 32-bit targets, spill the i64 through an SSE register so the store
    // is a single instruction.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  // ...
  MachinePointerInfo MPI =
      // ... (fixed stack slot info for the spill)
  Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
  std::pair<SDValue, SDValue> Tmp =
      BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);

// BuildFILD (excerpt): load the integer with x87 FILD; if the result lives
// in an SSE register, round-trip through memory with FST/FLD to leave x87.
  if (UseSSE)
    Tys = DAG.getVTList(MVT::f80, MVT::Other);
  else
    Tys = DAG.getVTList(DstVT, MVT::Other);
  SDValue FILDOps[] = {Chain, Pointer};
  // ...
  Chain = Result.getValue(1);
  // ...
  SDValue FSTOps[] = {Chain, Result, StackSlot};
  // ...
  //   ...getLoad(DstVT, DL, Chain, StackSlot, ...)
  Chain = Result.getValue(1);
  // ...
  return { Result, Chain };
// Heuristic for horizontal ops (excerpt): prefer them when the two inputs
// differ, when optimizing for size, or when the target runs them fast.
  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
  return !IsSingleSource || IsOptimizingSize || HasFastHOps;

// u64 -> f64 via SSE2 (excerpt).
  assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
  // ...
  // Exponent words of 2^52 and 2^84; see the sketch below.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  // ... (constant-pool f64 values:
  //      APInt(64, 0x4330000000000000ULL) == 2^52
  //      APInt(64, 0x4530000000000000ULL) == 2^84)
  // ... (getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, ...))
  // Sum the two corrected halves:
  Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
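// Editorial sketch of the constant trick above: splitting the u64 x into
// 32-bit halves, {lo, 0x43300000} reinterpreted as f64 equals 2^52 + lo and
// {hi, 0x45300000} equals 2^84 + hi * 2^32 (each half lands in the mantissa
// below those exponents). Subtracting the pool constants 2^52 and 2^84 and
// summing the two lanes (the FHADD) reconstructs
//   (2^52 + lo - 2^52) + (2^84 + hi * 2^32 - 2^84) == lo + hi * 2^32 == x,
// with the only rounding happening in the final addition.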
// u32 -> f64 (excerpt): OR the value into the mantissa of 2^52 and subtract
// 2^52.
  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
  // ...
  if (Op.getNode()->isStrictFPOpcode()) {
    // ... (STRICT_FSUB of {Chain, Or, Bias})
    if (Op.getValueType() == Sub.getValueType())
      return Sub;
    // ... (round to the narrower result type:
    //      (Sub, Sub.getValue(1), dl, Op.getSimpleValueType());)
    return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
  }
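// Editorial sketch: with Bias = 2^52 (bit pattern 0x4330000000000000), the OR
// places the 32-bit value u in the low mantissa bits, so Or == 2^52 + u as an
// f64 and Sub == Or - Bias == u exactly — no rounding occurs because u < 2^32
// fits in the 52-bit mantissa.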
// v2i32 -> v2f64 unsigned conversion (excerpt).
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  bool IsStrict = Op->isStrictFPOpcode();
  SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
  // ...
  if (!Subtarget.hasVLX()) {
    // Widen so the AVX512 form applies.
    // ...({Op.getOperand(0), N0});
  }
  // ...
  if (IsStrict)
    return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
                       {Op.getOperand(0), N0});
  return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
  // ...
  // Pre-AVX512 fallback: the same 2^52 bias trick as the scalar path,
  // vectorized:
  //   ...({Op.getOperand(0), Or, VBias});
// Unsigned vXi32 -> FP (excerpt).
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue V = Op->getOperand(IsStrict ? 1 : 0);
  MVT VecIntVT = V.getSimpleValueType();
  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  if (Subtarget.hasAVX512()) {
    // AVX512F without VLX: widen to the 512-bit VCVTUDQ2PS/PD form.
    assert(!Subtarget.hasVLX() && "Unexpected features");
    MVT VT = Op->getSimpleValueType(0);

    // v8i32 -> v8f64 is legal as-is.
    if (VT == MVT::v8f64)
      return SDValue();

    assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64 ||
            VT == MVT::v8f16) &&
           "Unexpected VT!");
    MVT WideVT = VT == MVT::v8f16 ? MVT::v16f16 : MVT::v16f32;
    MVT WideIntVT = MVT::v16i32;
    if (VT == MVT::v4f64) {
      WideVT = MVT::v8f64;
      WideIntVT = MVT::v8i32;
    }
    // ...({Op->getOperand(0), V});
  }

  // AVX only: v4i32 -> v4f64 via the 2^52 bias trick on widened i64 lanes.
  if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
      Op->getSimpleValueType(0) == MVT::v4f64) {
    // ...(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, ...)
    // ...({Op.getOperand(0), Or, VBias});
  }

  // SSE4.1 path: split each 32-bit lane into 16-bit halves, convert both
  // halves exactly, and recombine with a compensated fadd.
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();
  // ...
  MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
  Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                    VecCstLowBitcast,
                    DAG.getTargetConstant(0xaa, DL, MVT::i8));
  // ...
  High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                     VecCstHighBitcast,
                     DAG.getTargetConstant(0xaa, DL, MVT::i8));
  // ...({Op.getOperand(0), HighBitcast, VecCstFSub});
  // ...({FHigh.getValue(1), LowBitcast, FHigh});
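// Editorial sketch of the 16-bit split: for each u32 lane v, the low half is
// blended into the mantissa of 0x4B000000 (2^23) and the high half into
// 0x53000000 (2^39), so as f32s they read 2^23 + (v & 0xffff) and
// 2^39 + (v >> 16) * 2^16. Subtracting (2^39 + 2^23) from the high part and
// adding the low part yields exactly v with one correctly-rounded fadd —
// the classic SSE recipe
//   lo = blend(v, 0x4b000000, 0xaa); hi = blend(v >> 16, 0x53000000, 0xaa).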
// Vector UINT_TO_FP dispatch (excerpt).
  unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
  // ...

// Scalar LowerUINT_TO_FP (excerpt).
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  MVT SrcVT = Src.getSimpleValueType();
  MVT DstVT = Op->getSimpleValueType(0);
  // ...
  if (DstVT == MVT::f128)
    // ... (libcall)
  // ...
  if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
    return LowerWin64_INT128_TO_FP(Op, DAG);
  // ...
  if (/* AVX512 native forms */
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
    // ...
  }

  // A u32 is exactly representable as an i64, so zero-extend and reuse the
  // signed path.
  if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
    // ...
  }
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
      /* ... */ true)
    return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
  if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
      /* ... */ true)
    return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
      (DstVT == MVT::f32 || DstVT == MVT::f64))
    // ...

  // Zero-extend u32 through memory: store the value plus a zero word, then
  // FILD the resulting i64.
  Align SlotAlign(8);
  MachinePointerInfo MPI =
      // ... (fixed stack slot info)
  if (SrcVT == MVT::i32) {
    SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
    // ... (Store2 writes the zero upper half)
    std::pair<SDValue, SDValue> Tmp =
        BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
    // ...
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  // ...
  ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  // ...
  //   DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  // ...
  // Fudge constant: a pair of f32s {0.0f, 2^64}, indexed by the input's sign
  // bit and added to the FILD result to correct signed -> unsigned.
  APInt FF(64, 0x5F80000000000000ULL);
  // ...
  if (IsStrict) {
    unsigned Opc = ISD::STRICT_FADD;
    // Windows needs the precision control changed to 80 bits around the add.
    if (Subtarget.isOSWindows() && DstVT == MVT::f32)
      Opc = X86ISD::STRICT_FP80_ADD;
    SDValue Add =
        DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
    if (DstVT == MVT::f80)
      return Add;
    // ...
  }
  unsigned Opc = ISD::FADD;
  if (Subtarget.isOSWindows() && DstVT == MVT::f32)
    Opc = X86ISD::FP80_ADD;
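// Editorial note: FILD reads the stored 64 bits as *signed*, so an input with
// the top bit set comes back as x - 2^64. The constant-pool pair selected by
// the sign bit supplies the correction — 0.0f for non-negative inputs and
// 0x5F800000 (2^64 as an f32) otherwise — and the f80-precision add restores
// the exact unsigned value before the final rounding to f32/f64.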
// FP_TO_INT helper (excerpt): lower FP -> int through the x87/SSE truncating
// path when no direct instruction exists.
  bool IsStrict = Op->isStrictFPOpcode();
  // ...
  EVT DstTy = Op.getValueType();
  // ...
  EVT TheVT = Value.getValueType();
  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // ... (promote other FP types first)
  }

  // FP_TO_UINT i64 has no native form; it is emulated by biasing below.
  bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
  if (!IsSigned && DstTy != MVT::i64) {
    // Promote FP_TO_UINT i32 to FP_TO_SINT i64, then truncate.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    // ...
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");
  // ...
  unsigned MemSize = DstTy.getStoreSize();
  // ...
  if (UnsignedFixup) {
    // Subtract 2^63 when the input exceeds the signed range, convert signed,
    // then XOR the top bit of the integer result back in.
    // ...
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      Status = Thresh.convert(APFloat::IEEEdouble(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");
    // ...
    Chain = Cmp.getValue(1);
    // ... (FSUB of {Chain, Value, FltOfs})
    Chain = Value.getValue(1);
  }
  // ...
  assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  // ...
  assert(FLDSize <= MemSize && "Stack slot not big enough");
  // ...
  Chain = Value.getValue(1);
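// Editorial sketch of the unsigned fixup: for x >= 2^63 the code computes
// (i64)(x - 2^63) ^ 0x8000000000000000, while x < 2^63 converts directly (the
// threshold compare selects FltOfs = 0 or 2^63). XORing the sign bit re-adds
// the subtracted 2^63 in integer space, covering the full u64 range.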
// Zero/any-extend vector lowering (excerpt).
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  unsigned Opc = Op.getOpcode();

  assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
         "Unexpected extension opcode");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  // Without BWI, v32i16 must be produced as two 256-bit halves.
  if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
    assert(InVT == MVT::v32i8 && "Unexpected VT!");
    // ... (split into two halves and extend each)
  }