X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
24#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/CallingConv.h"
42#include "llvm/IR/Constants.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalAlias.h"
48#include "llvm/IR/IRBuilder.h"
50#include "llvm/IR/Intrinsics.h"
52#include "llvm/MC/MCAsmInfo.h"
53#include "llvm/MC/MCContext.h"
54#include "llvm/MC/MCExpr.h"
55#include "llvm/MC/MCSymbol.h"
57#include "llvm/Support/Debug.h"
62#include <algorithm>
63#include <bitset>
64#include <cctype>
65#include <numeric>
66using namespace llvm;
67
68#define DEBUG_TYPE "x86-isel"
69
71 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
73 "Sets the preferable loop alignment for experiments (as log2 bytes) "
74 "for innermost loops only. If specified, this option overrides "
75 "alignment set by x86-experimental-pref-loop-alignment."),
77
79 "x86-br-merging-base-cost", cl::init(2),
81 "Sets the cost threshold for when multiple conditionals will be merged "
82 "into one branch versus be split in multiple branches. Merging "
83 "conditionals saves branches at the cost of additional instructions. "
84 "This value sets the instruction cost limit, below which conditionals "
85 "will be merged, and above which conditionals will be split. Set to -1 "
86 "to never merge branches."),
88
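// Illustrative sketch of the trade-off this option controls, for a source
// condition like (a == b && c > d) (instruction spellings are approximate):
//
//   split:   cmp a, b; jne skip;  cmp c, d; jng skip    -> two branches
//   merged:  cmp a, b; sete t1;   cmp c, d; setg t2;
//            test t1, t2; jz skip                       -> one branch, plus
//                                                          extra setcc/test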
90 "x86-br-merging-ccmp-bias", cl::init(6),
91 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
92 "supports conditional compare instructions."),
94
95static cl::opt<bool>
96 WidenShift("x86-widen-shift", cl::init(true),
97 cl::desc("Replace narrow shifts with wider shifts."),
99
101 "x86-br-merging-likely-bias", cl::init(0),
102 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
103 "that all conditionals will be executed. For example for merging "
104 "the conditionals (a == b && c > d), if its known that a == b is "
105 "likely, then it is likely that if the conditionals are split "
106 "both sides will be executed, so it may be desirable to increase "
107 "the instruction cost threshold. Set to -1 to never merge likely "
108 "branches."),
109 cl::Hidden);
110
112 "x86-br-merging-unlikely-bias", cl::init(-1),
113 cl::desc(
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
115 "that all conditionals will be executed. For example for merging "
116 "the conditionals (a == b && c > d), if its known that a == b is "
117 "unlikely, then it is unlikely that if the conditionals are split "
118 "both sides will be executed, so it may be desirable to decrease "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
120 "branches."),
121 cl::Hidden);
122
124 "mul-constant-optimization", cl::init(true),
125 cl::desc("Replace 'mul x, Const' with more effective instructions like "
126 "SHIFT, LEA, etc."),
127 cl::Hidden);
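// Illustrative examples of the rewrites this enables (standard x86
// shift/LEA tricks; register choices are arbitrary):
//
//   x * 4  ->  shl eax, 2
//   x * 5  ->  lea eax, [rdi + 4*rdi]
//   x * 9  ->  lea eax, [rdi + 8*rdi]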
128
130 const X86Subtarget &STI)
131 : TargetLowering(TM), Subtarget(STI) {
132 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
133 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
134
135 // Set up the TargetLowering object.
136
137 // X86 is weird. It always uses i8 for shift amounts and setcc results.
139 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
141
142 // X86 instruction cache is coherent with its data cache so we can use the
143 // default expansion to a no-op.
145
146 // For 64-bit, since we have so many registers, use the ILP scheduler.
147 // For 32-bit, use the register pressure specific scheduling.
148 // For Atom, always use ILP scheduling.
149 if (Subtarget.isAtom())
151 else if (Subtarget.is64Bit())
153 else
155 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
157
158 // Bypass expensive divides and use cheaper ones.
159 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
160 if (Subtarget.hasSlowDivide32())
161 addBypassSlowDiv(32, 8);
162 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
163 addBypassSlowDiv(64, 32);
164 }
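// Hedged sketch of what the bypass registered above expands to later (the
// actual rewrite happens in CodeGenPrepare), for addBypassSlowDiv(32, 8):
//
//   if (((a | b) & ~0xFFu) == 0)       // both operands fit in 8 bits
//     q = (uint8_t)a / (uint8_t)b;     //   -> cheap 8-bit DIV
//   else
//     q = a / b;                       //   -> full-width (slow) DIV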
165
166 // Setup Windows compiler runtime calls.
167 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
168 static const struct {
169 const RTLIB::Libcall Op;
170 const char * const Name;
171 const CallingConv::ID CC;
172 } LibraryCalls[] = {
173 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
174 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
175 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
176 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
177 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
178 };
179
180 for (const auto &LC : LibraryCalls) {
181 setLibcallName(LC.Op, LC.Name);
182 setLibcallCallingConv(LC.Op, LC.CC);
183 }
184 }
185
186 if (Subtarget.canUseCMPXCHG16B())
188 else if (Subtarget.canUseCMPXCHG8B())
190 else
192
193 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
194
196
197 // Set up the register classes.
198 addRegisterClass(MVT::i8, &X86::GR8RegClass);
199 addRegisterClass(MVT::i16, &X86::GR16RegClass);
200 addRegisterClass(MVT::i32, &X86::GR32RegClass);
201 if (Subtarget.is64Bit())
202 addRegisterClass(MVT::i64, &X86::GR64RegClass);
203
204 for (MVT VT : MVT::integer_valuetypes())
206
207 // We don't accept any truncstore of integer registers.
208 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
209 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
211 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
212 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
213 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
214
215 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
216
217 // SETOEQ and SETUNE require checking two conditions.
218 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
221 }
222
223 // Integer absolute.
224 if (Subtarget.canUseCMOV()) {
225 setOperationAction(ISD::ABS , MVT::i16 , Custom);
226 setOperationAction(ISD::ABS , MVT::i32 , Custom);
227 if (Subtarget.is64Bit())
228 setOperationAction(ISD::ABS , MVT::i64 , Custom);
229 }
230
231 // Absolute difference.
232 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
233 setOperationAction(Op , MVT::i8 , Custom);
234 setOperationAction(Op , MVT::i16 , Custom);
235 setOperationAction(Op , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(Op , MVT::i64 , Custom);
238 }
239
240 // Signed saturation subtraction.
244 if (Subtarget.is64Bit())
246
247 // Funnel shifts.
248 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
249 // For slow shld targets we only lower for code size.
250 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
251
252 setOperationAction(ShiftOp , MVT::i8 , Custom);
253 setOperationAction(ShiftOp , MVT::i16 , Custom);
254 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
255 if (Subtarget.is64Bit())
256 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
257 }
258
259 if (!Subtarget.useSoftFloat()) {
260 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
261 // operation.
266 // We have an algorithm for SSE2, and we turn this into a 64-bit
267 // FILD or VCVTUSI2SS/SD for other targets.
270 // We have an algorithm for SSE2->double, and we turn this into a
271 // 64-bit FILD followed by conditional FADD for other targets.
274
275 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
276 // this operation.
279 // SSE has no i16 to fp conversion, only i32. We promote in the handler
280 // to allow f80 to use i16 and f64 to use i16 with sse1 only
283 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
287 // are Legal, f80 is custom lowered.
290
291 // Promote i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
292 // this operation.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
301 // are Legal, f80 is custom lowered.
304
305 // Handle FP_TO_UINT by promoting the destination to a larger signed
306 // conversion.
308 // FIXME: This doesn't generate invalid exception when it should. PR44019.
311 // FIXME: This doesn't generate invalid exception when it should. PR44019.
317
322
323 if (!Subtarget.is64Bit()) {
326 }
327 }
328
329 if (Subtarget.hasSSE2()) {
330 // Custom lowering for saturating float to int conversions.
331 // We handle promotion to larger result types manually.
332 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
335 }
337 if (Subtarget.is64Bit()) {
341 }
342 }
343 if (Subtarget.hasAVX10_2()) {
346 if (Subtarget.is64Bit()) {
349 }
350 }
351
352 // Handle address space casts between mixed sized pointers.
355
356 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
357 if (!Subtarget.hasSSE2()) {
363 if (Subtarget.is64Bit()) {
365 // Without SSE, i64->f64 goes through memory.
367 }
368 } else if (!Subtarget.is64Bit())
370
371 // Scalar integer divide and remainder are lowered to use operations that
372 // produce two results, to match the available instructions. This exposes
373 // the two-result form to trivial CSE, which is able to combine x/y and x%y
374 // into a single instruction.
375 //
376 // Scalar integer multiply-high is also lowered to use two-result
377 // operations, to match the available instructions. However, plain multiply
378 // (low) operations are left as Legal, as there are single-result
379 // instructions for this in x86. Using the two-result multiply instructions
380 // when both high and low results are needed must be arranged by dagcombine.
381 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
388 }
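// Hedged example of the CSE opportunity described above: once x/y and x%y
// use the same two-result node, a single hardware divide serves both, e.g.
// for an unsigned 32-bit pair, roughly:
//
//   mov eax, edi
//   xor edx, edx
//   div esi            ; EAX = x / y, EDX = x % y -- one DIV, two results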
389
390 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
392 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
393 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
396 }
397 if (Subtarget.is64Bit())
402
403 setOperationAction(ISD::FREM , MVT::f32 , Expand);
404 setOperationAction(ISD::FREM , MVT::f64 , Expand);
405 setOperationAction(ISD::FREM , MVT::f80 , Expand);
406 setOperationAction(ISD::FREM , MVT::f128 , Expand);
407
408 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
414 }
415
416 // Promote the i8 variants and force them up to i32, which has a shorter
417 // encoding.
418 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
420 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
421 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
422 // promote that too.
423 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
425
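// Hedged illustration of the REP-prefix trick mentioned above: TZCNT is
// encoded as an F3 (REP) prefix on BSF, so the same bytes execute as BSF on
// older CPUs and as TZCNT on newer ones, e.g.
//
//   f3 0f bc c7        rep bsf eax, edi   ; decodes as "tzcnt eax, edi"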
426 if (!Subtarget.hasBMI()) {
427 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
429 if (Subtarget.is64Bit()) {
430 setOperationPromotedToType(ISD::CTTZ , MVT::i32, MVT::i64);
431 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
433 }
434 }
435
436 if (Subtarget.hasLZCNT()) {
437 // When promoting the i8 variants, force them to i32 for a shorter
438 // encoding.
439 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
441 } else {
442 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
443 if (VT == MVT::i64 && !Subtarget.is64Bit())
444 continue;
447 }
448 }
449
452 // Special handling for half-precision floating point conversions.
453 // If we don't have F16C support, then lower half float conversions
454 // into library calls.
456 Op, MVT::f32,
457 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
458 // There's never any support for operations beyond MVT::f32.
459 setOperationAction(Op, MVT::f64, Expand);
460 setOperationAction(Op, MVT::f80, Expand);
461 setOperationAction(Op, MVT::f128, Expand);
462 }
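// Hedged illustration of the Expand path above: without F16C an f16
// conversion becomes a libcall (the names are registered further down in
// this constructor), conceptually:
//
//   f16 -> f32   lowers to a call to __extendhfsf2
//   f32 -> f16   lowers to a call to __truncsfhf2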
463
464 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
467 }
468
469 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
470 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
471 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
472 setTruncStoreAction(VT, MVT::f16, Expand);
473 setTruncStoreAction(VT, MVT::bf16, Expand);
474
477 }
478
482 if (Subtarget.is64Bit())
484 if (Subtarget.hasPOPCNT()) {
485 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
486 // popcntw is longer to encode than popcntl and also has a false dependency
487 // on the dest that popcntl hasn't had since Cannon Lake.
488 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
489 } else {
494 }
495
497
498 if (!Subtarget.hasMOVBE())
500
501 // X86 wants to expand cmov itself.
502 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
507 }
508 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
509 if (VT == MVT::i64 && !Subtarget.is64Bit())
510 continue;
513 }
514
515 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
518
520 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
521 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
525 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
526 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
527
528 // Darwin ABI issue.
529 for (auto VT : { MVT::i32, MVT::i64 }) {
530 if (VT == MVT::i64 && !Subtarget.is64Bit())
531 continue;
538 }
539
540 // 64-bit shl, sra, srl (iff 32-bit x86)
541 for (auto VT : { MVT::i32, MVT::i64 }) {
542 if (VT == MVT::i64 && !Subtarget.is64Bit())
543 continue;
547 }
548
549 if (Subtarget.hasSSEPrefetch())
551
553
554 // Expand certain atomics
555 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
563 }
564
565 if (!Subtarget.is64Bit())
567
568 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
569 // All CPUs supporting AVX will atomically load/store aligned 128-bit
570 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
573 }
574
575 if (Subtarget.canUseCMPXCHG16B())
577
578 // FIXME - use subtarget debug flags
579 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
580 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
581 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
583 }
584
587
590
591 setOperationAction(ISD::TRAP, MVT::Other, Legal);
593 if (Subtarget.isTargetPS())
595 else
597
598 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
600 setOperationAction(ISD::VAEND , MVT::Other, Expand);
601 bool Is64Bit = Subtarget.is64Bit();
602 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
603 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
604
607
609
610 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
613
615
616 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
617 setOperationAction(ISD::FABS, VT, Action);
618 setOperationAction(ISD::FNEG, VT, Action);
620 setOperationAction(ISD::FREM, VT, Action);
621 setOperationAction(ISD::FMA, VT, Action);
622 setOperationAction(ISD::FMINNUM, VT, Action);
623 setOperationAction(ISD::FMAXNUM, VT, Action);
626 setOperationAction(ISD::FSIN, VT, Action);
627 setOperationAction(ISD::FCOS, VT, Action);
628 setOperationAction(ISD::FSINCOS, VT, Action);
629 setOperationAction(ISD::FTAN, VT, Action);
630 setOperationAction(ISD::FSQRT, VT, Action);
631 setOperationAction(ISD::FPOW, VT, Action);
632 setOperationAction(ISD::FPOWI, VT, Action);
633 setOperationAction(ISD::FLOG, VT, Action);
634 setOperationAction(ISD::FLOG2, VT, Action);
635 setOperationAction(ISD::FLOG10, VT, Action);
636 setOperationAction(ISD::FEXP, VT, Action);
637 setOperationAction(ISD::FEXP2, VT, Action);
638 setOperationAction(ISD::FEXP10, VT, Action);
639 setOperationAction(ISD::FCEIL, VT, Action);
640 setOperationAction(ISD::FFLOOR, VT, Action);
642 setOperationAction(ISD::FRINT, VT, Action);
643 setOperationAction(ISD::BR_CC, VT, Action);
644 setOperationAction(ISD::SETCC, VT, Action);
647 setOperationAction(ISD::FROUND, VT, Action);
649 setOperationAction(ISD::FTRUNC, VT, Action);
650 setOperationAction(ISD::FLDEXP, VT, Action);
651 };
652
653 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
654 // f16, f32 and f64 use SSE.
655 // Set up the FP register classes.
656 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
657 : &X86::FR16RegClass);
658 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
659 : &X86::FR32RegClass);
660 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
661 : &X86::FR64RegClass);
662
663 // Disable f32->f64 extload as we can only generate this in one instruction
664 // under optsize. So it's easier to pattern match (fpext (load)) for that
665 // case instead of needing to emit 2 instructions for extload in the
666 // non-optsize case.
667 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
668
669 for (auto VT : { MVT::f32, MVT::f64 }) {
670 // Use ANDPD to simulate FABS.
672
673 // Use XORP to simulate FNEG.
675
676 // Use ANDPD and ORPD to simulate FCOPYSIGN.
678
679 // These might be better off as horizontal vector ops.
682
683 // We don't support sin/cos/fmod
687 }
688
689 // Half type will be promoted by default.
690 setF16Action(MVT::f16, Promote);
698
729
730 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
731 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
732
733 // Lower this to MOVMSK plus an AND.
736
737 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
738 (UseX87 || Is64Bit)) {
739 // Use SSE for f32, x87 for f64.
740 // Set up the FP register classes.
741 addRegisterClass(MVT::f32, &X86::FR32RegClass);
742 if (UseX87)
743 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
744
745 // Use ANDPS to simulate FABS.
747
748 // Use XORP to simulate FNEG.
750
751 if (UseX87)
753
754 // Use ANDPS and ORPS to simulate FCOPYSIGN.
755 if (UseX87)
758
759 // We don't support sin/cos/fmod
763
764 if (UseX87) {
765 // Always expand sin/cos functions even though x87 has an instruction.
769 }
770 } else if (UseX87) {
771 // f32 and f64 in x87.
772 // Set up the FP register classes.
773 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
774 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
775
776 for (auto VT : { MVT::f32, MVT::f64 }) {
779
780 // Always expand sin/cos functions even though x87 has an instruction.
784 }
785 }
786
787 // Expand FP32 immediates into loads from the stack, save special cases.
788 if (isTypeLegal(MVT::f32)) {
789 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
790 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
791 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
792 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
793 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
794 } else // SSE immediates.
795 addLegalFPImmediate(APFloat(+0.0f)); // xorps
796 }
797 // Expand FP64 immediates into loads from the stack, save special cases.
798 if (isTypeLegal(MVT::f64)) {
799 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
800 addLegalFPImmediate(APFloat(+0.0)); // FLD0
801 addLegalFPImmediate(APFloat(+1.0)); // FLD1
802 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
803 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
804 } else // SSE immediates.
805 addLegalFPImmediate(APFloat(+0.0)); // xorpd
806 }
807 // Support fp16 0 immediate.
808 if (isTypeLegal(MVT::f16))
809 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
810
811 // Handle constrained floating-point operations of scalar.
824
825 // We don't support FMA.
828
829 // f80 always uses X87.
830 if (UseX87) {
831 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
834 {
836 addLegalFPImmediate(TmpFlt); // FLD0
837 TmpFlt.changeSign();
838 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
839
840 bool ignored;
841 APFloat TmpFlt2(+1.0);
843 &ignored);
844 addLegalFPImmediate(TmpFlt2); // FLD1
845 TmpFlt2.changeSign();
846 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
847 }
848
849 // Always expand sin/cos functions even though x87 has an instruction.
850 // clang-format off
862 // clang-format on
863
875
876 // Handle constrained floating-point operations of scalar.
882 if (isTypeLegal(MVT::f16)) {
885 } else {
887 }
888 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
889 // as Custom.
891 }
892
893 // f128 uses xmm registers, but most operations require libcalls.
894 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
895 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
896 : &X86::VR128RegClass);
897
898 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
899
910
914
915 // clang-format off
923 // clang-format on
924 // No STRICT_FSINCOS
927
930 // We need to custom handle any FP_ROUND with an f128 input, but
931 // LegalizeDAG uses the result type to know when to run a custom handler.
932 // So we have to list all legal floating point result types here.
933 if (isTypeLegal(MVT::f32)) {
936 }
937 if (isTypeLegal(MVT::f64)) {
940 }
941 if (isTypeLegal(MVT::f80)) {
945 }
946
948
949 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
950 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
951 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
952 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
953 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
954 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
955 }
956
957 // Always use a library call for pow.
958 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
959 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
960 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
961 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
962
971
972 // Some FP actions are always expanded for vector types.
973 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
974 MVT::v4f32, MVT::v8f32, MVT::v16f32,
975 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
976 // clang-format off
990 // clang-format on
991 }
992
993 // First set operation action for all vector types to either promote
994 // (for widening) or expand (for scalarization). Then we will selectively
995 // turn on ones that can be effectively codegen'd.
1035 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1036 setTruncStoreAction(InnerVT, VT, Expand);
1037
1038 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1039 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1040
1041 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1042 // types; we have to deal with them whether we ask for Expansion or not.
1043 // Setting Expand causes its own optimisation problems though, so leave
1044 // them legal.
1045 if (VT.getVectorElementType() == MVT::i1)
1046 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1047
1048 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1049 // split/scalarized right now.
1050 if (VT.getVectorElementType() == MVT::f16 ||
1051 VT.getVectorElementType() == MVT::bf16)
1052 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1053 }
1054 }
1055
1056 // FIXME: In order to prevent SSE instructions from being expanded to MMX ones
1057 // with -msoft-float, disable use of MMX as well.
1058 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1059 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1060 // No operations on x86mmx supported, everything uses intrinsics.
1061 }
1062
1063 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1064 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1065 : &X86::VR128RegClass);
1066
1069
1070 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1071 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1079
1080 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1081 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1083
1089 }
1090
1091 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1092 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1093 : &X86::VR128RegClass);
1094
1095 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1096 // registers cannot be used even for integer operations.
1097 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1098 : &X86::VR128RegClass);
1099 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1100 : &X86::VR128RegClass);
1101 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1102 : &X86::VR128RegClass);
1103 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1104 : &X86::VR128RegClass);
1105 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1106 : &X86::VR128RegClass);
1107
1108 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1111 }
1112
1113 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1114 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1119 }
1120
1121 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1122 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1123 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1124
1125 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1126 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1127 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1128 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1129 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1130 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1131 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1132 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1133 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1134 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1137
1138 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1139 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1140 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1141
1142 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1144 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1146
1147 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1148
1149 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1150 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1151 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1152 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1153 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1154 }
1155
1166
1171
1172 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1178
1179 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1180 // setcc all the way to isel and prefer SETGT in some isel patterns.
1183 }
1184
1185 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1186 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1191
1192 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1198 }
1199
1200 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1204
1205 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1206 continue;
1207
1210 }
1211 setF16Action(MVT::v8f16, Expand);
1212 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1213 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1214 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1215 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1216 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1217 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1219
1220 // Custom lower v2i64 and v2f64 selects.
1227
1234
1235 // Custom legalize these to avoid over promotion or custom promotion.
1236 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1241 }
1242
1247
1250
1253
1254 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1259
1264
1265 // We want to legalize this to an f64 load rather than an i64 load on
1266 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1267 // store.
1268 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1269 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1270 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1271 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1272 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1274
1275 // Add 32-bit vector stores to help vectorization opportunities.
1276 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1278
1282 if (!Subtarget.hasAVX512())
1284
1288
1290
1307
1308 // In the customized shift lowering, the legal v4i32/v2i64 cases
1309 // in AVX2 will be recognized.
1310 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1314 if (VT == MVT::v2i64) continue;
1319 }
1320
1326 }
1327
1328 if (Subtarget.hasGFNI()) {
1333 }
1334
1335 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1336 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1337 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1338 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1339
1340 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1343 }
1344
1345 // These might be better off as horizontal vector ops.
1350 }
1351
1352 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1353 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1356 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1360 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1366
1368 }
1369
1370 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1371 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1372 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1373 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1374 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1375 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1376 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1377 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1378
1382
1383 // FIXME: Do we need to handle scalar-to-vector here?
1384 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1385 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1386
1387 // We directly match byte blends in the backend as they match the VSELECT
1388 // condition form.
1390
1391 // SSE41 brings specific instructions for doing vector sign extend even in
1392 // cases where we don't have SRA.
1393 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1396 }
1397
1398 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1399 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1400 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1401 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1402 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1403 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1404 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1405 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1406 }
1407
1408 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1409 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1410 // do the pre and post work in the vector domain.
1413 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1414 // so that DAG combine doesn't try to turn it into uint_to_fp.
1417 }
1418 }
1419
1420 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1422 }
1423
1424 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1425 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1426 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1429 }
1430
1431 // XOP can efficiently perform BITREVERSE with VPPERM.
1432 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1434 }
1435
1436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1437 bool HasInt256 = Subtarget.hasInt256();
1438
1439 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1440 : &X86::VR256RegClass);
1441 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1442 : &X86::VR256RegClass);
1443 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1444 : &X86::VR256RegClass);
1445 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1446 : &X86::VR256RegClass);
1447 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1448 : &X86::VR256RegClass);
1449 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1450 : &X86::VR256RegClass);
1451 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1452 : &X86::VR256RegClass);
1453
1454 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1467
1469
1473
1477 }
1478
1479 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1480 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1481
1482 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1483 // even though v8i16 is a legal type.
1484 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1485 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1486 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1487 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1491
1498
1510
1511 if (!Subtarget.hasAVX512())
1513
1514 // In the customized shift lowering, the legal v8i32/v4i64 cases
1515 // in AVX2 will be recognized.
1516 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1522 if (VT == MVT::v4i64) continue;
1527 }
1528
1529 // These types need custom splitting if their input is a 128-bit vector.
1534
1538 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1539 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1542
1543 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1547 }
1548
1553
1554 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1559
1560 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1561 // setcc all the way to isel and prefer SETGT in some isel patterns.
1564 }
1565
1566 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1567 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1572
1573 if (Subtarget.hasAnyFMA()) {
1574 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1575 MVT::v2f64, MVT::v4f64 }) {
1578 }
1579 }
1580
1581 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1582 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1583 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1584 }
1585
1586 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1587 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1588 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1589 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1590
1591 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1592 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1593 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1594 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1595 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1596 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1597 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1598 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1599
1600 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1601 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1602
1603 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1604 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1605 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1606 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1607 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1608
1609 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1610 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1611 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1613 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1616 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1621
1622 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1623 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1624 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1625 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1628 }
1629
1630 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1633 }
1634
1635 if (HasInt256) {
1636 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1637 // when we have a 256-bit-wide blend with immediate.
1640
1641 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1642 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1643 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1644 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1645 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1646 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1647 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1648 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1649 }
1650 }
1651
1652 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1653 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1654 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1656 }
1657
1658 // Extract subvector is special because the value type
1659 // (result) is 128-bit but the source is 256-bit wide.
1660 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1661 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1663 }
1664
1665 // Custom lower several nodes for 256-bit types.
1666 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1667 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1677 }
1678 setF16Action(MVT::v16f16, Expand);
1679 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1680 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1682 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1683 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1684 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1685 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1686
1687 if (HasInt256) {
1689
1690 // Custom legalize 2x32 to get a little better code.
1693
1694 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1695 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1697 }
1698 }
1699
1700 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1701 Subtarget.hasF16C()) {
1702 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1705 }
1706 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1709 }
1710 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1711 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1712 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1713 }
1714 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1715 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1716 }
1717
1718 // This block controls legalization of the mask vector sizes that are
1719 // available with AVX512. 512-bit vectors are in a separate block controlled
1720 // by useAVX512Regs.
1721 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1722 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1723 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1724 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1725 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1726 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1727
1731
1732 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1733 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1734 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1735 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1736 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1737 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1738 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1739 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1747
1748 // There is no byte sized k-register load or store without AVX512DQ.
1749 if (!Subtarget.hasDQI()) {
1750 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1751 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1752 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1753 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1754
1759 }
1760
1761 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1762 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1766 }
1767
1768 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1770
1771 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1775
1782 }
1783
1784 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1786 }
1787 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1788 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1791 }
1792 }
1793
1794 // This block controls legalization for 512-bit operations with 8/16/32/64-bit
1795 // elements. 512-bit operations can be disabled based on prefer-vector-width and
1796 // required-vector-width function attributes.
1797 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1798 bool HasBWI = Subtarget.hasBWI();
1799
1800 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1801 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1802 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1803 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1804 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1805 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1806 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1807
1808 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1809 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1810 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1811 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1812 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1813 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1814 if (HasBWI)
1815 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1816 }
1817
1818 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1827 }
1828 setOperationAction(ISD::LRINT, MVT::v16f32,
1829 Subtarget.hasDQI() ? Legal : Custom);
1830 setOperationAction(ISD::LRINT, MVT::v8f64,
1831 Subtarget.hasDQI() ? Legal : Custom);
1832 if (Subtarget.hasDQI())
1833 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1834
1835 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1840 }
1841
1842 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1847 }
1848
1855
1867
1868 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1869 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1870 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1871 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1872 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1873 if (HasBWI)
1874 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1875
1876 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1877 // to 512-bit rather than use the AVX2 instructions so that we can use
1878 // k-masks.
1879 if (!Subtarget.hasVLX()) {
1880 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1881 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1884 }
1885 }
1886
1888 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1889 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1899
1900 if (HasBWI) {
1901 // Extends from v64i1 masks to 512-bit vectors.
1905 }
1906
1907 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1920
1922 }
1923
1924 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1927 }
1928
1929 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1930 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1931 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1932 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1933
1934 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1935 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1936 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1937 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1938
1939 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1940 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1941 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1942 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1943 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1944 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1945 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1946 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1947
1948 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1949 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1950
1951 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1961
1962 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1963 // setcc all the way to isel and prefer SETGT in some isel patterns.
1966 }
1967
1968 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1969 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1974
1975 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1982 }
1983
1984 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1985 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1986 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1988 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1989 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1990 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1991 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1996 }
1997
1998 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1999 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2000 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2001 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2002 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2003 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2004
2005 if (Subtarget.hasDQI()) {
2009 setOperationAction(Opc, MVT::v8i64, Custom);
2010 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2011 }
2012
2013 if (Subtarget.hasCDI()) {
2014 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2015 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2017 }
2018 } // Subtarget.hasCDI()
2019
2020 if (Subtarget.hasVPOPCNTDQ()) {
2021 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2023 }
2024
2025 // Extract subvector is special because the value type
2026 // (result) is 256-bit but the source is 512-bit wide.
2027 // 128-bit was made Legal under AVX1.
2028 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2029 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2031
2032 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2033 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2043 }
2044 setF16Action(MVT::v32f16, Expand);
2049 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2050 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2051 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2052
2053 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2058 }
2059 if (HasBWI) {
2060 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2063 }
2064 } else {
2065 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2066 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2067 }
2068
2069 if (Subtarget.hasVBMI2()) {
2070 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2073 }
2074
2075 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2076 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2077 }
2078
2079 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2080 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2082 }// useAVX512Regs
2083
2084 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2085 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2086 MVT::v4i64}) {
2089 }
2090 }
2091
2092 // This block controls legalization for operations that don't have
2093 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2094 // narrower widths.
2095 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2096 // These operations are handled on non-VLX by artificially widening in
2097 // isel patterns.
2098
2102
2103 if (Subtarget.hasDQI()) {
2104 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2105 // v2f32 UINT_TO_FP is already custom under SSE2.
2108 "Unexpected operation action!");
2109 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2114 }
2115
2116 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2122 }
2123
2124 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2127 }
2128
2129 // Custom legalize 2x32 to get a little better code.
2132
2133 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2134 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2136
2137 if (Subtarget.hasDQI()) {
2141 setOperationAction(Opc, MVT::v2i64, Custom);
2142 setOperationAction(Opc, MVT::v4i64, Custom);
2143 }
2144 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2145 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2146 }
2147
2148 if (Subtarget.hasCDI()) {
2149 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2151 }
2152 } // Subtarget.hasCDI()
2153
2154 if (Subtarget.hasVPOPCNTDQ()) {
2155 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2157 }
2158
2159 // We can try to convert vectors to different sizes to leverage legal
2160 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2161 // then specialize to Legal below.
2162 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2163 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2164 MVT::v16i16, MVT::v8i8})
2166
2167 // Legal vpcompress depends on various AVX512 extensions.
2168 // Legal in AVX512F
2169 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2171
2172 // Legal in AVX512F + AVX512VL
2173 if (Subtarget.hasVLX())
2174 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2175 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2177
2178 // Legal in AVX512F + AVX512VBMI2
2179 if (Subtarget.hasVBMI2())
2180 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2182
2183 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2184 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2185 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2187 }
2188
2189 // This block controls legalization of v32i1/v64i1, which are available with
2190 // AVX512BW.
2191 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2192 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2193 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2194
2195 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2206 }
2207
2208 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2210
2211 // Extends from v32i1 masks to 256-bit vectors.
2215
2216 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2217 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2218 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2219 }
2220
2221 // These operations are handled on non-VLX by artificially widening in
2222 // isel patterns.
2223 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2224
2225 if (Subtarget.hasBITALG()) {
2226 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2228 }
2229 }
2230
2231 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2232 auto setGroup = [&] (MVT VT) {
2243
2256
2258
2261
2267
2273
2277 };
2278
2279 // AVX512_FP16 scalar operations
2280 setGroup(MVT::f16);
2294
2297
2298 if (Subtarget.useAVX512Regs()) {
2299 setGroup(MVT::v32f16);
2305 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2312
2317 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2319 MVT::v32i16);
2320 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2322 MVT::v32i16);
2323 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2325 MVT::v32i16);
2326 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2328 MVT::v32i16);
2329
2333
2334 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2335 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2336
2339 }
2340
2341 if (Subtarget.hasVLX()) {
2342 setGroup(MVT::v8f16);
2343 setGroup(MVT::v16f16);
2344
2355
2366
2367 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2370
2374
2375 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2376 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2377 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2378 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2379
2380 // Need to custom widen these to prevent scalarization.
2381 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2382 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2383
2386
2389 }
2390 }
2391
2392 if (!Subtarget.useSoftFloat() &&
2393 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2394 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2395 : &X86::VR128RegClass);
2396 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2397 : &X86::VR256RegClass);
2398 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2399 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2400 // Set the operation action Custom to do the customization later.
2403 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2404 setF16Action(VT, Expand);
2405 if (!Subtarget.hasBF16())
2411 }
2412 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2413 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2414 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2415 }
2416 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2417 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2419 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2420 }
2421
2422 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2423 Subtarget.useAVX512Regs()) {
2424 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2425 setF16Action(MVT::v32bf16, Expand);
2426 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2427 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2428 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2430 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2434 }
2435
2436 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2437 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2445 }
2446 if (Subtarget.hasAVX10_2_512()) {
2447 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2448 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2449 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2450 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2451 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2452 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2453 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2454 }
2455 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2458 }
2459 }
2460
2461 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2462 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2463 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2464 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2465 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2466 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2467
2468 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2469 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2470 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2471 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2472 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2473
2474 if (Subtarget.hasBWI()) {
2475 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2476 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2477 }
2478
2479 if (Subtarget.hasFP16()) {
2480 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2489 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2498 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2503 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2508 }
2509 }
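  // Illustrative note (not from the original source): with VLX, a truncating
  // store such as v8i32 -> v8i8 can typically be selected to a single
  // vpmovdb with a memory destination rather than a shuffle plus narrow store.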
2510
2511 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2512 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2513 }
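  // Illustrative note (not from the original source): x86amx values are carried
  // in the AMX tile registers (tmm0-tmm7) and are only produced and consumed by
  // the AMX tile intrinsics.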
2514
2515 // We want to custom lower some of our intrinsics.
2519 if (!Subtarget.is64Bit()) {
2521 }
2522
2523 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2524 // handle type legalization for these operations here.
2525 //
2526 // FIXME: We really should do custom legalization for addition and
2527 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2528 // than generic legalization for 64-bit multiplication-with-overflow, though.
2529 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2530 if (VT == MVT::i64 && !Subtarget.is64Bit())
2531 continue;
2532 // Add/Sub/Mul with overflow operations are custom lowered.
2539
2540 // Support carry in as value rather than glue.
2546 }
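  // Illustrative note (not from the original source): for example,
  //   %r = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  // is custom lowered to an X86ISD::ADD whose second result is EFLAGS, and the
  // overflow bit is then read back with a SETCC on the 'O' condition.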
2547
2548 // Combine sin / cos into _sincos_stret if it is available.
2549 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2550 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2553 }
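  // Illustrative note (not from the original source): with this combine,
  //   %s = call float @sinf(float %x)
  //   %c = call float @cosf(float %x)
  // can become a single sincos-style libcall (e.g. __sincosf_stret on Darwin)
  // that returns both results at once.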
2554
2555 if (Subtarget.isTargetWin64()) {
2556 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2557 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2558 setOperationAction(ISD::SREM, MVT::i128, Custom);
2559 setOperationAction(ISD::UREM, MVT::i128, Custom);
2568 }
2569
2570 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2571 // is. We should promote the value to 64 bits to solve this.
2572 // This is what the CRT headers do - `fmodf` is an inline header
2573 // function casting to f64 and calling `fmod`.
2574 if (Subtarget.is32Bit() &&
2575 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2576 // clang-format off
2577 for (ISD::NodeType Op :
2595 if (isOperationExpand(Op, MVT::f32))
2596 setOperationAction(Op, MVT::f32, Promote);
2597 // clang-format on
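  // Illustrative note (not from the original source): promoting f32 FREM means a
  // call such as fmodf(x, y) is effectively emitted as
  //   (float)fmod((double)x, (double)y);
  // which matches the MSVC CRT's inline fmodf wrapper described above.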
2598
2599 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2600 // it, but it's just a wrapper around ldexp.
2601 if (Subtarget.isOSWindows()) {
2603 if (isOperationExpand(Op, MVT::f32))
2604 setOperationAction(Op, MVT::f32, Promote);
2605 }
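  // Illustrative note (not from the original source): promoting the f32 ldexp
  // operation means ldexpf(x, n) is effectively emitted as
  //   (float)ldexp((double)x, n);
  // sidestepping the missing ldexpf symbol in the MSVC CRT.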
2606
2607 // We have target-specific dag combine patterns for the following nodes:
2618 ISD::SHL,
2619 ISD::SRA,
2620 ISD::SRL,
2621 ISD::OR,
2622 ISD::AND,
2628 ISD::ADD,
2629 ISD::FADD,
2630 ISD::FSUB,
2631 ISD::FNEG,
2632 ISD::FMA,
2636 ISD::SUB,
2637 ISD::LOAD,
2638 ISD::LRINT,
2640 ISD::MLOAD,
2641 ISD::STORE,
2655 ISD::SETCC,
2656 ISD::MUL,
2657 ISD::XOR,
2668
2670
2671 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2673 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2675 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
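  // Illustrative note (not from the original source): these thresholds let small
  // fixed-size memset/memcpy/memmove calls be expanded inline; e.g. a 64-byte
  // memset with a known length can become a handful of wide stores instead of a
  // libcall.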
2677
2678 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2679 // that needs to be benchmarked and balanced with the potential use of vector
2680 // load/store types (PR33329, PR33914).
2683
2684 // Default loop alignment, which can be overridden by -align-loops.
2686
2687 // An out-of-order CPU can speculatively execute past a predictable branch,
2688 // but a conditional move could be stalled by an expensive earlier operation.
2689 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2690 EnableExtLdPromotion = true;
2692
2694
2695 // Default to having -disable-strictnode-mutation on
2696 IsStrictFPEnabled = true;
2697}
2698
2699// This has so far only been implemented for 64-bit MachO.
2701 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2702}
2703
2705 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2706 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2707}
2708
2710 const SDLoc &DL) const {
2711 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2712 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2713 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2714 return SDValue(Node, 0);
2715}
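// Illustrative note on the helper above (not from the original source): the
// XOR32_FP/XOR64_FP pseudo is later expanded to XOR the loaded guard value with
// the frame pointer, matching the MSVC CRT behaviour described a few lines above.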
2716
2719 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2720 !Subtarget.hasBWI())
2721 return TypeSplitVector;
2722
2723 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2724 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2725 return TypeSplitVector;
2726
2727 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2728 VT.getVectorElementType() != MVT::i1)
2729 return TypeWidenVector;
2730
2732}
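// Illustrative note on the hook above (not from the original source): v32i1 and
// v64i1 are split on AVX512F-only targets since the wider mask operations need
// BWI; f16 vectors are split rather than widened when F16C is unavailable; most
// other short vectors (e.g. v3i32) are simply widened to the next legal type
// (v4i32).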
2733
2734FastISel *
2736 const TargetLibraryInfo *libInfo) const {
2737 return X86::createFastISel(funcInfo, libInfo);
2738}
2739
2740//===----------------------------------------------------------------------===//
2741// Other Lowering Hooks
2742//===----------------------------------------------------------------------===//
2743
2745 bool AssumeSingleUse) {
2746 if (!AssumeSingleUse && !Op.hasOneUse())
2747 return false;
2748 if (!ISD::isNormalLoad(Op.getNode()))
2749 return false;
2750
2751 // If this is an unaligned vector, make sure the target supports folding it.
2752 auto *Ld = cast<LoadSDNode>(Op.getNode());
2753 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2754 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2755 return false;
2756
2757 // TODO: If this is a non-temporal load and the target has an instruction
2758 // for it, it should not be folded. See "useNonTemporalLoad()".
2759
2760 return true;
2761}
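// Illustrative note on the helper above (not from the original source): without
// AVX (or the misaligned-SSE-mem feature), a 128-bit load with alignment < 16 is
// not foldable, since legacy SSE instructions such as addps require their memory
// operand to be 16-byte aligned.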
2762
2764 const X86Subtarget &Subtarget,
2765 bool AssumeSingleUse) {
2766 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2767 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2768 return false;
2769
2770 // We cannot replace a wide volatile load with a broadcast-from-memory,
2771 // because that would narrow the load, which isn't legal for volatiles.
2772 auto *Ld = cast<LoadSDNode>(Op.getNode());
2773 return !Ld->isVolatile() ||
2774 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2775}
2776
2778 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->user_begin());
2779}
2780
2782 if (Op.hasOneUse()) {
2783 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2784 return (ISD::ZERO_EXTEND == Opcode);
2785 }
2786 return false;
2787}
2788
2789static bool isLogicOp(unsigned Opcode) {
2790 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2791 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2792}
2793
2794static bool isTargetShuffle(unsigned Opcode) {
2795 switch(Opcode) {
2796 default: return false;
2797 case X86ISD::BLENDI:
2798 case X86ISD::PSHUFB:
2799 case X86ISD::PSHUFD:
2800 case X86ISD::PSHUFHW:
2801 case X86ISD::PSHUFLW:
2802 case X86ISD::SHUFP:
2803 case X86ISD::INSERTPS:
2804 case X86ISD::EXTRQI:
2805 case X86ISD::INSERTQI:
2806 case X86ISD::VALIGN:
2807 case X86ISD::PALIGNR:
2808 case X86ISD::VSHLDQ:
2809 case X86ISD::VSRLDQ:
2810 case X86ISD::MOVLHPS:
2811 case X86ISD::MOVHLPS:
2812 case X86ISD::MOVSHDUP:
2813 case X86ISD::MOVSLDUP:
2814 case X86ISD::MOVDDUP:
2815 case X86ISD::MOVSS:
2816 case X86ISD::MOVSD:
2817 case X86ISD::MOVSH:
2818 case X86ISD::UNPCKL:
2819 case X86ISD::UNPCKH:
2820 case X86ISD::VBROADCAST:
2821 case X86ISD::VPERMILPI:
2822 case X86ISD::VPERMILPV:
2823 case X86ISD::VPERM2X128:
2824 case X86ISD::SHUF128:
2825 case X86ISD::VPERMIL2:
2826 case X86ISD::VPERMI:
2827 case X86ISD::VPPERM:
2828 case X86ISD::VPERMV:
2829 case X86ISD::VPERMV3:
2830 case X86ISD::VZEXT_MOVL:
2831 return true;
2832 }
2833}
2834
2835static bool isTargetShuffleVariableMask(unsigned Opcode) {
2836 switch (Opcode) {
2837 default: return false;
2838 // Target Shuffles.
2839 case X86ISD::PSHUFB:
2840 case X86ISD::VPERMILPV:
2841 case X86ISD::VPERMIL2:
2842 case X86ISD::VPPERM:
2843 case X86ISD::VPERMV:
2844 case X86ISD::VPERMV3:
2845 return true;
2846 // 'Faux' Target Shuffles.
2847 case ISD::OR:
2848 case ISD::AND:
2849 case X86ISD::ANDNP:
2850 return true;
2851 }
2852}
2853
2856 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2858 int ReturnAddrIndex = FuncInfo->getRAIndex();
2859
2860 if (ReturnAddrIndex == 0) {
2861 // Set up a frame object for the return address.
2862 unsigned SlotSize = RegInfo->getSlotSize();
2863 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2864 -(int64_t)SlotSize,
2865 false);
2866 FuncInfo->setRAIndex(ReturnAddrIndex);
2867 }
2868
2869 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2870}
2871
2873 bool HasSymbolicDisplacement) {
2874 // The offset should fit into a 32-bit immediate field.
2875 if (!isInt<32>(Offset))
2876 return false;
2877
2878 // If we don't have a symbolic displacement, we don't have any extra
2879 // restrictions.
2880 if (!HasSymbolicDisplacement)
2881 return true;
2882
2883 // We can fold large offsets in the large code model because we always use
2884 // 64-bit offsets.
2885 if (CM == CodeModel::Large)
2886 return true;
2887
2888 // For the kernel code model we know that all objects reside in the negative
2889 // half of the 32-bit address space. We must not accept negative offsets, since
2890 // they could push the address out of range, but fairly large positive ones are fine.
2891 if (CM == CodeModel::Kernel)
2892 return Offset >= 0;
2893
2894 // For the other non-large code models we assume that the last small object ends
2895 // at least 16MB before the 2^31 boundary. We may also accept fairly large
2896 // negative constants, knowing that all objects are in the positive half of the
2897 // address space.
2898 return Offset < 16 * 1024 * 1024;
2899}
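// Illustrative example (not from the original source): under the small code
// model, a symbolic address plus an offset of 8 MiB passes this check, while an
// offset of 64 MiB is rejected because the final address might not fit the
// assumed 31-bit range.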
2900
2901 /// Return true if the condition is a signed comparison operation.
2902static bool isX86CCSigned(unsigned X86CC) {
2903 switch (X86CC) {
2904 default:
2905 llvm_unreachable("Invalid integer condition!");
2906 case X86::COND_E:
2907 case X86::COND_NE:
2908 case X86::COND_B:
2909 case X86::COND_A:
2910 case X86::COND_BE:
2911 case X86::COND_AE:
2912 return false;
2913 case X86::COND_G:
2914 case X86::COND_GE:
2915 case X86::COND_L:
2916 case X86::COND_LE:
2917 return true;
2918 }
2919}
2920
2922 switch (SetCCOpcode) {
2923 // clang-format off
2924 default: llvm_unreachable("Invalid integer condition!");
2925 case ISD::SETEQ: return X86::COND_E;
2926 case ISD::SETGT: return X86::COND_G;
2927 case ISD::SETGE: return X86::COND_GE;
2928 case ISD::SETLT: return X86::COND_L;
2929 case ISD::SETLE: return X86::COND_LE;
2930 case ISD::SETNE: return X86::COND_NE;
2931 case ISD::SETULT: return X86::COND_B;
2932 case ISD::SETUGT: return X86::COND_A;
2933 case ISD::SETULE: return X86::COND_BE;
2934 case ISD::SETUGE: return X86::COND_AE;
2935 // clang-format on
2936 }
2937}
2938
2939 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2940/// condition code, returning the condition code and the LHS/RHS of the
2941/// comparison to make.
2943 bool isFP, SDValue &LHS, SDValue &RHS,
2944 SelectionDAG &DAG) {
2945 if (!isFP) {
2946 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2947 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2948 // X > -1 -> X == 0, jump !sign.
2949 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2950 return X86::COND_NS;
2951 }
2952 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2953 // X < 0 -> X == 0, jump on sign.
2954 return X86::COND_S;
2955 }
2956 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2957 // X >= 0 -> X == 0, jump on !sign.
2958 return X86::COND_NS;
2959 }
2960 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2961 // X < 1 -> X <= 0
2962 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2963 return X86::COND_LE;
2964 }
2965 }
2966
2967 return TranslateIntegerX86CC(SetCCOpcode);
2968 }
2969
2970 // First determine if it is required or is profitable to flip the operands.
2971
2972 // If LHS is a foldable load, but RHS is not, flip the condition.
2973 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2974 !ISD::isNON_EXTLoad(RHS.getNode())) {
2975 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2976 std::swap(LHS, RHS);
2977 }
2978
2979 switch (SetCCOpcode) {
2980 default: break;
2981 case ISD::SETOLT:
2982 case ISD::SETOLE:
2983 case ISD::SETUGT:
2984 case ISD::SETUGE:
2985 std::swap(LHS, RHS);
2986 break;
2987 }
2988
2989 // On a floating point condition, the flags are set as follows:
2990 // ZF PF CF op
2991 // 0 | 0 | 0 | X > Y
2992 // 0 | 0 | 1 | X < Y
2993 // 1 | 0 | 0 | X == Y
2994 // 1 | 1 | 1 | unordered
2995 switch (SetCCOpcode) {
2996 // clang-format off
2997 default: llvm_unreachable("Condcode should be pre-legalized away");
2998 case ISD::SETUEQ:
2999 case ISD::SETEQ: return X86::COND_E;
3000 case ISD::SETOLT: // flipped
3001 case ISD::SETOGT:
3002 case ISD::SETGT: return X86::COND_A;
3003 case ISD::SETOLE: // flipped
3004 case ISD::SETOGE:
3005 case ISD::SETGE: return X86::COND_AE;
3006 case ISD::SETUGT: // flipped
3007 case ISD::SETULT:
3008 case ISD::SETLT: return X86::COND_B;
3009 case ISD::SETUGE: // flipped
3010 case ISD::SETULE:
3011 case ISD::SETLE: return X86::COND_BE;
3012 case ISD::SETONE:
3013 case ISD::SETNE: return X86::COND_NE;
3014 case ISD::SETUO: return X86::COND_P;
3015 case ISD::SETO: return X86::COND_NP;
3016 case ISD::SETOEQ:
3017 case ISD::SETUNE: return X86::COND_INVALID;
3018 // clang-format on
3019 }
3020}
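// Illustrative note on the translation above (not from the original source): for
// SETOLT the operands are swapped and COND_A is returned, because after the
// swapped ucomiss/comiss-style compare, CF==0 && ZF==0 holds exactly when the
// original LHS is ordered-less-than the RHS, and it fails on unordered inputs.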
3021
3022/// Is there a floating point cmov for the specific X86 condition code?
3023 /// The current x86 ISA includes the following FP cmov instructions:
3024 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3025static bool hasFPCMov(unsigned X86CC) {
3026 switch (X86CC) {
3027 default:
3028 return false;
3029 case X86::COND_B:
3030 case X86::COND_BE:
3031 case X86::COND_E:
3032 case X86::COND_P:
3033 case X86::COND_A:
3034 case X86::COND_AE:
3035 case X86::COND_NE:
3036 case X86::COND_NP:
3037 return true;
3038 }
3039}
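// Illustrative note (not from the original source): the x87 FCMOVcc family only
// tests CF/ZF/PF-based conditions, so signed conditions such as COND_G or COND_L
// have no FP cmov form and must be lowered with branches instead.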
3040
3041static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3042 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3043 VT.is512BitVector();
3044}
3045
3047 const CallInst &I,
3048 MachineFunction &MF,
3049 unsigned Intrinsic) const {
3051 Info.offset = 0;
3052
3053 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
3054 if (!IntrData) {
3055 switch (Intrinsic) {
3056 case Intrinsic::x86_aesenc128kl:
3057 case Intrinsic::x86_aesdec128kl:
3059 Info.ptrVal = I.getArgOperand(1);
3060 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3061 Info.align = Align(1);
3063 return true;
3064 case Intrinsic::x86_aesenc256kl:
3065 case Intrinsic::x86_aesdec256kl:
3067 Info.ptrVal = I.getArgOperand(1);
3068 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3069 Info.align = Align(1);
3071 return true;
3072 case Intrinsic::x86_aesencwide128kl:
3073 case Intrinsic::x86_aesdecwide128kl:
3075 Info.ptrVal = I.getArgOperand(0);
3076 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3077 Info.align = Align(1);
3079 return true;
3080 case Intrinsic::x86_aesencwide256kl:
3081 case Intrinsic::x86_aesdecwide256kl:
3083 Info.ptrVal = I.getArgOperand(0);
3084 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3085 Info.align = Align(1);
3087 return true;
3088 case Intrinsic::x86_cmpccxadd32:
3089 case Intrinsic::x86_cmpccxadd64:
3090 case Intrinsic::x86_atomic_bts:
3091 case Intrinsic::x86_atomic_btc:
3092 case Intrinsic::x86_atomic_btr: {
3094 Info.ptrVal = I.getArgOperand(0);
3095 unsigned Size = I.getType()->getScalarSizeInBits();