X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
51#include "llvm/IR/IRBuilder.h"
53#include "llvm/IR/Intrinsics.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
60#include "llvm/Support/Debug.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
74 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
76 "Sets the preferable loop alignment for experiments (as log2 bytes) "
77 "for innermost loops only. If specified, this option overrides "
78 "alignment set by x86-experimental-pref-loop-alignment."),
80
82 "x86-br-merging-base-cost", cl::init(2),
84 "Sets the cost threshold for when multiple conditionals will be merged "
85 "into one branch versus be split in multiple branches. Merging "
86 "conditionals saves branches at the cost of additional instructions. "
87 "This value sets the instruction cost limit, below which conditionals "
88 "will be merged, and above which conditionals will be split. Set to -1 "
89 "to never merge branches."),
91
93 "x86-br-merging-ccmp-bias", cl::init(6),
94 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
95 "supports conditional compare instructions."),
97
99 "x86-br-merging-likely-bias", cl::init(0),
100 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
101 "that all conditionals will be executed. For example for merging "
102 "the conditionals (a == b && c > d), if its known that a == b is "
103 "likely, then it is likely that if the conditionals are split "
104 "both sides will be executed, so it may be desirable to increase "
105 "the instruction cost threshold. Set to -1 to never merge likely "
106 "branches."),
107 cl::Hidden);
108
110 "x86-br-merging-unlikely-bias", cl::init(-1),
111 cl::desc(
112 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
113 "that all conditionals will be executed. For example for merging "
114 "the conditionals (a == b && c > d), if its known that a == b is "
115 "unlikely, then it is unlikely that if the conditionals are split "
116 "both sides will be executed, so it may be desirable to decrease "
117 "the instruction cost threshold. Set to -1 to never merge unlikely "
118 "branches."),
119 cl::Hidden);
120
122 "mul-constant-optimization", cl::init(true),
123 cl::desc("Replace 'mul x, Const' with more effective instructions like "
124 "SHIFT, LEA, etc."),
125 cl::Hidden);
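// NOTE: The `static cl::opt<...>` declaration heads for the options above are
// not shown in this listing; only the option strings, defaults and cl::desc
// text remain. As a minimal sketch, such a declaration generally has the
// following shape (the variable and option names here are hypothetical, not
// taken from this file):
//
//   static cl::opt<bool> ExampleFlag(
//       "x86-example-flag", cl::init(true),
//       cl::desc("Illustrative description of what the flag controls."),
//       cl::Hidden);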
126
127X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
128 const X86Subtarget &STI)
129 : TargetLowering(TM), Subtarget(STI) {
130 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
131 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
132
133 // Set up the TargetLowering object.
134
135 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139
140 // X86 instruction cache is coherent with its data cache so we can use the
141 // default expansion to a no-op.
143
144 // For 64-bit, since we have so many registers, use the ILP scheduler.
145 // For 32-bit, use the register pressure specific scheduling.
146 // For Atom, always use ILP scheduling.
147 if (Subtarget.isAtom())
149 else if (Subtarget.is64Bit())
151 else
153 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
155
156 // Bypass expensive divides and use cheaper ones.
157 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
158 if (Subtarget.hasSlowDivide32())
159 addBypassSlowDiv(32, 8);
160 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
161 addBypassSlowDiv(64, 32);
162 }
163
164 // Setup Windows compiler runtime calls.
165 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
166 static const struct {
167 const RTLIB::Libcall Op;
168 const char * const Name;
169 const CallingConv::ID CC;
170 } LibraryCalls[] = {
171 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
172 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
173 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
174 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
175 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
176 };
177
178 for (const auto &LC : LibraryCalls) {
179 setLibcallName(LC.Op, LC.Name);
180 setLibcallCallingConv(LC.Op, LC.CC);
181 }
182 }
183
184 if (Subtarget.canUseCMPXCHG16B())
186 else if (Subtarget.canUseCMPXCHG8B())
188 else
190
191 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
192
194
195 // Set up the register classes.
196 addRegisterClass(MVT::i8, &X86::GR8RegClass);
197 addRegisterClass(MVT::i16, &X86::GR16RegClass);
198 addRegisterClass(MVT::i32, &X86::GR32RegClass);
199 if (Subtarget.is64Bit())
200 addRegisterClass(MVT::i64, &X86::GR64RegClass);
201
202 for (MVT VT : MVT::integer_valuetypes())
204
205 // We don't accept any truncstore of integer registers.
206 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
207 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
208 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
209 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
210 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
211 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
212
213 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
214
215 // SETOEQ and SETUNE require checking two conditions.
216 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
219 }
220
221 // Integer absolute.
222 if (Subtarget.canUseCMOV()) {
223 setOperationAction(ISD::ABS , MVT::i16 , Custom);
224 setOperationAction(ISD::ABS , MVT::i32 , Custom);
225 if (Subtarget.is64Bit())
226 setOperationAction(ISD::ABS , MVT::i64 , Custom);
227 }
228
229 // Absolute difference.
230 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
231 setOperationAction(Op , MVT::i8 , Custom);
232 setOperationAction(Op , MVT::i16 , Custom);
233 setOperationAction(Op , MVT::i32 , Custom);
234 if (Subtarget.is64Bit())
235 setOperationAction(Op , MVT::i64 , Custom);
236 }
237
238 // Signed saturation subtraction.
242 if (Subtarget.is64Bit())
244
245 // Funnel shifts.
246 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
247 // For slow shld targets we only lower for code size.
248 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
249
250 setOperationAction(ShiftOp , MVT::i8 , Custom);
251 setOperationAction(ShiftOp , MVT::i16 , Custom);
252 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
253 if (Subtarget.is64Bit())
254 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
255 }
256
257 if (!Subtarget.useSoftFloat()) {
258 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
259 // operation.
264 // We have an algorithm for SSE2, and we turn this into a 64-bit
265 // FILD or VCVTUSI2SS/SD for other targets.
268 // We have an algorithm for SSE2->double, and we turn this into a
269 // 64-bit FILD followed by conditional FADD for other targets.
272
273 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
274 // this operation.
277 // SSE has no i16 to fp conversion, only i32. We promote in the handler
278 // to allow f80 to use i16 and f64 to use i16 with sse1 only
281 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
284 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
285 // are Legal, f80 is custom lowered.
288
289 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
290 // this operation.
292 // FIXME: This doesn't generate invalid exception when it should. PR44019.
298 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
299 // are Legal, f80 is custom lowered.
302
303 // Handle FP_TO_UINT by promoting the destination to a larger signed
304 // conversion.
306 // FIXME: This doesn't generate invalid exception when it should. PR44019.
309 // FIXME: This doesn't generate invalid exception when it should. PR44019.
315
320
321 if (!Subtarget.is64Bit()) {
324 }
325 }
326
327 if (Subtarget.hasSSE2()) {
328 // Custom lowering for saturating float to int conversions.
329 // We handle promotion to larger result types manually.
330 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
333 }
334 if (Subtarget.is64Bit()) {
337 }
338 }
339
340 // Handle address space casts between mixed sized pointers.
343
344 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
345 if (!Subtarget.hasSSE2()) {
348 if (Subtarget.is64Bit()) {
350 // Without SSE, i64->f64 goes through memory.
352 }
353 } else if (!Subtarget.is64Bit())
355
356 // Scalar integer divide and remainder are lowered to use operations that
357 // produce two results, to match the available instructions. This exposes
358 // the two-result form to trivial CSE, which is able to combine x/y and x%y
359 // into a single instruction.
360 //
361 // Scalar integer multiply-high is also lowered to use two-result
362 // operations, to match the available instructions. However, plain multiply
363 // (low) operations are left as Legal, as there are single-result
364 // instructions for this in x86. Using the two-result multiply instructions
365 // when both high and low results are needed must be arranged by dagcombine.
366 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
373 }
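// NOTE: The registrations inside the loop above are not shown. A hedged
// sketch of the kind of calls it presumably makes, marking the single-result
// forms Expand so the two-result SDIVREM/UDIVREM and [SU]MUL_LOHI lowering is
// used instead:
//
//   for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
//     setOperationAction(ISD::MULHS, VT, Expand);
//     setOperationAction(ISD::MULHU, VT, Expand);
//     setOperationAction(ISD::SDIV, VT, Expand);
//     setOperationAction(ISD::UDIV, VT, Expand);
//     setOperationAction(ISD::SREM, VT, Expand);
//     setOperationAction(ISD::UREM, VT, Expand);
//   }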
374
375 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
377 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
378 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
381 }
382 if (Subtarget.is64Bit())
387
388 setOperationAction(ISD::FREM , MVT::f32 , Expand);
389 setOperationAction(ISD::FREM , MVT::f64 , Expand);
390 setOperationAction(ISD::FREM , MVT::f80 , Expand);
391 setOperationAction(ISD::FREM , MVT::f128 , Expand);
392
393 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
399 }
400
401 // Promote the i8 variants and force them on up to i32 which has a shorter
402 // encoding.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
405 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
406 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
407 // promote that too.
408 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
410
411 if (!Subtarget.hasBMI()) {
412 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
414 if (Subtarget.is64Bit()) {
415 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
417 }
418 }
419
420 if (Subtarget.hasLZCNT()) {
421 // When promoting the i8 variants, force them to i32 for a shorter
422 // encoding.
423 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
425 } else {
426 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
427 if (VT == MVT::i64 && !Subtarget.is64Bit())
428 continue;
431 }
432 }
433
436 // Special handling for half-precision floating point conversions.
437 // If we don't have F16C support, then lower half float conversions
438 // into library calls.
440 Op, MVT::f32,
441 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
442 // There's never any support for operations beyond MVT::f32.
443 setOperationAction(Op, MVT::f64, Expand);
444 setOperationAction(Op, MVT::f80, Expand);
445 setOperationAction(Op, MVT::f128, Expand);
446 }
447
448 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
451 }
452
453 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
454 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
455 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
456 setTruncStoreAction(VT, MVT::f16, Expand);
457 setTruncStoreAction(VT, MVT::bf16, Expand);
458
461 }
462
466 if (Subtarget.is64Bit())
468 if (Subtarget.hasPOPCNT()) {
469 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
470 // popcntw is longer to encode than popcntl and also has a false dependency
471 // on the dest that popcntl hasn't had since Cannon Lake.
472 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
473 } else {
478 }
479
481
482 if (!Subtarget.hasMOVBE())
484
485 // X86 wants to expand cmov itself.
486 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
491 }
492 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
493 if (VT == MVT::i64 && !Subtarget.is64Bit())
494 continue;
497 }
498
499 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
502
504 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
505 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
509 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
510 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
511
512 // Darwin ABI issue.
513 for (auto VT : { MVT::i32, MVT::i64 }) {
514 if (VT == MVT::i64 && !Subtarget.is64Bit())
515 continue;
522 }
523
524 // 64-bit shl, sra, srl (iff 32-bit x86)
525 for (auto VT : { MVT::i32, MVT::i64 }) {
526 if (VT == MVT::i64 && !Subtarget.is64Bit())
527 continue;
531 }
532
533 if (Subtarget.hasSSEPrefetch())
535
537
538 // Expand certain atomics
539 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
547 }
548
549 if (!Subtarget.is64Bit())
551
552 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
553 // All CPUs supporting AVX will atomically load/store aligned 128-bit
554 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
557 }
558
559 if (Subtarget.canUseCMPXCHG16B())
561
562 // FIXME - use subtarget debug flags
563 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
564 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
565 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
567 }
568
571
574
575 setOperationAction(ISD::TRAP, MVT::Other, Legal);
577 if (Subtarget.isTargetPS())
579 else
581
582 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
584 setOperationAction(ISD::VAEND , MVT::Other, Expand);
585 bool Is64Bit = Subtarget.is64Bit();
586 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
587 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
588
591
593
594 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
597
599
600 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
601 setOperationAction(ISD::FABS, VT, Action);
602 setOperationAction(ISD::FNEG, VT, Action);
604 setOperationAction(ISD::FREM, VT, Action);
605 setOperationAction(ISD::FMA, VT, Action);
606 setOperationAction(ISD::FMINNUM, VT, Action);
607 setOperationAction(ISD::FMAXNUM, VT, Action);
610 setOperationAction(ISD::FSIN, VT, Action);
611 setOperationAction(ISD::FCOS, VT, Action);
612 setOperationAction(ISD::FSINCOS, VT, Action);
613 setOperationAction(ISD::FTAN, VT, Action);
614 setOperationAction(ISD::FSQRT, VT, Action);
615 setOperationAction(ISD::FPOW, VT, Action);
616 setOperationAction(ISD::FLOG, VT, Action);
617 setOperationAction(ISD::FLOG2, VT, Action);
618 setOperationAction(ISD::FLOG10, VT, Action);
619 setOperationAction(ISD::FEXP, VT, Action);
620 setOperationAction(ISD::FEXP2, VT, Action);
621 setOperationAction(ISD::FEXP10, VT, Action);
622 setOperationAction(ISD::FCEIL, VT, Action);
623 setOperationAction(ISD::FFLOOR, VT, Action);
625 setOperationAction(ISD::FRINT, VT, Action);
626 setOperationAction(ISD::BR_CC, VT, Action);
627 setOperationAction(ISD::SETCC, VT, Action);
630 setOperationAction(ISD::FROUND, VT, Action);
632 setOperationAction(ISD::FTRUNC, VT, Action);
633 setOperationAction(ISD::FLDEXP, VT, Action);
634 };
635
636 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
637 // f16, f32 and f64 use SSE.
638 // Set up the FP register classes.
639 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
640 : &X86::FR16RegClass);
641 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
642 : &X86::FR32RegClass);
643 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
644 : &X86::FR64RegClass);
645
646 // Disable f32->f64 extload as we can only generate this in one instruction
647 // under optsize. So it's easier to pattern match (fpext (load)) for that
648 // case instead of needing to emit 2 instructions for extload in the
649 // non-optsize case.
650 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
651
652 for (auto VT : { MVT::f32, MVT::f64 }) {
653 // Use ANDPD to simulate FABS.
655
656 // Use XORP to simulate FNEG.
658
659 // Use ANDPD and ORPD to simulate FCOPYSIGN.
661
662 // These might be better off as horizontal vector ops.
665
666 // We don't support sin/cos/fmod
670 }
671
672 // Half type will be promoted by default.
673 setF16Action(MVT::f16, Promote);
681
711
712 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
713 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
714
715 // Lower this to MOVMSK plus an AND.
718
719 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
720 (UseX87 || Is64Bit)) {
721 // Use SSE for f32, x87 for f64.
722 // Set up the FP register classes.
723 addRegisterClass(MVT::f32, &X86::FR32RegClass);
724 if (UseX87)
725 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
726
727 // Use ANDPS to simulate FABS.
729
730 // Use XORP to simulate FNEG.
732
733 if (UseX87)
735
736 // Use ANDPS and ORPS to simulate FCOPYSIGN.
737 if (UseX87)
740
741 // We don't support sin/cos/fmod
745
746 if (UseX87) {
747 // Always expand sin/cos functions even though x87 has an instruction.
751 }
752 } else if (UseX87) {
753 // f32 and f64 in x87.
754 // Set up the FP register classes.
755 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
756 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
757
758 for (auto VT : { MVT::f32, MVT::f64 }) {
761
762 // Always expand sin/cos functions even though x87 has an instruction.
766 }
767 }
768
769 // Expand FP32 immediates into loads from the stack, save special cases.
770 if (isTypeLegal(MVT::f32)) {
771 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
772 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
773 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
774 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
775 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
776 } else // SSE immediates.
777 addLegalFPImmediate(APFloat(+0.0f)); // xorps
778 }
779 // Expand FP64 immediates into loads from the stack, save special cases.
780 if (isTypeLegal(MVT::f64)) {
781 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
782 addLegalFPImmediate(APFloat(+0.0)); // FLD0
783 addLegalFPImmediate(APFloat(+1.0)); // FLD1
784 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
785 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
786 } else // SSE immediates.
787 addLegalFPImmediate(APFloat(+0.0)); // xorpd
788 }
789 // Support fp16 0 immediate.
790 if (isTypeLegal(MVT::f16))
791 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
792
793 // Handle constrained floating-point operations of scalar.
806
807 // We don't support FMA.
810
811 // f80 always uses X87.
812 if (UseX87) {
813 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
816 {
818 addLegalFPImmediate(TmpFlt); // FLD0
819 TmpFlt.changeSign();
820 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
821
822 bool ignored;
823 APFloat TmpFlt2(+1.0);
825 &ignored);
826 addLegalFPImmediate(TmpFlt2); // FLD1
827 TmpFlt2.changeSign();
828 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
829 }
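// NOTE: The construction of TmpFlt and the TmpFlt2.convert(...) call in the
// block above are not shown. A hedged sketch of how the four legal x87
// immediates (+/-0.0 and +/-1.0) are typically built with APFloat:
//
//   {
//     APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
//     addLegalFPImmediate(TmpFlt); // FLD0
//     TmpFlt.changeSign();
//     addLegalFPImmediate(TmpFlt); // FLD0/FCHS
//
//     bool ignored;
//     APFloat TmpFlt2(+1.0);
//     TmpFlt2.convert(APFloat::x87DoubleExtended(),
//                     APFloat::rmNearestTiesToEven, &ignored);
//     addLegalFPImmediate(TmpFlt2); // FLD1
//     TmpFlt2.changeSign();
//     addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
//   }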
830
831 // Always expand sin/cos functions even though x87 has an instruction.
832 // clang-format off
843 // clang-format on
844
856
857 // Handle constrained floating-point operations of scalar.
863 if (isTypeLegal(MVT::f16)) {
866 } else {
868 }
869 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
870 // as Custom.
872 }
873
874 // f128 uses xmm registers, but most operations require libcalls.
875 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
876 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
877 : &X86::VR128RegClass);
878
879 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
880
891
895
896 // clang-format off
904 // clang-format on
905 // No STRICT_FSINCOS
908
911 // We need to custom handle any FP_ROUND with an f128 input, but
912 // LegalizeDAG uses the result type to know when to run a custom handler.
913 // So we have to list all legal floating point result types here.
914 if (isTypeLegal(MVT::f32)) {
917 }
918 if (isTypeLegal(MVT::f64)) {
921 }
922 if (isTypeLegal(MVT::f80)) {
925 }
926
928
929 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
930 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
931 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
932 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
933 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
934 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
935 }
936
937 // Always use a library call for pow.
938 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
939 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
940 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
941 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
942
951
952 // Some FP actions are always expanded for vector types.
953 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
954 MVT::v4f32, MVT::v8f32, MVT::v16f32,
955 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
956 // clang-format off
970 // clang-format on
971 }
972
973 // First set operation action for all vector types to either promote
974 // (for widening) or expand (for scalarization). Then we will selectively
975 // turn on ones that can be effectively codegen'd.
1015 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1016 setTruncStoreAction(InnerVT, VT, Expand);
1017
1018 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1019 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1020
1021 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1022 // types; we have to deal with them whether we ask for Expansion or not.
1023 // Setting Expand causes its own optimisation problems though, so leave
1024 // them legal.
1025 if (VT.getVectorElementType() == MVT::i1)
1026 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1027
1028 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1029 // split/scalarized right now.
1030 if (VT.getVectorElementType() == MVT::f16 ||
1031 VT.getVectorElementType() == MVT::bf16)
1032 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1033 }
1034 }
1035
1036 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1037 // with -msoft-float, disable use of MMX as well.
1038 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1039 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1040 // No operations on x86mmx supported, everything uses intrinsics.
1041 }
1042
1043 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1044 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1045 : &X86::VR128RegClass);
1046
1049
1050 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1051 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1058
1059 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1060 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1061
1067 }
1068
1069 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1070 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1071 : &X86::VR128RegClass);
1072
1073 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1074 // registers cannot be used even for integer operations.
1075 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1076 : &X86::VR128RegClass);
1077 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1078 : &X86::VR128RegClass);
1079 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1080 : &X86::VR128RegClass);
1081 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1082 : &X86::VR128RegClass);
1083 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1084 : &X86::VR128RegClass);
1085
1086 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1089 }
1090
1091 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1092 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1097 }
1098
1099 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1100 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1101 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1102
1103 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1104 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1105 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1106 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1107 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1108 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1109 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1110 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1111 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1112 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1115
1116 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1117 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1118 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1119
1120 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1121 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1123
1124 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1125
1126 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1127 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1128 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1129 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1130 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1131 }
1132
1143
1148
1149 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1155
1156 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1157 // setcc all the way to isel and prefer SETGT in some isel patterns.
1160 }
1161
1162 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1163 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1168
1169 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1175 }
1176
1177 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1181
1182 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1183 continue;
1184
1187 }
1188 setF16Action(MVT::v8f16, Expand);
1189 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1190 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1191 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1192 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1193 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1194 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1196
1197 // Custom lower v2i64 and v2f64 selects.
1204
1211
1212 // Custom legalize these to avoid over promotion or custom promotion.
1213 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1218 }
1219
1224
1227
1230
1231 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1236
1241
1242 // We want to legalize this to an f64 load rather than an i64 load on
1243 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1244 // store.
1245 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1246 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1247 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1248 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1249 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1251
1252 // Add 32-bit vector stores to help vectorization opportunities.
1253 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1255
1259 if (!Subtarget.hasAVX512())
1261
1265
1267
1284
1285 // In the customized shift lowering, the legal v4i32/v2i64 cases
1286 // in AVX2 will be recognized.
1287 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1291 if (VT == MVT::v2i64) continue;
1296 }
1297
1303 }
1304
1305 if (Subtarget.hasGFNI()) {
1310 }
1311
1312 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1313 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1314 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1315 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1316
1317 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1320 }
1321
1322 // These might be better off as horizontal vector ops.
1327 }
1328
1329 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1330 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1333 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1337 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1343
1345 }
1346
1347 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1348 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1349 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1350 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1351 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1352 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1353 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1354 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1355
1359
1360 // FIXME: Do we need to handle scalar-to-vector here?
1361 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1362 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1363
1364 // We directly match byte blends in the backend as they match the VSELECT
1365 // condition form.
1367
1368 // SSE41 brings specific instructions for doing vector sign extend even in
1369 // cases where we don't have SRA.
1370 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1373 }
1374
1375 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1376 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1377 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1378 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1379 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1380 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1381 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1382 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1383 }
1384
1385 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1386 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1387 // do the pre and post work in the vector domain.
1390 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1391 // so that DAG combine doesn't try to turn it into uint_to_fp.
1394 }
1395 }
1396
1397 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1399 }
1400
1401 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1402 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1403 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1406 }
1407
1408 // XOP can efficiently perform BITREVERSE with VPPERM.
1409 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1411 }
1412
1413 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1414 bool HasInt256 = Subtarget.hasInt256();
1415
1416 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1417 : &X86::VR256RegClass);
1418 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1419 : &X86::VR256RegClass);
1420 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1421 : &X86::VR256RegClass);
1422 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1423 : &X86::VR256RegClass);
1424 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1425 : &X86::VR256RegClass);
1426 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1427 : &X86::VR256RegClass);
1428 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1429 : &X86::VR256RegClass);
1430
1431 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1444
1446
1450
1453 }
1454
1455 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1456 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1457
1458 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1459 // even though v8i16 is a legal type.
1460 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1461 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1462 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1463 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1467
1474
1486
1487 if (!Subtarget.hasAVX512())
1489
1490 // In the customized shift lowering, the legal v8i32/v4i64 cases
1491 // in AVX2 will be recognized.
1492 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1498 if (VT == MVT::v4i64) continue;
1503 }
1504
1505 // These types need custom splitting if their input is a 128-bit vector.
1510
1514 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1515 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1518
1519 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1523 }
1524
1529
1530 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1535
1536 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1537 // setcc all the way to isel and prefer SETGT in some isel patterns.
1540 }
1541
1542 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1543 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1548
1549 if (Subtarget.hasAnyFMA()) {
1550 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1551 MVT::v2f64, MVT::v4f64 }) {
1554 }
1555 }
1556
1557 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1558 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1559 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1560 }
1561
1562 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1563 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1564 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1565 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1566
1567 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1568 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1569 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1570 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1571 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1572 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1573 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1574 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1575
1576 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1577 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1578
1579 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1580 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1581 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1582 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1583 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1584
1585 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1586 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1587 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1588 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1589 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1590 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1591 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1592 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1597
1598 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1599 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1601 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1602 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1603 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1604 }
1605
1606 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1609 }
1610
1611 if (HasInt256) {
1612 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1613 // when we have a 256-bit-wide blend with immediate.
1616
1617 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1618 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1619 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1620 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1621 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1622 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1623 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1624 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1625 }
1626 }
1627
1628 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1629 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1630 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1632 }
1633
1634 // Extract subvector is special because the value type
1635 // (result) is 128-bit but the source is 256-bit wide.
1636 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1637 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1639 }
1640
1641 // Custom lower several nodes for 256-bit types.
1642 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1643 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1653 }
1654 setF16Action(MVT::v16f16, Expand);
1655 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1656 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1658 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1659 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1660 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1661 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1662
1663 if (HasInt256) {
1665
1666 // Custom legalize 2x32 to get a little better code.
1669
1670 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1671 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1673 }
1674 }
1675
1676 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1677 Subtarget.hasF16C()) {
1678 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1681 }
1682 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1685 }
1686 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1687 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1688 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1689 }
1690 }
1691
1692 // This block controls legalization of the mask vector sizes that are
1693 // available with AVX512. 512-bit vectors are in a separate block controlled
1694 // by useAVX512Regs.
1695 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1696 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1697 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1698 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1699 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1700 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1701
1705
1706 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1707 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1708 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1709 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1710 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1711 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1712 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1713 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1718
1719 // There is no byte-sized k-register load or store without AVX512DQ.
1720 if (!Subtarget.hasDQI()) {
1721 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1722 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1723 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1724 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1725
1730 }
1731
1732 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1733 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1737 }
1738
1739 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1741
1742 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1746
1753 }
1754
1755 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1757 }
1758 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1759 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1762 }
1763 }
1764
1765 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1766 // elements. 512-bit operations can be disabled based on prefer-vector-width and
1767 // required-vector-width function attributes.
1768 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1769 bool HasBWI = Subtarget.hasBWI();
1770
1771 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1772 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1773 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1774 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1775 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1776 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1777 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1778
1779 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1780 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1781 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1782 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1783 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1784 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1785 if (HasBWI)
1786 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1787 }
1788
1789 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1797 }
1798 setOperationAction(ISD::LRINT, MVT::v16f32,
1799 Subtarget.hasDQI() ? Legal : Custom);
1800 setOperationAction(ISD::LRINT, MVT::v8f64,
1801 Subtarget.hasDQI() ? Legal : Custom);
1802 if (Subtarget.hasDQI())
1803 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1804
1805 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1810 }
1811
1812 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1817 }
1818
1825
1837
1838 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1839 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1840 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1841 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1842 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1843 if (HasBWI)
1844 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1845
1846 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1847 // to 512-bit rather than use the AVX2 instructions so that we can use
1848 // k-masks.
1849 if (!Subtarget.hasVLX()) {
1850 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1851 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1854 }
1855 }
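// NOTE: The body of the loop above is not shown. It presumably marks the
// narrower masked load/store types Custom so they can be widened to 512 bits;
// a minimal sketch:
//
//   setOperationAction(ISD::MLOAD, VT, Custom);
//   setOperationAction(ISD::MSTORE, VT, Custom);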
1856
1858 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1859 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1869
1870 if (HasBWI) {
1871 // Extends from v64i1 masks to 512-bit vectors.
1875 }
1876
1877 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1890
1892 }
1893
1894 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1897 }
1898
1899 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1900 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1901 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1902 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1903
1904 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1905 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1906 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1907 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1908
1909 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1910 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1911 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1912 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1913 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1914 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1915 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1916 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1917
1918 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1919 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1920
1921 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1931
1932 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1933 // setcc all the way to isel and prefer SETGT in some isel patterns.
1936 }
1937
1938 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1939 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1944
1945 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1952 }
1953
1954 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1955 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1956 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1958 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1959 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1960 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1961 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1966 }
1967
1968 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1969 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1970 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1971 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1972 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1973 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1974
1975 if (Subtarget.hasDQI()) {
1979 setOperationAction(Opc, MVT::v8i64, Custom);
1980 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1981 }
1982
1983 if (Subtarget.hasCDI()) {
1984 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1985 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1987 }
1988 } // Subtarget.hasCDI()
1989
1990 if (Subtarget.hasVPOPCNTDQ()) {
1991 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1993 }
1994
1995 // Extract subvector is special because the value type
1996 // (result) is 256-bit but the source is 512-bit wide.
1997 // 128-bit was made Legal under AVX1.
1998 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1999 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2001
2002 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2003 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2013 }
2014 setF16Action(MVT::v32f16, Expand);
2019 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2020 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2021
2022 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2027 }
2028 if (HasBWI) {
2029 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2032 }
2033 } else {
2034 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2035 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2036 }
2037
2038 if (Subtarget.hasVBMI2()) {
2039 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2042 }
2043
2044 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2045 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2046 }
2047
2048 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2049 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2051 }// useAVX512Regs
2052
2053 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2054 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2055 MVT::v4i64}) {
2058 }
2059 }
2060
2061 // This block controls legalization for operations that don't have
2062 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2063 // narrower widths.
2064 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2065 // These operations are handled on non-VLX by artificially widening in
2066 // isel patterns.
2067
2071
2072 if (Subtarget.hasDQI()) {
2073 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2074 // v2f32 UINT_TO_FP is already custom under SSE2.
2077 "Unexpected operation action!");
2078 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2083 }
2084
2085 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2091 }
2092
2093 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2096 }
2097
2098 // Custom legalize 2x32 to get a little better code.
2101
2102 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2103 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2105
2106 if (Subtarget.hasDQI()) {
2110 setOperationAction(Opc, MVT::v2i64, Custom);
2111 setOperationAction(Opc, MVT::v4i64, Custom);
2112 }
2113 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2114 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2115 }
2116
2117 if (Subtarget.hasCDI()) {
2118 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2120 }
2121 } // Subtarget.hasCDI()
2122
2123 if (Subtarget.hasVPOPCNTDQ()) {
2124 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2126 }
2127 }
2128
2129 // This block controls legalization of v32i1/v64i1, which are available with
2130 // AVX512BW.
2131 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2132 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2133 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2134
2135 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2146 }
2147
2148 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2150
2151 // Extends from v32i1 masks to 256-bit vectors.
2155
2156 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2157 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2158 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2159 }
2160
2161 // These operations are handled on non-VLX by artificially widening in
2162 // isel patterns.
2163 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2164
2165 if (Subtarget.hasBITALG()) {
2166 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2168 }
2169 }
2170
2171 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2172 auto setGroup = [&] (MVT VT) {
2183
2196
2198
2201
2207
2213
2217 };
2218
2219 // AVX512_FP16 scalar operations
2220 setGroup(MVT::f16);
2234
2237
2238 if (Subtarget.useAVX512Regs()) {
2239 setGroup(MVT::v32f16);
2245 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2252
2257 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2259 MVT::v32i16);
2260 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2262 MVT::v32i16);
2263 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2265 MVT::v32i16);
2266 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2268 MVT::v32i16);
2269
2273
2274 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2275 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2276 }
2277
2278 if (Subtarget.hasVLX()) {
2279 setGroup(MVT::v8f16);
2280 setGroup(MVT::v16f16);
2281
2292
2303
2304 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2307
2311
2312 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2313 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2314 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2315 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2316
2317 // Need to custom widen these to prevent scalarization.
2318 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2319 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2320 }
2321 }
2322
2323 if (!Subtarget.useSoftFloat() &&
2324 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2325 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2326 : &X86::VR128RegClass);
2327 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2328 : &X86::VR256RegClass);
2329 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2330 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2331 // Set the operation action to Custom to do the customization later.
2334 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2335 setF16Action(VT, Expand);
2340 }
2341 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2342 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2343 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2344 }
2346 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2347 }
2348
2349 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2350 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2351 setF16Action(MVT::v32bf16, Expand);
2352 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2353 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2355 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2359 }
2360
2361 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2362 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2363 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2365 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2366 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2367
2368 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2369 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2370 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2371 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2372 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2373
2374 if (Subtarget.hasBWI()) {
2375 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2376 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2377 }
2378
2379 if (Subtarget.hasFP16()) {
2380 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2389 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2398 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2403 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2408 }
2409 }
2410
2411 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2412 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2413 }
2414
2415 // We want to custom lower some of our intrinsics.
2419 if (!Subtarget.is64Bit()) {
2421 }
2422
2423 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2424 // handle type legalization for these operations here.
2425 //
2426 // FIXME: We really should do custom legalization for addition and
2427 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2428 // than generic legalization for 64-bit multiplication-with-overflow, though.
2429 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2430 if (VT == MVT::i64 && !Subtarget.is64Bit())
2431 continue;
2432 // Add/Sub/Mul with overflow operations are custom lowered.
2439
2440 // Support carry in as value rather than glue.
2446 }
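// NOTE: The per-type registrations inside the loop above are not shown. As a
// hedged sketch, the overflow and carry nodes it refers to are typically
// marked Custom along these lines:
//
//   setOperationAction(ISD::SADDO, VT, Custom);
//   setOperationAction(ISD::UADDO, VT, Custom);
//   setOperationAction(ISD::SSUBO, VT, Custom);
//   setOperationAction(ISD::USUBO, VT, Custom);
//   setOperationAction(ISD::SMULO, VT, Custom);
//   setOperationAction(ISD::UMULO, VT, Custom);
//   // Carry in as a value rather than glue.
//   setOperationAction(ISD::UADDO_CARRY, VT, Custom);
//   setOperationAction(ISD::USUBO_CARRY, VT, Custom);
//   setOperationAction(ISD::SETCCCARRY, VT, Custom);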
2447
2448 // Combine sin / cos into _sincos_stret if it is available.
2449 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2450 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2453 }
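// NOTE: The body of the sincos block above is not shown. A hedged sketch of
// the custom FSINCOS registrations it presumably contains:
//
//   setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
//   setOperationAction(ISD::FSINCOS, MVT::f32, Custom);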
2454
2455 if (Subtarget.isTargetWin64()) {
2456 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2457 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2458 setOperationAction(ISD::SREM, MVT::i128, Custom);
2459 setOperationAction(ISD::UREM, MVT::i128, Custom);
2468 }
2469
2470 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2471 // is. We should promote the value to 64-bits to solve this.
2472 // This is what the CRT headers do - `fmodf` is an inline header
2473 // function casting to f64 and calling `fmod`.
2474 if (Subtarget.is32Bit() &&
2475 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2476 // clang-format off
2477 for (ISD::NodeType Op :
2494 if (isOperationExpand(Op, MVT::f32))
2495 setOperationAction(Op, MVT::f32, Promote);
2496 // clang-format on
2497
2498 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2499 // it, but it's just a wrapper around ldexp.
2500 if (Subtarget.isOSWindows()) {
2502 if (isOperationExpand(Op, MVT::f32))
2503 setOperationAction(Op, MVT::f32, Promote);
2504 }
2505
2506 // We have target-specific dag combine patterns for the following nodes:
2517 ISD::SHL,
2518 ISD::SRA,
2519 ISD::SRL,
2520 ISD::OR,
2521 ISD::AND,
2527 ISD::ADD,
2528 ISD::FADD,
2529 ISD::FSUB,
2530 ISD::FNEG,
2531 ISD::FMA,
2535 ISD::SUB,
2536 ISD::LOAD,
2537 ISD::LRINT,
2539 ISD::MLOAD,
2540 ISD::STORE,
2554 ISD::SETCC,
2555 ISD::MUL,
2556 ISD::XOR,
2567
2569
2570 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2572 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2574 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2576
2577 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2578 // that needs to be benchmarked and balanced with the potential use of vector
2579 // load/store types (PR33329, PR33914).
2582
2583 // Default loop alignment, which can be overridden by -align-loops.
2585
2586 // An out-of-order CPU can speculatively execute past a predictable branch,
2587 // but a conditional move could be stalled by an expensive earlier operation.
2588 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2589 EnableExtLdPromotion = true;
2591
2593
2594 // Default to having -disable-strictnode-mutation on
2595 IsStrictFPEnabled = true;
2596}
2597
2598// This has so far only been implemented for 64-bit MachO.
2600 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2601}
2602
2604 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2605 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2606}
2607
2609 const SDLoc &DL) const {
2610 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2611 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2612 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2613 return SDValue(Node, 0);
2614}
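// The XOR32_FP / XOR64_FP pseudos created here are expanded later into an
// ordinary XOR of the frame register into the guard value, so the value that
// ends up on the stack is roughly (illustrative):
//   cookie ^ frame-pointer
// which is the form the MSVC CRT's stack-guard check expects, per the comment
// above about MSVC CRTs XOR'ing the frame pointer.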
2615
2618 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2619 !Subtarget.hasBWI())
2620 return TypeSplitVector;
2621
2622 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2623 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2624 return TypeSplitVector;
2625
2626 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2627 VT.getVectorElementType() != MVT::i1)
2628 return TypeWidenVector;
2629
2631}
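// Examples of the decisions above (illustrative): v32i1 with AVX512F but no
// BWI is split into two v16i1 halves, since only 16-bit mask operations are
// available; an f16 vector without F16C is likewise split (eventually down to
// scalars handled by the generic half-float support); and an odd-sized type
// such as v3i32 is widened to v4i32.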
2632
2633FastISel *
2635 const TargetLibraryInfo *libInfo) const {
2636 return X86::createFastISel(funcInfo, libInfo);
2637}
2638
2639//===----------------------------------------------------------------------===//
2640// Other Lowering Hooks
2641//===----------------------------------------------------------------------===//
2642
2644 bool AssumeSingleUse) {
2645 if (!AssumeSingleUse && !Op.hasOneUse())
2646 return false;
2647 if (!ISD::isNormalLoad(Op.getNode()))
2648 return false;
2649
2650 // If this is an unaligned vector, make sure the target supports folding it.
2651 auto *Ld = cast<LoadSDNode>(Op.getNode());
2652 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2653 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2654 return false;
2655
2656 // TODO: If this is a non-temporal load and the target has an instruction
2657 // for it, it should not be folded. See "useNonTemporalLoad()".
2658
2659 return true;
2660}
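// This is queried when deciding whether a load can be folded into the memory
// operand of an instruction, e.g. (illustrative):
//   vaddps (%rdi), %xmm1, %xmm0   ; AVX form, no alignment requirement
//   addps  (%rdi), %xmm0          ; SSE form, requires 16-byte alignment,
//                                 ; hence the alignment check above.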
2661
2663 const X86Subtarget &Subtarget,
2664 bool AssumeSingleUse) {
2665 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2666 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2667 return false;
2668
2669 // We cannot replace a wide volatile load with a broadcast-from-memory,
2670 // because that would narrow the load, which isn't legal for volatiles.
2671 auto *Ld = cast<LoadSDNode>(Op.getNode());
2672 return !Ld->isVolatile() ||
2673 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2674}
2675
2677 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2678}
2679
2681 if (Op.hasOneUse()) {
2682 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2683 return (ISD::ZERO_EXTEND == Opcode);
2684 }
2685 return false;
2686}
2687
2688static bool isLogicOp(unsigned Opcode) {
2689 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2690 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2691}
2692
2693static bool isTargetShuffle(unsigned Opcode) {
2694 switch(Opcode) {
2695 default: return false;
2696 case X86ISD::BLENDI:
2697 case X86ISD::PSHUFB:
2698 case X86ISD::PSHUFD:
2699 case X86ISD::PSHUFHW:
2700 case X86ISD::PSHUFLW:
2701 case X86ISD::SHUFP:
2702 case X86ISD::INSERTPS:
2703 case X86ISD::EXTRQI:
2704 case X86ISD::INSERTQI:
2705 case X86ISD::VALIGN:
2706 case X86ISD::PALIGNR:
2707 case X86ISD::VSHLDQ:
2708 case X86ISD::VSRLDQ:
2709 case X86ISD::MOVLHPS:
2710 case X86ISD::MOVHLPS:
2711 case X86ISD::MOVSHDUP:
2712 case X86ISD::MOVSLDUP:
2713 case X86ISD::MOVDDUP:
2714 case X86ISD::MOVSS:
2715 case X86ISD::MOVSD:
2716 case X86ISD::MOVSH:
2717 case X86ISD::UNPCKL:
2718 case X86ISD::UNPCKH:
2719 case X86ISD::VBROADCAST:
2720 case X86ISD::VPERMILPI:
2721 case X86ISD::VPERMILPV:
2722 case X86ISD::VPERM2X128:
2723 case X86ISD::SHUF128:
2724 case X86ISD::VPERMIL2:
2725 case X86ISD::VPERMI:
2726 case X86ISD::VPPERM:
2727 case X86ISD::VPERMV:
2728 case X86ISD::VPERMV3:
2729 case X86ISD::VZEXT_MOVL:
2730 return true;
2731 }
2732}
2733
2734static bool isTargetShuffleVariableMask(unsigned Opcode) {
2735 switch (Opcode) {
2736 default: return false;
2737 // Target Shuffles.
2738 case X86ISD::PSHUFB:
2739 case X86ISD::VPERMILPV:
2740 case X86ISD::VPERMIL2:
2741 case X86ISD::VPPERM:
2742 case X86ISD::VPERMV:
2743 case X86ISD::VPERMV3:
2744 return true;
2745 // 'Faux' Target Shuffles.
2746 case ISD::OR:
2747 case ISD::AND:
2748 case X86ISD::ANDNP:
2749 return true;
2750 }
2751}
2752
2755 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2757 int ReturnAddrIndex = FuncInfo->getRAIndex();
2758
2759 if (ReturnAddrIndex == 0) {
2760 // Set up a frame object for the return address.
2761 unsigned SlotSize = RegInfo->getSlotSize();
2762 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2763 -(int64_t)SlotSize,
2764 false);
2765 FuncInfo->setRAIndex(ReturnAddrIndex);
2766 }
2767
2768 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2769}
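// The fixed object created above is a single pointer-sized slot holding the
// return address pushed by the CALL instruction; caching the index in
// X86MachineFunctionInfo lets later queries (e.g. when lowering
// @llvm.returnaddress) reuse the same frame index.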
2770
2772 bool HasSymbolicDisplacement) {
2773 // The offset should fit into a 32-bit immediate field.
2774 if (!isInt<32>(Offset))
2775 return false;
2776
2777 // If we don't have a symbolic displacement, we don't have any extra
2778 // restrictions.
2779 if (!HasSymbolicDisplacement)
2780 return true;
2781
2782 // We can fold large offsets in the large code model because we always use
2783 // 64-bit offsets.
2784 if (CM == CodeModel::Large)
2785 return true;
2786
2787 // For the kernel code model we know that all objects reside in the negative
2788 // half of the 32-bit address space. We must not accept negative offsets, since
2789 // they may be just out of range, but we may accept pretty large positive ones.
2790 if (CM == CodeModel::Kernel)
2791 return Offset >= 0;
2792
2793 // For the other non-large code models we assume that the last small object
2794 // ends at least 16MB before the end of the 31-bit boundary. We may also
2795 // accept pretty large negative constants, knowing that all objects are in
2796 // the positive half of the address space.
2797 return Offset < 16 * 1024 * 1024;
2798}
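// Worked example for the non-large case: if the last small-code-model object
// ends at least 16MB below the 2^31 boundary, then for any symbol S and any
// constant offset Off < 16MB the sum S + Off stays below 2^31 and therefore
// fits in the signed 32-bit displacement of an addressing mode; large
// negative offsets are fine as well, since all objects live in the positive
// half of the address space.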
2799
2800 /// Return true if the condition is a signed comparison operation.
2801static bool isX86CCSigned(unsigned X86CC) {
2802 switch (X86CC) {
2803 default:
2804 llvm_unreachable("Invalid integer condition!");
2805 case X86::COND_E:
2806 case X86::COND_NE:
2807 case X86::COND_B:
2808 case X86::COND_A:
2809 case X86::COND_BE:
2810 case X86::COND_AE:
2811 return false;
2812 case X86::COND_G:
2813 case X86::COND_GE:
2814 case X86::COND_L:
2815 case X86::COND_LE:
2816 return true;
2817 }
2818}
2819
2821 switch (SetCCOpcode) {
2822 // clang-format off
2823 default: llvm_unreachable("Invalid integer condition!");
2824 case ISD::SETEQ: return X86::COND_E;
2825 case ISD::SETGT: return X86::COND_G;
2826 case ISD::SETGE: return X86::COND_GE;
2827 case ISD::SETLT: return X86::COND_L;
2828 case ISD::SETLE: return X86::COND_LE;
2829 case ISD::SETNE: return X86::COND_NE;
2830 case ISD::SETULT: return X86::COND_B;
2831 case ISD::SETUGT: return X86::COND_A;
2832 case ISD::SETULE: return X86::COND_BE;
2833 case ISD::SETUGE: return X86::COND_AE;
2834 // clang-format on
2835 }
2836}
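// For example, ISD::SETULT maps to COND_B because after a CMP an unsigned
// "less than" is indicated by the carry flag, whereas ISD::SETLT maps to
// COND_L, which tests SF != OF for the signed comparison.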
2837
2838/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2839/// condition code, returning the condition code and the LHS/RHS of the
2840/// comparison to make.
2842 bool isFP, SDValue &LHS, SDValue &RHS,
2843 SelectionDAG &DAG) {
2844 if (!isFP) {
2845 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2846 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2847 // X > -1  -> compare X against 0, jump on !sign.
2848 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2849 return X86::COND_NS;
2850 }
2851 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2852 // X < 0   -> compare X against 0, jump on sign.
2853 return X86::COND_S;
2854 }
2855 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2856 // X >= 0  -> compare X against 0, jump on !sign.
2857 return X86::COND_NS;
2858 }
2859 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2860 // X < 1 -> X <= 0
2861 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2862 return X86::COND_LE;
2863 }
2864 }
2865
2866 return TranslateIntegerX86CC(SetCCOpcode);
2867 }
2868
2869 // First determine if it is required or is profitable to flip the operands.
2870
2871 // If LHS is a foldable load, but RHS is not, flip the condition.
2872 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2873 !ISD::isNON_EXTLoad(RHS.getNode())) {
2874 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2875 std::swap(LHS, RHS);
2876 }
2877
2878 switch (SetCCOpcode) {
2879 default: break;
2880 case ISD::SETOLT:
2881 case ISD::SETOLE:
2882 case ISD::SETUGT:
2883 case ISD::SETUGE:
2884 std::swap(LHS, RHS);
2885 break;
2886 }
2887
2888 // On a floating point condition, the flags are set as follows:
2889 // ZF | PF | CF | op
2890 // 0 | 0 | 0 | X > Y
2891 // 0 | 0 | 1 | X < Y
2892 // 1 | 0 | 0 | X == Y
2893 // 1 | 1 | 1 | unordered
2894 switch (SetCCOpcode) {
2895 // clang-format off
2896 default: llvm_unreachable("Condcode should be pre-legalized away");
2897 case ISD::SETUEQ:
2898 case ISD::SETEQ: return X86::COND_E;
2899 case ISD::SETOLT: // flipped
2900 case ISD::SETOGT:
2901 case ISD::SETGT: return X86::COND_A;
2902 case ISD::SETOLE: // flipped
2903 case ISD::SETOGE:
2904 case ISD::SETGE: return X86::COND_AE;
2905 case ISD::SETUGT: // flipped
2906 case ISD::SETULT:
2907 case ISD::SETLT: return X86::COND_B;
2908 case ISD::SETUGE: // flipped
2909 case ISD::SETULE:
2910 case ISD::SETLE: return X86::COND_BE;
2911 case ISD::SETONE:
2912 case ISD::SETNE: return X86::COND_NE;
2913 case ISD::SETUO: return X86::COND_P;
2914 case ISD::SETO: return X86::COND_NP;
2915 case ISD::SETOEQ:
2916 case ISD::SETUNE: return X86::COND_INVALID;
2917 // clang-format on
2918 }
2919}
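// Example of the operand flip above: comiss/ucomiss set the flags as in the
// table (an unsigned-style encoding), so only the A/AE/B/BE/E/NE/P/NP
// conditions are directly usable. SETOLT is therefore handled by swapping the
// operands and testing the reversed comparison with COND_A. SETOEQ and
// SETUNE would each need two flag tests (ZF together with PF), so they return
// COND_INVALID and the caller handles them with an extra check.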
2920
2921/// Is there a floating point cmov for the specific X86 condition code?
2922 /// The current x86 ISA includes the following FP cmov instructions:
2923 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2924static bool hasFPCMov(unsigned X86CC) {
2925 switch (X86CC) {
2926 default:
2927 return false;
2928 case X86::COND_B:
2929 case X86::COND_BE:
2930 case X86::COND_E:
2931 case X86::COND_P:
2932 case X86::COND_A:
2933 case X86::COND_AE:
2934 case X86::COND_NE:
2935 case X86::COND_NP:
2936 return true;
2937 }
2938}
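// These are exactly the conditions the eight x87 FCMOVcc forms can test: they
// read only CF, ZF and PF. Signed conditions such as COND_G or COND_L depend
// on SF/OF and have no fcmov encoding, so selects using them typically fall
// back to a branch sequence.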
2939
2940static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2941 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2942 VT.is512BitVector();
2943}
2944
2946 const CallInst &I,
2947 MachineFunction &MF,
2948 unsigned Intrinsic) const {
2950 Info.offset = 0;
2951
2952 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2953 if (!IntrData) {
2954 switch (Intrinsic) {
2955 case Intrinsic::x86_aesenc128kl:
2956 case Intrinsic::x86_aesdec128kl:
2958 Info.ptrVal = I.getArgOperand(1);
2959 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2960 Info.align = Align(1);
2962 return true;
2963 case Intrinsic::x86_aesenc256kl:
2964 case Intrinsic::x86_aesdec256kl:
2966 Info.ptrVal = I.getArgOperand(1);
2967 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2968 Info.align = Align(1);
2970 return true;
2971 case Intrinsic::x86_aesencwide128kl:
2972 case Intrinsic::x86_aesdecwide128kl:
2974 Info.ptrVal = I.getArgOperand(0);
2975 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2976 Info.align = Align(1);
2978 return true;
2979 case Intrinsic::x86_aesencwide256kl:
2980 case Intrinsic::x86_aesdecwide256kl:
2982 Info.ptrVal = I.getArgOperand(0);
2983 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2984 Info.align = Align(1);
2986 return true;
2987 case Intrinsic::x86_cmpccxadd32:
2988 case Intrinsic::x86_cmpccxadd64:
2989 case Intrinsic::x86_atomic_bts:
2990 case Intrinsic::x86_atomic_btc:
2991 case Intrinsic::x86_atomic_btr: {
2993 Info.ptrVal = I.getArgOperand(0);
2994 unsigned Size = I.getType()->getScalarSizeInBits();
2995 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2996 Info.align = Align(Size);
2999 return true;
3000 }
3001 case Intrinsic::x86_atomic_bts_rm:
3002 case Intrinsic::x86_atomic_btc_rm:
3003 case Intrinsic::x86_atomic_btr_rm: {
3005 Info.ptrVal = I.getArgOperand(0);
3006 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3007 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3008 Info.align = Align(Size);
3011 return true;
3012 }
3013 case Intrinsic::x86_aadd32:
3014 case Intrinsic::x86_aadd64:
3015 case Intrinsic::x86_aand32:
3016 case Intrinsic::x86_aand64:
3017 case Intrinsic::x86_aor32:
3018 case Intrinsic::x86_aor64:
3019 case Intrinsic::x86_axor32:
3020 case Intrinsic::x86_axor64:
3021 case Intrinsic::x86_atomic_add_cc:
3022 case Intrinsic::x86_atomic_sub_cc:
3023 case Intrinsic::x86_atomic_or_cc:
3024 case Intrinsic::x86_atomic_and_cc:
3025 case Intrinsic::x86_atomic_xor_cc: {
3027 Info.ptrVal = I.getArgOperand(0);
3028 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3029 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3030 Info.align = Align(Size);
3033 return true;
3034 }
3035 }
3036 return false;
3037 }
3038
3039 switch (IntrData->Type) {
3042 case TRUNCATE_TO_MEM_VI32: {
3044 Info.ptrVal = I.getArgOperand(0);
3045 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3047 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3048 ScalarVT = MVT::i8;
3049 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3050 ScalarVT = MVT::i16;
3051 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3052 ScalarVT = MVT::i32;
3053
3054 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3055 Info.align = Align(1);
3057 break;
3058 }
3059 case GATHER:
3060 case GATHER_AVX2: {
3062 Info.ptrVal = nullptr;
3063 MVT DataVT = MVT::getVT(I.getType());
3064 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3065 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3066 IndexVT.getVectorNumElements());
3067 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3068 Info.align = Align(1);
3070 break;
3071 }
3072 case SCATTER: {
3074 Info.ptrVal = nullptr;
3075 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3076 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3077 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3078 IndexVT.getVectorNumElements());
3079 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3080 Info.align = Align(1);
3082 break;
3083 }
3084 default:
3085 return false;
3086 }
3087
3088 return true;
3089}
3090
3091/// Returns true if the target can instruction select the
3092/// specified FP immediate natively. If false, the legalizer will
3093/// materialize the FP immediate as a load from a constant pool.
3095 bool ForCodeSize) const {
3096 for (const APFloat &FPImm : LegalFPImmediates)
3097 if (Imm.bitwiseIsEqual(FPImm))
3098 return true;
3099 return false;
3100}
3101
3103 ISD::LoadExtType ExtTy,
3104 EVT NewVT) const {
3105 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3106
3107 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3108 // relocation target a movq or addq instruction: don't let the load shrink.
3109 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3110 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3111 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3112 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3113
3114 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3115 // those uses are extracted directly into a store, then the extract + store
3116 // can be store-folded. Therefore, it's probably not worth splitting the load.
3117 EVT VT = Load->getValueType(0);
3118 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3119 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3120 // Skip uses of the chain value. Result 0 of the node is the load value.
3121 if (UI.getUse().getResNo() != 0)
3122 continue;
3123
3124 // If this use is not an extract + store, it's probably worth splitting.
3125 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3126 UI->use_begin()->getOpcode() != ISD::STORE)
3127 return true;
3128 }
3129 // All non-chain uses are extract + store.
3130 return false;
3131 }
3132
3133 return true;
3134}
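// Example for the extract + store case above (illustrative): a 256-bit load
// whose two 128-bit halves are each extracted and immediately stored can be
// selected as vextractf128 instructions with memory destinations, so keeping
// the wide load and folding the extracts into the stores is usually better
// than narrowing the load here.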
3135
3136/// Returns true if it is beneficial to convert a load of a constant
3137/// to just the constant itself.
3139 Type *Ty) const {
3140 assert(Ty->isIntegerTy());
3141
3142 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3143 if (BitSize == 0 || BitSize > 64)
3144 return false;
3145 return true;
3146}
3147