1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that X86 uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "X86ISelLowering.h"
16 #include "X86.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86TargetMachine.h"
23 #include "X86TargetObjectFile.h"
25 #include "llvm/ADT/SmallSet.h"
26 #include "llvm/ADT/Statistic.h"
27 #include "llvm/ADT/StringExtras.h"
28 #include "llvm/ADT/StringSwitch.h"
43 #include "llvm/IR/CallingConv.h"
44 #include "llvm/IR/Constants.h"
45 #include "llvm/IR/DerivedTypes.h"
46 #include "llvm/IR/DiagnosticInfo.h"
47 #include "llvm/IR/Function.h"
48 #include "llvm/IR/GlobalAlias.h"
49 #include "llvm/IR/GlobalVariable.h"
50 #include "llvm/IR/Instructions.h"
51 #include "llvm/IR/Intrinsics.h"
52 #include "llvm/MC/MCAsmInfo.h"
53 #include "llvm/MC/MCContext.h"
54 #include "llvm/MC/MCExpr.h"
55 #include "llvm/MC/MCSymbol.h"
57 #include "llvm/Support/Debug.h"
59 #include "llvm/Support/KnownBits.h"
62 #include <algorithm>
63 #include <bitset>
64 #include <cctype>
65 #include <numeric>
66 using namespace llvm;
67 
68 #define DEBUG_TYPE "x86-isel"
69 
70 STATISTIC(NumTailCalls, "Number of tail calls");
71 
73  "x86-experimental-pref-loop-alignment", cl::init(4),
74  cl::desc(
75  "Sets the preferable loop alignment for experiments (as log2 bytes)"
76  "(the last x86-experimental-pref-loop-alignment bits"
77  " of the loop header PC will be 0)."),
78  cl::Hidden);
79 
81  "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
82  cl::desc(
83  "Sets the preferable loop alignment for experiments (as log2 bytes) "
84  "for innermost loops only. If specified, this option overrides "
85  "alignment set by x86-experimental-pref-loop-alignment."),
86  cl::Hidden);
87 
89  "mul-constant-optimization", cl::init(true),
90  cl::desc("Replace 'mul x, Const' with more effective instructions like "
91  "SHIFT, LEA, etc."),
92  cl::Hidden);
93 
95  "x86-experimental-unordered-atomic-isel", cl::init(false),
96  cl::desc("Use LoadSDNode and StoreSDNode instead of "
97  "AtomicSDNode for unordered atomic loads and "
98  "stores respectively."),
99  cl::Hidden);
100 
101 /// Call this when the user attempts to do something unsupported, like
102 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
103 /// report_fatal_error, so calling code should attempt to recover without
104 /// crashing.
105 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
106  const char *Msg) {
108  DAG.getContext()->diagnose(
110 }
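// Illustrative note (not part of the original source): a typical call site
// appears later in this file when lowering return values, e.g.
//   if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg()))
//     errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
// The diagnostic goes through the LLVMContext, so compilation continues and
// the caller substitutes a safe register to avoid hitting asserts.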
111 
113  const X86Subtarget &STI)
114  : TargetLowering(TM), Subtarget(STI) {
115  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
116  X86ScalarSSEf64 = Subtarget.hasSSE2();
117  X86ScalarSSEf32 = Subtarget.hasSSE1();
118  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
119 
120  // Set up the TargetLowering object.
121 
122  // X86 is weird. It always uses i8 for shift amounts and setcc results.
124  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
126 
127  // For 64-bit, since we have so many registers, use the ILP scheduler.
128  // For 32-bit, use the register pressure specific scheduling.
129  // For Atom, always use ILP scheduling.
130  if (Subtarget.isAtom())
132  else if (Subtarget.is64Bit())
134  else
136  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
138 
139  // Bypass expensive divides and use cheaper ones.
140  if (TM.getOptLevel() >= CodeGenOpt::Default) {
141  if (Subtarget.hasSlowDivide32())
142  addBypassSlowDiv(32, 8);
143  if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
144  addBypassSlowDiv(64, 32);
145  }
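// Rough sketch (an assumption about how the bypass is expanded elsewhere, in
// CodeGenPrepare): addBypassSlowDiv(32, 8) allows a 32-bit division to be
// guarded by a cheap width test, roughly
//   if (((a | b) & ~0xffu) == 0)
//     q = (uint8_t)a / (uint8_t)b;   // fast 8-bit divide
//   else
//     q = a / b;                     // full-width divide
// so hardware with a slow wide DIV pays the full cost only when needed.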
146 
147  // Setup Windows compiler runtime calls.
148  if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
149  static const struct {
150  const RTLIB::Libcall Op;
151  const char * const Name;
152  const CallingConv::ID CC;
153  } LibraryCalls[] = {
154  { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
155  { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
156  { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
157  { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
158  { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
159  };
160 
161  for (const auto &LC : LibraryCalls) {
162  setLibcallName(LC.Op, LC.Name);
163  setLibcallCallingConv(LC.Op, LC.CC);
164  }
165  }
166 
167  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
168  // MSVCRT doesn't have powi; fall back to pow
169  setLibcallName(RTLIB::POWI_F32, nullptr);
170  setLibcallName(RTLIB::POWI_F64, nullptr);
171  }
172 
 173  // If we don't have cmpxchg8b (meaning this is a 386/486), limit the atomic
 174  // size to 32 bits so AtomicExpandPass will expand it and we don't need cmpxchg8b.
175  // FIXME: Should we be limiting the atomic size on other configs? Default is
176  // 1024.
177  if (!Subtarget.hasCmpxchg8b())
179 
180  // Set up the register classes.
181  addRegisterClass(MVT::i8, &X86::GR8RegClass);
182  addRegisterClass(MVT::i16, &X86::GR16RegClass);
183  addRegisterClass(MVT::i32, &X86::GR32RegClass);
184  if (Subtarget.is64Bit())
185  addRegisterClass(MVT::i64, &X86::GR64RegClass);
186 
187  for (MVT VT : MVT::integer_valuetypes())
189 
190  // We don't accept any truncstore of integer registers.
197 
199 
200  // SETOEQ and SETUNE require checking two conditions.
201  for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
204  }
205 
206  // Integer absolute.
207  if (Subtarget.hasCMov()) {
210  if (Subtarget.is64Bit())
212  }
213 
214  // Funnel shifts.
215  for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
216  // For slow shld targets we only lower for code size.
217  LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
218 
219  setOperationAction(ShiftOp , MVT::i8 , Custom);
220  setOperationAction(ShiftOp , MVT::i16 , Custom);
221  setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
222  if (Subtarget.is64Bit())
223  setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
224  }
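// For reference (not in the original source): the funnel shifts handled above
// follow the generic ISD semantics, roughly
//   FSHL(a, b, s) = (a << (s % BW)) | (b >> (BW - (s % BW)))
//   FSHR(a, b, s) = (a << (BW - (s % BW))) | (b >> (s % BW))
// for bit width BW, which is what the x86 SHLD/SHRD double-shift instructions
// compute; on subtargets where SHLD is slow, i32/i64 stay Custom so the
// double shift is only used when optimizing for size.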
225 
226  if (!Subtarget.useSoftFloat()) {
227  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
228  // operation.
233  // We have an algorithm for SSE2, and we turn this into a 64-bit
234  // FILD or VCVTUSI2SS/SD for other targets.
237  // We have an algorithm for SSE2->double, and we turn this into a
238  // 64-bit FILD followed by conditional FADD for other targets.
241 
242  // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
243  // this operation.
246  // SSE has no i16 to fp conversion, only i32. We promote in the handler
 247  // to allow f80 to use i16 and f64 to use i16 with SSE1 only.
250  // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
253  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
254  // are Legal, f80 is custom lowered.
257 
258  // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
259  // this operation.
261  // FIXME: This doesn't generate invalid exception when it should. PR44019.
267  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
268  // are Legal, f80 is custom lowered.
271 
272  // Handle FP_TO_UINT by promoting the destination to a larger signed
273  // conversion.
275  // FIXME: This doesn't generate invalid exception when it should. PR44019.
278  // FIXME: This doesn't generate invalid exception when it should. PR44019.
284 
289 
290  if (!Subtarget.is64Bit()) {
293  }
294  }
295 
296  if (Subtarget.hasSSE2()) {
297  // Custom lowering for saturating float to int conversions.
298  // We handle promotion to larger result types manually.
299  for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
302  }
303  if (Subtarget.is64Bit()) {
306  }
307  }
308 
309  // Handle address space casts between mixed sized pointers.
312 
313  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
314  if (!X86ScalarSSEf64) {
317  if (Subtarget.is64Bit()) {
319  // Without SSE, i64->f64 goes through memory.
321  }
322  } else if (!Subtarget.is64Bit())
324 
325  // Scalar integer divide and remainder are lowered to use operations that
326  // produce two results, to match the available instructions. This exposes
327  // the two-result form to trivial CSE, which is able to combine x/y and x%y
328  // into a single instruction.
329  //
330  // Scalar integer multiply-high is also lowered to use two-result
331  // operations, to match the available instructions. However, plain multiply
332  // (low) operations are left as Legal, as there are single-result
333  // instructions for this in x86. Using the two-result multiply instructions
334  // when both high and low results are needed must be arranged by dagcombine.
335  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
342  }
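// Illustrative example of the CSE benefit described above: for
//   q = x / y;  r = x % y;
// both expressions lower to the same two-result node (e.g. ISD::SDIVREM), so
// after CSE a single hardware DIV/IDIV yields both the quotient and the
// remainder (in EAX and EDX for 32-bit operands).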
343 
346  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
350  }
351  if (Subtarget.is64Bit())
356 
362 
 363  // Promote the i8 variants and force them up to i32, which has a shorter
364  // encoding.
367  if (!Subtarget.hasBMI()) {
372  if (Subtarget.is64Bit()) {
375  }
376  }
377 
378  if (Subtarget.hasLZCNT()) {
379  // When promoting the i8 variants, force them to i32 for a shorter
380  // encoding.
383  } else {
384  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
385  if (VT == MVT::i64 && !Subtarget.is64Bit())
386  continue;
389  }
390  }
391 
394  // Special handling for half-precision floating point conversions.
395  // If we don't have F16C support, then lower half float conversions
396  // into library calls.
398  Op, MVT::f32,
399  (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
400  // There's never any support for operations beyond MVT::f32.
404  }
405 
414 
416  if (Subtarget.hasPOPCNT()) {
418  } else {
422  if (Subtarget.is64Bit())
424  else
426 
429  if (Subtarget.is64Bit())
431  }
432 
434 
435  if (!Subtarget.hasMOVBE())
437 
438  // X86 wants to expand cmov itself.
439  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
444  }
445  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
446  if (VT == MVT::i64 && !Subtarget.is64Bit())
447  continue;
450  }
451 
452  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
455 
457  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
458  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
462  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
463  setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
464 
465  // Darwin ABI issue.
466  for (auto VT : { MVT::i32, MVT::i64 }) {
467  if (VT == MVT::i64 && !Subtarget.is64Bit())
468  continue;
475  }
476 
477  // 64-bit shl, sra, srl (iff 32-bit x86)
478  for (auto VT : { MVT::i32, MVT::i64 }) {
479  if (VT == MVT::i64 && !Subtarget.is64Bit())
480  continue;
484  }
485 
486  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
488 
490 
491  // Expand certain atomics
492  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
500  }
501 
502  if (!Subtarget.is64Bit())
504 
505  if (Subtarget.hasCmpxchg16b()) {
507  }
508 
509  // FIXME - use subtarget debug flags
510  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
511  !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
512  TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
514  }
515 
518 
521 
525 
526  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
529  bool Is64Bit = Subtarget.is64Bit();
532 
535 
537 
538  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
541 
542  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
543  // f32 and f64 use SSE.
544  // Set up the FP register classes.
545  addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
546  : &X86::FR32RegClass);
547  addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
548  : &X86::FR64RegClass);
549 
550  // Disable f32->f64 extload as we can only generate this in one instruction
 551  // under optsize. So it's easier to pattern match (fpext (load)) for that
552  // case instead of needing to emit 2 instructions for extload in the
553  // non-optsize case.
555 
556  for (auto VT : { MVT::f32, MVT::f64 }) {
557  // Use ANDPD to simulate FABS.
559 
560  // Use XORP to simulate FNEG.
562 
563  // Use ANDPD and ORPD to simulate FCOPYSIGN.
565 
566  // These might be better off as horizontal vector ops.
569 
570  // We don't support sin/cos/fmod
574  }
575 
576  // Lower this to MOVMSK plus an AND.
579 
580  } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
581  (UseX87 || Is64Bit)) {
582  // Use SSE for f32, x87 for f64.
583  // Set up the FP register classes.
584  addRegisterClass(MVT::f32, &X86::FR32RegClass);
585  if (UseX87)
586  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
587 
588  // Use ANDPS to simulate FABS.
590 
591  // Use XORP to simulate FNEG.
593 
594  if (UseX87)
596 
597  // Use ANDPS and ORPS to simulate FCOPYSIGN.
598  if (UseX87)
601 
602  // We don't support sin/cos/fmod
606 
607  if (UseX87) {
608  // Always expand sin/cos functions even though x87 has an instruction.
612  }
613  } else if (UseX87) {
614  // f32 and f64 in x87.
615  // Set up the FP register classes.
616  addRegisterClass(MVT::f64, &X86::RFP64RegClass);
617  addRegisterClass(MVT::f32, &X86::RFP32RegClass);
618 
619  for (auto VT : { MVT::f32, MVT::f64 }) {
622 
623  // Always expand sin/cos functions even though x87 has an instruction.
627  }
628  }
629 
630  // Expand FP32 immediates into loads from the stack, save special cases.
631  if (isTypeLegal(MVT::f32)) {
632  if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
633  addLegalFPImmediate(APFloat(+0.0f)); // FLD0
634  addLegalFPImmediate(APFloat(+1.0f)); // FLD1
635  addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
636  addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
637  } else // SSE immediates.
638  addLegalFPImmediate(APFloat(+0.0f)); // xorps
639  }
640  // Expand FP64 immediates into loads from the stack, save special cases.
641  if (isTypeLegal(MVT::f64)) {
642  if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
643  addLegalFPImmediate(APFloat(+0.0)); // FLD0
644  addLegalFPImmediate(APFloat(+1.0)); // FLD1
645  addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
646  addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
647  } else // SSE immediates.
648  addLegalFPImmediate(APFloat(+0.0)); // xorpd
649  }
650  // Handle constrained floating-point operations of scalar.
664 
665  // We don't support FMA.
668 
669  // f80 always uses X87.
670  if (UseX87) {
671  addRegisterClass(MVT::f80, &X86::RFP80RegClass);
674  {
676  addLegalFPImmediate(TmpFlt); // FLD0
677  TmpFlt.changeSign();
678  addLegalFPImmediate(TmpFlt); // FLD0/FCHS
679 
680  bool ignored;
681  APFloat TmpFlt2(+1.0);
683  &ignored);
684  addLegalFPImmediate(TmpFlt2); // FLD1
685  TmpFlt2.changeSign();
686  addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
687  }
688 
689  // Always expand sin/cos functions even though x87 has an instruction.
693 
704 
705  // Handle constrained floating-point operations of scalar.
712  // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
713  // as Custom.
715  }
716 
717  // f128 uses xmm registers, but most operations require libcalls.
718  if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
719  addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
720  : &X86::VR128RegClass);
721 
722  addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
723 
734 
738 
744  // No STRICT_FSINCOS
747 
750  // We need to custom handle any FP_ROUND with an f128 input, but
751  // LegalizeDAG uses the result type to know when to run a custom handler.
752  // So we have to list all legal floating point result types here.
753  if (isTypeLegal(MVT::f32)) {
756  }
757  if (isTypeLegal(MVT::f64)) {
760  }
761  if (isTypeLegal(MVT::f80)) {
764  }
765 
767 
774  }
775 
776  // Always use a library call for pow.
781 
789 
790  // Some FP actions are always expanded for vector types.
791  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
804  }
805 
806  // First set operation action for all vector types to either promote
807  // (for widening) or expand (for scalarization). Then we will selectively
808  // turn on ones that can be effectively codegen'd.
809  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
847  for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
848  setTruncStoreAction(InnerVT, VT, Expand);
849 
850  setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
851  setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
852 
853  // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 854  // types; we have to deal with them whether we ask for Expansion or not.
855  // Setting Expand causes its own optimisation problems though, so leave
856  // them legal.
857  if (VT.getVectorElementType() == MVT::i1)
858  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
859 
860  // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
861  // split/scalarized right now.
862  if (VT.getVectorElementType() == MVT::f16)
863  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
864  }
865  }
866 
867  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
868  // with -msoft-float, disable use of MMX as well.
869  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
870  addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
871  // No operations on x86mmx supported, everything uses intrinsics.
872  }
873 
874  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
875  addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
876  : &X86::VR128RegClass);
877 
886 
889 
895  }
896 
897  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
898  addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
899  : &X86::VR128RegClass);
900 
901  // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
902  // registers cannot be used even for integer operations.
903  addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
904  : &X86::VR128RegClass);
905  addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
906  : &X86::VR128RegClass);
907  addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
908  : &X86::VR128RegClass);
909  addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
910  : &X86::VR128RegClass);
911 
912  for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
918  }
919 
923 
934 
937 
941 
942  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
947  }
948 
959 
963 
964  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
970 
971  // The condition codes aren't legal in SSE/AVX and under AVX512 we use
972  // setcc all the way to isel and prefer SETGT in some isel patterns.
975  }
976 
977  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
983  }
984 
985  for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
989 
990  if (VT == MVT::v2i64 && !Subtarget.is64Bit())
991  continue;
992 
995  }
996 
997  // Custom lower v2i64 and v2f64 selects.
1003 
1008 
1009  // Custom legalize these to avoid over promotion or custom promotion.
1010  for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1015  }
1016 
1021 
1024 
1027 
1028  // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1033 
1038 
1039  // We want to legalize this to an f64 load rather than an i64 load on
1040  // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1041  // store.
1048 
1052  if (!Subtarget.hasAVX512())
1054 
1058 
1060 
1067 
1068  // In the customized shift lowering, the legal v4i32/v2i64 cases
1069  // in AVX2 will be recognized.
1070  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1074  }
1075 
1078 
1079  // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
1080  // shifts) is better.
1081  if (!Subtarget.useAVX512Regs() &&
1082  !(Subtarget.hasBWI() && Subtarget.hasVLX()))
1084 
1090  }
1091 
1092  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1101 
1102  // These might be better off as horizontal vector ops.
1107  }
1108 
1109  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1110  for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1111  setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1113  setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1115  setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1117  setOperationAction(ISD::FRINT, RoundedTy, Legal);
1123 
1124  setOperationAction(ISD::FROUND, RoundedTy, Custom);
1125  }
1126 
1135 
1137 
1138  // FIXME: Do we need to handle scalar-to-vector here?
1140 
1141  // We directly match byte blends in the backend as they match the VSELECT
1142  // condition form.
1144 
1145  // SSE41 brings specific instructions for doing vector sign extend even in
1146  // cases where we don't have SRA.
1147  for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1150  }
1151 
1152  // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1153  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1160  }
1161 
 1162  // i8 vectors are custom because the source register and source
 1163  // memory operand types are not the same width.
1165 
1166  if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
 1167  // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1168  // do the pre and post work in the vector domain.
1171  // We need to mark SINT_TO_FP as Custom even though we want to expand it
1172  // so that DAG combine doesn't try to turn it into uint_to_fp.
1175  }
1176  }
1177 
1178  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1180  }
1181 
1182  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1183  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1186 
1187  // XOP can efficiently perform BITREVERSE with VPPERM.
1188  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1190 
1191  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1194  }
1195 
1196  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1197  bool HasInt256 = Subtarget.hasInt256();
1198 
1199  addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1200  : &X86::VR256RegClass);
1201  addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1202  : &X86::VR256RegClass);
1203  addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1204  : &X86::VR256RegClass);
1205  addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1206  : &X86::VR256RegClass);
1207  addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1208  : &X86::VR256RegClass);
1209  addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1210  : &X86::VR256RegClass);
1211 
1212  for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1225 
1227 
1231  }
1232 
1233  // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1234  // even though v8i16 is a legal type.
1241 
1244 
1257 
1258  if (!Subtarget.hasAVX512())
1260 
1261  // In the customized shift lowering, the legal v8i32/v4i64 cases
1262  // in AVX2 will be recognized.
1263  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1267  }
1268 
1269  // These types need custom splitting if their input is a 128-bit vector.
1274 
1277 
 1278  // With BWI, expanding (and promoting the shifts) is better.
1279  if (!Subtarget.useBWIRegs())
1281 
1288 
1289  for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1293  }
1294 
1299 
1300  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1306 
1307  // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1308  // setcc all the way to isel and prefer SETGT in some isel patterns.
1311  }
1312 
1313  if (Subtarget.hasAnyFMA()) {
1314  for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1315  MVT::v2f64, MVT::v4f64 }) {
1318  }
1319  }
1320 
1321  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1322  setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1323  setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1324  }
1325 
1330 
1337 
1340 
1346 
1359 
1360  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1361  setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1362  setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1363  setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1364  setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1365  setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1366  }
1367 
1368  for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1371  }
1372 
1373  if (HasInt256) {
1374  // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
 1375  // when we have a 256-bit-wide blend with immediate.
1378 
1379  // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1380  for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1387  }
1388  }
1389 
1390  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1392  setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1394  }
1395 
1396  // Extract subvector is special because the value type
1397  // (result) is 128-bit but the source is 256-bit wide.
1398  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1399  MVT::v4f32, MVT::v2f64 }) {
1401  }
1402 
1403  // Custom lower several nodes for 256-bit types.
1405  MVT::v8f32, MVT::v4f64 }) {
1415  }
1416 
1417  if (HasInt256) {
1419 
1420  // Custom legalize 2x32 to get a little better code.
1423 
1424  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1427  }
1428  }
1429 
1430  // This block controls legalization of the mask vector sizes that are
1431  // available with AVX512. 512-bit vectors are in a separate block controlled
1432  // by useAVX512Regs.
1433  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1434  addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1435  addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1436  addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1437  addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1438  addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1439 
1443 
1456 
1457  // There is no byte sized k-register load or store without AVX512DQ.
1458  if (!Subtarget.hasDQI()) {
1463 
1468  }
1469 
1470  // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1471  for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1475  }
1476 
1477  for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1479 
1480  for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1486 
1493  }
1494 
1495  for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1497  }
1498 
1499  // This block controls legalization for 512-bit operations with 32/64 bit
1500  // elements. 512-bits can be disabled based on prefer-vector-width and
1501  // required-vector-width function attributes.
1502  if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1503  bool HasBWI = Subtarget.hasBWI();
1504 
1505  addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1506  addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1507  addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1508  addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1509  addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1510  addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1511 
1512  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1518  if (HasBWI)
1520  }
1521 
1522  for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1528  }
1529 
1530  for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1535  }
1544 
1557 
1563  if (HasBWI)
1565 
1566  // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1567  // to 512-bit rather than use the AVX2 instructions so that we can use
1568  // k-masks.
1569  if (!Subtarget.hasVLX()) {
1570  for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1574  }
1575  }
1576 
1590 
1591  if (HasBWI) {
1592  // Extends from v64i1 masks to 512-bit vectors.
1596  }
1597 
1598  for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1611 
1613  }
1614 
1615  for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1618  }
1619 
1624 
1629 
1636 
1639 
1641 
1642  for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1647 
1648  // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1649  // setcc all the way to isel and prefer SETGT in some isel patterns.
1652  }
1653  for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1664  }
1665 
1666  for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1667  setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1668  setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1670  setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1671  setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1672  setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1673  setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1674  setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1675  setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1676  setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1677  setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1678  }
1679 
1680  if (Subtarget.hasDQI()) {
1689 
1691  }
1692 
1693  if (Subtarget.hasCDI()) {
1694  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1695  for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1697  }
1698  } // Subtarget.hasCDI()
1699 
1700  if (Subtarget.hasVPOPCNTDQ()) {
1701  for (auto VT : { MVT::v16i32, MVT::v8i64 })
1703  }
1704 
1705  // Extract subvector is special because the value type
1706  // (result) is 256-bit but the source is 512-bit wide.
1707  // 128-bit was made Legal under AVX1.
1708  for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1711 
1712  for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1713  MVT::v16f32, MVT::v8f64 }) {
1723  }
1724 
1725  for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1730  }
1731  if (HasBWI) {
1732  for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1735  }
1736  } else {
1739  }
1740 
1741  if (Subtarget.hasVBMI2()) {
1742  for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1747  }
1748 
1753  }
1754  }// useAVX512Regs
1755 
1756  // This block controls legalization for operations that don't have
1757  // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1758  // narrower widths.
1759  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1760  // These operations are handled on non-VLX by artificially widening in
1761  // isel patterns.
1762 
1764  Subtarget.hasVLX() ? Legal : Custom);
1766  Subtarget.hasVLX() ? Legal : Custom);
1769  Subtarget.hasVLX() ? Legal : Custom);
1771  Subtarget.hasVLX() ? Legal : Custom);
1774  Subtarget.hasVLX() ? Legal : Custom);
1776  Subtarget.hasVLX() ? Legal : Custom);
1778  Subtarget.hasVLX() ? Legal : Custom);
1780  Subtarget.hasVLX() ? Legal : Custom);
1781 
1782  if (Subtarget.hasDQI()) {
1783  // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1784  // v2f32 UINT_TO_FP is already custom under SSE2.
1787  "Unexpected operation action!");
1788  // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1793  }
1794 
1795  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1801  }
1802 
1803  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1806  }
1807 
1808  // Custom legalize 2x32 to get a little better code.
1811 
1812  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1815 
1816  if (Subtarget.hasDQI()) {
1817  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1819  Subtarget.hasVLX() ? Legal : Custom);
1821  Subtarget.hasVLX() ? Legal : Custom);
1823  Subtarget.hasVLX() ? Legal : Custom);
1825  Subtarget.hasVLX() ? Legal : Custom);
1827  Subtarget.hasVLX() ? Legal : Custom);
1829  Subtarget.hasVLX() ? Legal : Custom);
1831  Subtarget.hasVLX() ? Legal : Custom);
1833  Subtarget.hasVLX() ? Legal : Custom);
1835  }
1836  }
1837 
1838  if (Subtarget.hasCDI()) {
1839  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1841  }
1842  } // Subtarget.hasCDI()
1843 
1844  if (Subtarget.hasVPOPCNTDQ()) {
1845  for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1847  }
1848  }
1849 
 1850  // This block controls legalization of v32i1/v64i1, which are available with
1851  // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1852  // useBWIRegs.
1853  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1854  addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1855  addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1856 
1857  for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1868  }
1869 
1870  for (auto VT : { MVT::v16i1, MVT::v32i1 })
1872 
1873  // Extends from v32i1 masks to 256-bit vectors.
1877 
1878  for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1879  setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1880  setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1881  }
1882 
1883  // These operations are handled on non-VLX by artificially widening in
1884  // isel patterns.
1885  // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1886 
1887  if (Subtarget.hasBITALG()) {
1888  for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1890  }
1891  }
1892 
1893  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1899 
1905 
1906  if (Subtarget.hasBWI()) {
1909  }
1910 
1914  }
1915 
1916  if (Subtarget.hasAMXTILE()) {
1917  addRegisterClass(MVT::x86amx, &X86::TILERegClass);
1918  }
1919 
1920  // We want to custom lower some of our intrinsics.
1924  if (!Subtarget.is64Bit()) {
1926  }
1927 
1928  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1929  // handle type legalization for these operations here.
1930  //
1931  // FIXME: We really should do custom legalization for addition and
1932  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1933  // than generic legalization for 64-bit multiplication-with-overflow, though.
1934  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1935  if (VT == MVT::i64 && !Subtarget.is64Bit())
1936  continue;
1937  // Add/Sub/Mul with overflow operations are custom lowered.
1944 
1945  // Support carry in as value rather than glue.
1951  }
1952 
1953  if (!Subtarget.is64Bit()) {
1954  // These libcalls are not available in 32-bit.
1955  setLibcallName(RTLIB::SHL_I128, nullptr);
1956  setLibcallName(RTLIB::SRL_I128, nullptr);
1957  setLibcallName(RTLIB::SRA_I128, nullptr);
1958  setLibcallName(RTLIB::MUL_I128, nullptr);
1959  }
1960 
1961  // Combine sin / cos into _sincos_stret if it is available.
1962  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1963  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1966  }
1967 
1968  if (Subtarget.isTargetWin64()) {
1973  }
1974 
1975  // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1976  // is. We should promote the value to 64-bits to solve this.
1977  // This is what the CRT headers do - `fmodf` is an inline header
1978  // function casting to f64 and calling `fmod`.
1979  if (Subtarget.is32Bit() &&
1980  (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
1981  for (ISD::NodeType Op :
1993 
1994  // We have target-specific dag combine patterns for the following nodes:
2044 
2046 
2047  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2049  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2051  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2053 
2054  // TODO: These control memcmp expansion in CGP and could be raised higher, but
 2055  // that needs to be benchmarked and balanced with the potential use of vector
2056  // load/store types (PR33329, PR33914).
2057  MaxLoadsPerMemcmp = 2;
2059 
2060  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
2062 
2063  // An out-of-order CPU can speculatively execute past a predictable branch,
2064  // but a conditional move could be stalled by an expensive earlier operation.
2065  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2066  EnableExtLdPromotion = true;
2068 
2070 
2071  // Default to having -disable-strictnode-mutation on
2072  IsStrictFPEnabled = true;
2073 }
2074 
2075 // This has so far only been implemented for 64-bit MachO.
2077  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2078 }
2079 
2081  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2082  return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2083 }
2084 
2086  const SDLoc &DL) const {
2087  EVT PtrTy = getPointerTy(DAG.getDataLayout());
2088  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2089  MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2090  return SDValue(Node, 0);
2091 }
2092 
2095  if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2096  !Subtarget.hasBWI())
2097  return TypeSplitVector;
2098 
2099  if (VT.getVectorNumElements() != 1 &&
2100  VT.getVectorElementType() != MVT::i1)
2101  return TypeWidenVector;
2102 
2104 }
2105 
2106 static std::pair<MVT, unsigned>
2108  const X86Subtarget &Subtarget) {
2109  // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2110  // convention is one that uses k registers.
2111  if (NumElts == 2)
2112  return {MVT::v2i64, 1};
2113  if (NumElts == 4)
2114  return {MVT::v4i32, 1};
2115  if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2117  return {MVT::v8i16, 1};
2118  if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2120  return {MVT::v16i8, 1};
2121  // v32i1 passes in ymm unless we have BWI and the calling convention is
2122  // regcall.
2123  if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2124  return {MVT::v32i8, 1};
2125  // Split v64i1 vectors if we don't have v64i8 available.
2126  if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2127  if (Subtarget.useAVX512Regs())
2128  return {MVT::v64i8, 1};
2129  return {MVT::v32i8, 2};
2130  }
2131 
2132  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2133  if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2134  NumElts > 64)
2135  return {MVT::i8, NumElts};
2136 
2137  return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2138 }
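// Worked examples (illustrative): with AVX-512 but no BWI, a v32i1 argument
// comes back as {MVT::v32i8, 1}, i.e. a single YMM register; with BWI but
// without 512-bit registers, a v64i1 argument under a non-RegCall convention
// is split as {MVT::v32i8, 2}; a non-power-of-two type such as v5i1 falls
// through to {MVT::i8, 5}, i.e. one scalar per element.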
2139 
2141  CallingConv::ID CC,
2142  EVT VT) const {
2143  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2144  Subtarget.hasAVX512()) {
2145  unsigned NumElts = VT.getVectorNumElements();
2146 
2147  MVT RegisterVT;
2148  unsigned NumRegisters;
2149  std::tie(RegisterVT, NumRegisters) =
2150  handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2151  if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2152  return RegisterVT;
2153  }
2154 
2156 }
2157 
2159  CallingConv::ID CC,
2160  EVT VT) const {
2161  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2162  Subtarget.hasAVX512()) {
2163  unsigned NumElts = VT.getVectorNumElements();
2164 
2165  MVT RegisterVT;
2166  unsigned NumRegisters;
2167  std::tie(RegisterVT, NumRegisters) =
2168  handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2169  if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2170  return NumRegisters;
2171  }
2172 
2174 }
2175 
2177  LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2178  unsigned &NumIntermediates, MVT &RegisterVT) const {
2179  // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2180  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2181  Subtarget.hasAVX512() &&
2183  (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2184  VT.getVectorNumElements() > 64)) {
2185  RegisterVT = MVT::i8;
2186  IntermediateVT = MVT::i1;
2187  NumIntermediates = VT.getVectorNumElements();
2188  return NumIntermediates;
2189  }
2190 
2191  // Split v64i1 vectors if we don't have v64i8 available.
2192  if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2193  CC != CallingConv::X86_RegCall) {
2194  RegisterVT = MVT::v32i8;
2195  IntermediateVT = MVT::v32i1;
2196  NumIntermediates = 2;
2197  return 2;
2198  }
2199 
2200  return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2201  NumIntermediates, RegisterVT);
2202 }
2203 
2206  EVT VT) const {
2207  if (!VT.isVector())
2208  return MVT::i8;
2209 
2210  if (Subtarget.hasAVX512()) {
2211  // Figure out what this type will be legalized to.
2212  EVT LegalVT = VT;
2213  while (getTypeAction(Context, LegalVT) != TypeLegal)
2214  LegalVT = getTypeToTransformTo(Context, LegalVT);
2215 
2216  // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2217  if (LegalVT.getSimpleVT().is512BitVector())
2219 
2220  if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2221  // If we legalized to less than a 512-bit vector, then we will use a vXi1
2222  // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2223  // vXi16/vXi8.
2224  MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2225  if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2227  }
2228  }
2229 
2231 }
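// Illustrative examples (following the comments above): without AVX-512 the
// setcc result is the compared vector type with its elements turned into
// integers of the same width (e.g. v4i32 for a v4f32 compare), and plain i8
// for scalars; with AVX-512, a compare that legalizes to a 512-bit vector
// yields a vXi1 mask, as does a narrower vXi32/vXi64 compare under VLX (plus
// vXi16/vXi8 when BWI is available).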
2232 
2233 /// Helper for getByValTypeAlignment to determine
2234 /// the desired ByVal argument alignment.
2235 static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2236  if (MaxAlign == 16)
2237  return;
2238  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2239  if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
2240  MaxAlign = Align(16);
2241  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2242  Align EltAlign;
2243  getMaxByValAlign(ATy->getElementType(), EltAlign);
2244  if (EltAlign > MaxAlign)
2245  MaxAlign = EltAlign;
2246  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2247  for (auto *EltTy : STy->elements()) {
2248  Align EltAlign;
2249  getMaxByValAlign(EltTy, EltAlign);
2250  if (EltAlign > MaxAlign)
2251  MaxAlign = EltAlign;
2252  if (MaxAlign == 16)
2253  break;
2254  }
2255  }
2256 }
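// Illustrative example (hypothetical types): for
//   struct S { int I; __m128 V; };
// the recursion above finds the 128-bit vector member and raises MaxAlign to
// 16, so the byval aggregate is placed on a 16-byte boundary; a struct of
// plain scalars keeps the 4-byte default used on 32-bit x86 (see below).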
2257 
2258 /// Return the desired alignment for ByVal aggregate
2259 /// function arguments in the caller parameter area. For X86, aggregates
2260 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
2261 /// are at 4-byte boundaries.
2263  const DataLayout &DL) const {
2264  if (Subtarget.is64Bit()) {
2265  // Max of 8 and alignment of type.
2266  Align TyAlign = DL.getABITypeAlign(Ty);
2267  if (TyAlign > 8)
2268  return TyAlign.value();
2269  return 8;
2270  }
2271 
2272  Align Alignment(4);
2273  if (Subtarget.hasSSE1())
2274  getMaxByValAlign(Ty, Alignment);
2275  return Alignment.value();
2276 }
2277 
2278 /// It returns EVT::Other if the type should be determined using generic
2279 /// target-independent logic.
2280 /// For vector ops we check that the overall size isn't larger than our
2281 /// preferred vector width.
2283  const MemOp &Op, const AttributeList &FuncAttributes) const {
2284  if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
2285  if (Op.size() >= 16 &&
2286  (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2287  // FIXME: Check if unaligned 64-byte accesses are slow.
2288  if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2289  (Subtarget.getPreferVectorWidth() >= 512)) {
2290  return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2291  }
2292  // FIXME: Check if unaligned 32-byte accesses are slow.
2293  if (Op.size() >= 32 && Subtarget.hasAVX() &&
2294  (Subtarget.getPreferVectorWidth() >= 256)) {
2295  // Although this isn't a well-supported type for AVX1, we'll let
2296  // legalization and shuffle lowering produce the optimal codegen. If we
2297  // choose an optimal type with a vector element larger than a byte,
2298  // getMemsetStores() may create an intermediate splat (using an integer
2299  // multiply) before we splat as a vector.
2300  return MVT::v32i8;
2301  }
2302  if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2303  return MVT::v16i8;
2304  // TODO: Can SSE1 handle a byte vector?
2305  // If we have SSE1 registers we should be able to use them.
2306  if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2307  (Subtarget.getPreferVectorWidth() >= 128))
2308  return MVT::v4f32;
2309  } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2310  Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2311  // Do not use f64 to lower memcpy if source is string constant. It's
2312  // better to use i32 to avoid the loads.
2313  // Also, do not use f64 to lower memset unless this is a memset of zeros.
2314  // The gymnastics of splatting a byte value into an XMM register and then
2315  // only using 8-byte stores (because this is a CPU with slow unaligned
2316  // 16-byte accesses) makes that a loser.
2317  return MVT::f64;
2318  }
2319  }
2320  // This is a compromise. If we reach here, unaligned accesses may be slow on
2321  // this target. However, creating smaller, aligned accesses could be even
2322  // slower and would certainly be a lot more code.
2323  if (Subtarget.is64Bit() && Op.size() >= 8)
2324  return MVT::i64;
2325  return MVT::i32;
2326 }
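// Illustrative examples of the selection above: a 64-byte aligned memcpy on
// an AVX-512 subtarget that prefers 512-bit vectors returns MVT::v64i8 (with
// BWI) or MVT::v16i32; a 32-byte memset with AVX returns MVT::v32i8; SSE2
// targets get MVT::v16i8; with no usable vectors the fallback is MVT::i64 on
// 64-bit targets (for ops of at least 8 bytes) or MVT::i32 otherwise.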
2327 
2329  if (VT == MVT::f32)
2330  return X86ScalarSSEf32;
2331  if (VT == MVT::f64)
2332  return X86ScalarSSEf64;
2333  return true;
2334 }
2335 
2337  EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2338  bool *Fast) const {
2339  if (Fast) {
2340  switch (VT.getSizeInBits()) {
2341  default:
2342  // 8-byte and under are always assumed to be fast.
2343  *Fast = true;
2344  break;
2345  case 128:
2346  *Fast = !Subtarget.isUnalignedMem16Slow();
2347  break;
2348  case 256:
2349  *Fast = !Subtarget.isUnalignedMem32Slow();
2350  break;
2351  // TODO: What about AVX-512 (512-bit) accesses?
2352  }
2353  }
2354  // NonTemporal vector memory ops must be aligned.
2355  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
 2356  // NT loads can only be vector aligned, so if it's less aligned than the
2357  // minimum vector size (which we can split the vector down to), we might as
2358  // well use a regular unaligned vector load.
2359  // We don't have any NT loads pre-SSE41.
2360  if (!!(Flags & MachineMemOperand::MOLoad))
2361  return (Alignment < 16 || !Subtarget.hasSSE41());
2362  return false;
2363  }
2364  // Misaligned accesses of any size are always allowed.
2365  return true;
2366 }
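// Illustrative consequence of the non-temporal rule above: a 128-bit NT load
// with less than 16-byte alignment (or on a pre-SSE4.1 target) is reported as
// an acceptable misaligned access, so it simply becomes a regular unaligned
// vector load instead of MOVNTDQA, while misaligned NT vector stores are
// always rejected here.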
2367 
2368 /// Return the entry encoding for a jump table in the
2369 /// current function. The returned value is a member of the
2370 /// MachineJumpTableInfo::JTEntryKind enum.
2372  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2373  // symbol.
2374  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2376 
2377  // Otherwise, use the normal jump table encoding heuristics.
2379 }
2380 
2382  return Subtarget.useSoftFloat();
2383 }
2384 
2386  ArgListTy &Args) const {
2387 
2388  // Only relabel X86-32 for C / Stdcall CCs.
2389  if (Subtarget.is64Bit())
2390  return;
2391  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2392  return;
2393  unsigned ParamRegs = 0;
2394  if (auto *M = MF->getFunction().getParent())
2395  ParamRegs = M->getNumberRegisterParameters();
2396 
 2397  // Mark the first N integer arguments as being passed in registers.
2398  for (auto &Arg : Args) {
2399  Type *T = Arg.Ty;
2400  if (T->isIntOrPtrTy())
2401  if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2402  unsigned numRegs = 1;
2403  if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2404  numRegs = 2;
2405  if (ParamRegs < numRegs)
2406  return;
2407  ParamRegs -= numRegs;
2408  Arg.IsInReg = true;
2409  }
2410  }
2411 }
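// Illustrative example (assuming -mregparm is what populates the module's
// register-parameter count): for a 32-bit C library call, the loop above
// marks leading integer/pointer arguments IsInReg until the register budget
// runs out, with a 64-bit integer consuming two registers, so libcalls
// synthesized by the backend follow the same inreg convention as user code.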
2412 
2413 const MCExpr *
2415  const MachineBasicBlock *MBB,
 2416  unsigned uid, MCContext &Ctx) const {
2417  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2418  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2419  // entries.
2422 }
2423 
2424 /// Returns relocation base for the given PIC jumptable.
2426  SelectionDAG &DAG) const {
2427  if (!Subtarget.is64Bit())
2428  // This doesn't have SDLoc associated with it, but is not really the
2429  // same as a Register.
2430  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2431  getPointerTy(DAG.getDataLayout()));
2432  return Table;
2433 }
2434 
2435 /// This returns the relocation base for the given PIC jumptable,
2436 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2439  MCContext &Ctx) const {
2440  // X86-64 uses RIP relative addressing based on the jump table label.
2441  if (Subtarget.isPICStyleRIPRel())
2442  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2443 
2444  // Otherwise, the reference is relative to the PIC base.
2445  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2446 }
2447 
2448 std::pair<const TargetRegisterClass *, uint8_t>
2450  MVT VT) const {
2451  const TargetRegisterClass *RRC = nullptr;
2452  uint8_t Cost = 1;
2453  switch (VT.SimpleTy) {
2454  default:
2456  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2457  RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2458  break;
2459  case MVT::x86mmx:
2460  RRC = &X86::VR64RegClass;
2461  break;
2462  case MVT::f32: case MVT::f64:
2463  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2464  case MVT::v4f32: case MVT::v2f64:
2465  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2466  case MVT::v8f32: case MVT::v4f64:
2467  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2468  case MVT::v16f32: case MVT::v8f64:
2469  RRC = &X86::VR128XRegClass;
2470  break;
2471  }
2472  return std::make_pair(RRC, Cost);
2473 }
2474 
2475 unsigned X86TargetLowering::getAddressSpace() const {
2476  if (Subtarget.is64Bit())
2477  return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2478  return 256;
2479 }
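// For reference (an assumption based on the usual X86 segment address-space
// convention): address space 256 corresponds to %gs and 257 to %fs, so 64-bit
// user code uses %fs, the 64-bit Kernel code model uses %gs, and 32-bit code
// always uses %gs.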
2480 
2481 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2482  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2483  (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2484 }
2485 
2487  unsigned Offset, unsigned AddressSpace) {
2491 }
2492 
2494  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2495  // tcbhead_t; use it instead of the usual global variable (see
2496  // sysdeps/{i386,x86_64}/nptl/tls.h)
2497  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2498  if (Subtarget.isTargetFuchsia()) {
2499  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2500  return SegmentOffset(IRB, 0x10, getAddressSpace());
2501  } else {
2502  unsigned AddressSpace = getAddressSpace();
 2503  // In particular, some users may customize the base register and offset.
2505  // If we don't set -stack-protector-guard-offset value:
2506  // %fs:0x28, unless we're using a Kernel code model, in which case
2507  // it's %gs:0x28. gs:0x14 on i386.
2508  if (Offset == (unsigned)-1)
2509  Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2510 
2511  const auto &GuardReg = getTargetMachine().Options.StackProtectorGuardReg;
2512  if (GuardReg == "fs")
2514  else if (GuardReg == "gs")
2516  return SegmentOffset(IRB, Offset, AddressSpace);
2517  }
2518  }
2519  return TargetLowering::getIRStackGuard(IRB);
2520 }
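// Illustrative result (with the default offsets above): on 64-bit Linux this
// returns an address-space-257 pointer, so the cookie is loaded from
// %fs:0x28; on i386 it is %gs:0x14. Roughly, the resulting IR looks like
//   %cookie = load i8*, i8* addrspace(257)* inttoptr (i32 40 to i8* addrspace(257)*)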
2521 
 2523  // MSVC CRT provides functionality for stack protection.
2524  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2526  // MSVC CRT has a global variable holding security cookie.
2527  M.getOrInsertGlobal("__security_cookie",
2528  Type::getInt8PtrTy(M.getContext()));
2529 
2530  // MSVC CRT has a function to validate security cookie.
2531  FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
2532  "__security_check_cookie", Type::getVoidTy(M.getContext()),
2533  Type::getInt8PtrTy(M.getContext()));
2534  if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
2535  F->setCallingConv(CallingConv::X86_FastCall);
2536  F->addAttribute(1, Attribute::AttrKind::InReg);
2537  }
2538  return;
2539  }
2540 
2541  auto GuardMode = getTargetMachine().Options.StackProtectorGuard;
2542 
2543  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2544  if ((GuardMode == llvm::StackProtectorGuards::TLS ||
2545  GuardMode == llvm::StackProtectorGuards::None)
2546  && hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2547  return;
2549 }
2550 
2552  // MSVC CRT has a global variable holding security cookie.
2553  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2555  return M.getGlobalVariable("__security_cookie");
2556  }
2558 }
2559 
2561  // MSVC CRT has a function to validate security cookie.
2562  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2564  return M.getFunction("__security_check_cookie");
2565  }
2567 }
2568 
2570  if (Subtarget.getTargetTriple().isOSContiki())
2571  return getDefaultSafeStackPointerLocation(IRB, false);
2572 
2573  // Android provides a fixed TLS slot for the SafeStack pointer. See the
2574  // definition of TLS_SLOT_SAFESTACK in
2575  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2576  if (Subtarget.isTargetAndroid()) {
 2577  // %fs:0x48, unless we're using a Kernel code model, in which case it's
 2578  // %gs:0x48; on i386 it is %gs:0x24.
2579  unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2580  return SegmentOffset(IRB, Offset, getAddressSpace());
2581  }
2582 
2583  // Fuchsia is similar.
2584  if (Subtarget.isTargetFuchsia()) {
2585  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2586  return SegmentOffset(IRB, 0x18, getAddressSpace());
2587  }
2588 
2590 }
2591 
2592 //===----------------------------------------------------------------------===//
2593 // Return Value Calling Convention Implementation
2594 //===----------------------------------------------------------------------===//
2595 
2596 bool X86TargetLowering::CanLowerReturn(
2597  CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2598  const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2600  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2601  return CCInfo.CheckReturn(Outs, RetCC_X86);
2602 }
2603 
2604 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2605  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2606  return ScratchRegs;
2607 }
2608 
 2609 /// Lowers mask values (v*i1) to the local register values
2610 /// \returns DAG node after lowering to register type
2611 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2612  const SDLoc &Dl, SelectionDAG &DAG) {
2613  EVT ValVT = ValArg.getValueType();
2614 
2615  if (ValVT == MVT::v1i1)
2616  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2617  DAG.getIntPtrConstant(0, Dl));
2618 
2619  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2620  (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2621  // Two stage lowering might be required
2622  // bitcast: v8i1 -> i8 / v16i1 -> i16
2623  // anyextend: i8 -> i32 / i16 -> i32
2624  EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2625  SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2626  if (ValLoc == MVT::i32)
2627  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2628  return ValToCopy;
2629  }
2630 
2631  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2632  (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2633  // One stage lowering is required
2634  // bitcast: v32i1 -> i32 / v64i1 -> i64
2635  return DAG.getBitcast(ValLoc, ValArg);
2636  }
2637 
2638  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2639 }
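// Illustrative example of the two paths above: returning a v16i1 mask in an
// i32 location first bitcasts v16i1 -> i16 and then ANY_EXTENDs i16 -> i32,
// whereas a v64i1 mask returned in an i64 location is a single bitcast.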
2640 
 2641 /// Breaks a v64i1 value into two registers and adds the new nodes to the DAG
2643  const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
2644  SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
2645  CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2646  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2647  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2648  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2649  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2650  "The value should reside in two registers");
2651 
2652  // Before splitting the value we cast it to i64
2653  Arg = DAG.getBitcast(MVT::i64, Arg);
2654 
2655  // Splitting the value into two i32 types
2656  SDValue Lo, Hi;
2657  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2658  DAG.getConstant(0, Dl, MVT::i32));
2659  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2660  DAG.getConstant(1, Dl, MVT::i32));
2661 
2662  // Attach the two i32 types into corresponding registers
2663  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2664  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2665 }
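// Illustrative example: a v64i1 return value assigned to two GR32 locations
// (e.g. EAX and EDX) is bitcast to i64, split into i32 halves with
// ISD::EXTRACT_ELEMENT, and both halves are appended to RegsToPass.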
2666 
2667 SDValue
2668 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2669  bool isVarArg,
2670  const SmallVectorImpl<ISD::OutputArg> &Outs,
2671  const SmallVectorImpl<SDValue> &OutVals,
2672  const SDLoc &dl, SelectionDAG &DAG) const {
2673  MachineFunction &MF = DAG.getMachineFunction();
2674  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2675 
2676  // In some cases we need to disable registers from the default CSR list.
2677  // For example, when they are used for argument passing.
2678  bool ShouldDisableCalleeSavedRegister =
2679  CallConv == CallingConv::X86_RegCall ||
2680  MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2681 
2682  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2683  report_fatal_error("X86 interrupts may not return any value");
2684 
2685  SmallVector<CCValAssign, 16> RVLocs;
2686  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2687  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2688 
2689  SmallVector<std::pair<Register, SDValue>, 4> RetVals;
2690  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2691  ++I, ++OutsIndex) {
2692  CCValAssign &VA = RVLocs[I];
2693  assert(VA.isRegLoc() && "Can only return in registers!");
2694 
2695  // Add the register to the CalleeSaveDisableRegs list.
2696  if (ShouldDisableCalleeSavedRegister)
2697  MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2698 
2699  SDValue ValToCopy = OutVals[OutsIndex];
2700  EVT ValVT = ValToCopy.getValueType();
2701 
2702  // Promote values to the appropriate types.
2703  if (VA.getLocInfo() == CCValAssign::SExt)
2704  ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2705  else if (VA.getLocInfo() == CCValAssign::ZExt)
2706  ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2707  else if (VA.getLocInfo() == CCValAssign::AExt) {
2708  if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2709  ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2710  else
2711  ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2712  }
2713  else if (VA.getLocInfo() == CCValAssign::BCvt)
2714  ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2715 
2717  "Unexpected FP-extend for return value.");
2718 
2719  // Report an error if we have attempted to return a value via an XMM
2720  // register and SSE was disabled.
2721  if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
2722  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2723  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2724  } else if (!Subtarget.hasSSE2() &&
2725  X86::FR64XRegClass.contains(VA.getLocReg()) &&
2726  ValVT == MVT::f64) {
2727  // When returning a double via an XMM register, report an error if SSE2 is
2728  // not enabled.
2729  errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2730  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2731  }
2732 
2733  // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2734  // the RET instruction and handled by the FP Stackifier.
2735  if (VA.getLocReg() == X86::FP0 ||
2736  VA.getLocReg() == X86::FP1) {
2737  // If this is a copy from an xmm register to ST(0), use an FPExtend to
2738  // change the value to the FP stack register class.
2739  if (isScalarFPTypeInSSEReg(VA.getValVT()))
2740  ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2741  RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2742  // Don't emit a copytoreg.
2743  continue;
2744  }
2745 
2746  // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2747  // which is returned in RAX / RDX.
2748  if (Subtarget.is64Bit()) {
2749  if (ValVT == MVT::x86mmx) {
2750  if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2751  ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2752  ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2753  ValToCopy);
2754  // If we don't have SSE2 available, convert to v4f32 so the generated
2755  // register is legal.
2756  if (!Subtarget.hasSSE2())
2757  ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2758  }
2759  }
2760  }
2761 
2762  if (VA.needsCustom()) {
2763  assert(VA.getValVT() == MVT::v64i1 &&
2764  "Currently the only custom case is when we split v64i1 to 2 regs");
2765 
2766  Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
2767  Subtarget);
2768 
2769  // Add the second register to the CalleeSaveDisableRegs list.
2770  if (ShouldDisableCalleeSavedRegister)
2771  MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2772  } else {
2773  RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2774  }
2775  }
2776 
2777  SDValue Flag;
2778  SmallVector<SDValue, 6> RetOps;
2779  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2780  // Operand #1 = Bytes To Pop
2781  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2782  MVT::i32));
2783 
2784  // Copy the result values into the output registers.
2785  for (auto &RetVal : RetVals) {
2786  if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
2787  RetOps.push_back(RetVal.second);
2788  continue; // Don't emit a copytoreg.
2789  }
2790 
2791  Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
2792  Flag = Chain.getValue(1);
2793  RetOps.push_back(
2794  DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
2795  }
2796 
2797  // Swift calling convention does not require we copy the sret argument
2798  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2799 
2800  // All x86 ABIs require that for returning structs by value we copy
2801  // the sret argument into %rax/%eax (depending on ABI) for the return.
2802  // We saved the argument into a virtual register in the entry block,
2803  // so now we copy the value out and into %rax/%eax.
2804  //
2805  // Checking Function.hasStructRetAttr() here is insufficient because the IR
2806  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2807  // false, then an sret argument may be implicitly inserted in the SelDAG. In
2808  // either case FuncInfo->setSRetReturnReg() will have been called.
2809  if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
2810  // When we have both sret and another return value, we should use the
2811  // original Chain stored in RetOps[0], instead of the current Chain updated
2812  // in the above loop. If we only have sret, RetOps[0] equals Chain.
2813 
2814  // For the case of sret and another return value, we have
2815  // Chain_0 at the function entry
2816  // Chain_1 = getCopyToReg(Chain_0) in the above loop
2817  // If we use Chain_1 in getCopyFromReg, we will have
2818  // Val = getCopyFromReg(Chain_1)
2819  // Chain_2 = getCopyToReg(Chain_1, Val) from below
2820 
2821  // getCopyToReg(Chain_0) will be glued together with
2822  // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2823  // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2824  // Data dependency from Unit B to Unit A due to usage of Val in
2825  // getCopyToReg(Chain_1, Val)
2826  // Chain dependency from Unit A to Unit B
2827 
2828  // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
2829  SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2830  getPointerTy(MF.getDataLayout()));
2831 
2832  Register RetValReg
2833  = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2834  X86::RAX : X86::EAX;
2835  Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2836  Flag = Chain.getValue(1);
2837 
2838  // RAX/EAX now acts like a return value.
2839  RetOps.push_back(
2840  DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2841 
2842  // Add the returned register to the CalleeSaveDisableRegs list.
2843  if (ShouldDisableCalleeSavedRegister)
2844  MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2845  }
2846 
2847  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2848  const MCPhysReg *I =
2849  TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2850  if (I) {
2851  for (; *I; ++I) {
2852  if (X86::GR64RegClass.contains(*I))
2853  RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2854  else
2855  llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2856  }
2857  }
2858 
2859  RetOps[0] = Chain; // Update chain.
2860 
2861  // Add the flag if we have it.
2862  if (Flag.getNode())
2863  RetOps.push_back(Flag);
2864 
2865  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2866  if (CallConv == CallingConv::X86_INTR)
2867  opcode = X86ISD::IRET;
2868  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2869 }
2870 
2871 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2872  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2873  return false;
2874 
2875  SDValue TCChain = Chain;
2876  SDNode *Copy = *N->use_begin();
2877  if (Copy->getOpcode() == ISD::CopyToReg) {
2878  // If the copy has a glue operand, we conservatively assume it isn't safe to
2879  // perform a tail call.
2880  if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2881  return false;
2882  TCChain = Copy->getOperand(0);
2883  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2884  return false;
2885 
2886  bool HasRet = false;
2887  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2888  UI != UE; ++UI) {
2889  if (UI->getOpcode() != X86ISD::RET_FLAG)
2890  return false;
2891  // If we are returning more than one value, we can definitely not make
2892  // a tail call; see PR19530.
2893  if (UI->getNumOperands() > 4)
2894  return false;
2895  if (UI->getNumOperands() == 4 &&
2896  UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2897  return false;
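 // (A single glued return value gives the RET_FLAG node at most four operands:
 // chain, bytes-to-pop, one register, and the glue.)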
2898  HasRet = true;
2899  }
2900 
2901  if (!HasRet)
2902  return false;
2903 
2904  Chain = TCChain;
2905  return true;
2906 }
2907 
2908 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2909  ISD::NodeType ExtendKind) const {
2910  MVT ReturnMVT = MVT::i32;
2911 
2912  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2913  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2914  // The ABI does not require i1, i8 or i16 to be extended.
2915  //
2916  // On Darwin, there is code in the wild relying on Clang's old behaviour of
2917  // always extending i8/i16 return values, so keep doing that for now.
2918  // (PR26665).
2919  ReturnMVT = MVT::i8;
2920  }
2921 
2922  EVT MinVT = getRegisterType(Context, ReturnMVT);
2923  return VT.bitsLT(MinVT) ? MinVT : VT;
2924 }
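// Illustrative: with the logic above, an i1 return is only widened to i8,
// while on Darwin an i8 or i16 return is still widened to i32.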
2925 
2926 /// Reads two 32 bit registers and creates a 64 bit mask value.
2927 /// \param VA The current 32 bit location that needs to be assigned.
2928 /// \param NextVA The next 32 bit location that needs to be assigned.
2929 /// \param Root The parent DAG node.
2930 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
2931 /// for glue purposes. If the DAG is already using a
2932 /// physical register instead of a virtual one, the new
2933 /// SDValue should be glued to the InFlag SDValue.
2934 /// \return a new SDValue that is 64 bits wide.
2935 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2936  SDValue &Root, SelectionDAG &DAG,
2937  const SDLoc &Dl, const X86Subtarget &Subtarget,
2938  SDValue *InFlag = nullptr) {
2939  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2940  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2941  assert(VA.getValVT() == MVT::v64i1 &&
2942  "Expecting first location of 64 bit width type");
2943  assert(NextVA.getValVT() == VA.getValVT() &&
2944  "The locations should have the same type");
2945  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2946  "The values should reside in two registers");
2947 
2948  SDValue Lo, Hi;
2949  SDValue ArgValueLo, ArgValueHi;
2950 
2951  MachineFunction &MF = DAG.getMachineFunction();
2952  const TargetRegisterClass *RC = &X86::GR32RegClass;
2953 
2954  // Read a 32 bit value from the registers.
2955  if (nullptr == InFlag) {
2956  // When no physical register is present,
2957  // create an intermediate virtual register.
2958  Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
2959  ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2960  Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2961  ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2962  } else {
2963  // When a physical register is available read the value from it and glue
2964  // the reads together.
2965  ArgValueLo =
2966  DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2967  *InFlag = ArgValueLo.getValue(2);
2968  ArgValueHi =
2969  DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2970  *InFlag = ArgValueHi.getValue(2);
2971  }
2972 
2973  // Convert the i32 type into v32i1 type.
2974  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2975 
2976  // Convert the i32 type into v32i1 type.
2977  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2978 
2979  // Concatenate the two values together.
2980  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2981 }
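// Illustrative: each 32 bit read is bitcast to v32i1 and the two halves are
// concatenated into the final v64i1 mask; when InFlag is provided, the reads
// are threaded through glue so they stay with the producing node.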
2982 
2983 /// Lowers a register value of various sizes (8/16/32/64 bit)
2984 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2985 /// \returns a DAG node containing the operand after lowering to the mask type.
2986 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2987  const EVT &ValLoc, const SDLoc &Dl,
2988  SelectionDAG &DAG) {
2989  SDValue ValReturned = ValArg;
2990 
2991  if (ValVT == MVT::v1i1)
2992  return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2993 
2994  if (ValVT == MVT::v64i1) {
2995  // On 32 bit machines this case is handled by getv64i1Argument.
2996  assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2997  // On 64 bit machines there is no need to truncate the value, only to bitcast it.
2998  } else {
2999  MVT maskLen;
3000  switch (ValVT.getSimpleVT().SimpleTy) {
3001  case MVT::v8i1:
3002  maskLen = MVT::i8;
3003  break;
3004  case MVT::v16i1:
3005  maskLen = MVT::i16;
3006  break;
3007  case MVT::v32i1:
3008  maskLen = MVT::i32;
3009  break;
3010  default:
3011  llvm_unreachable("Expecting a vector of i1 types");
3012  }
3013 
3014  ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3015  }
3016  return DAG.getBitcast(ValVT, ValReturned);
3017 }
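// For example, a v16i1 value returned in an i32 location is truncated to i16
// and then bitcast back to v16i1.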
3018 
3019 /// Lower the result values of a call into the
3020 /// appropriate copies out of appropriate physical registers.
3021 ///
3022 SDValue X86TargetLowering::LowerCallResult(
3023  SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3024  const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3025  SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3026  uint32_t *RegMask) const {
3027 
3028  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3029  // Assign locations to each value returned by this call.
3030  SmallVector<CCValAssign, 16> RVLocs;
3031  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3032  *DAG.getContext());
3033  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3034 
3035  // Copy all of the result registers out of their specified physreg.
3036  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3037  ++I, ++InsIndex) {
3038  CCValAssign &VA = RVLocs[I];
3039  EVT CopyVT = VA.getLocVT();
3040 
3041  // In some calling conventions we need to remove the used registers
3042  // from the register mask.
3043  if (RegMask) {
3044  for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3045  SubRegs.isValid(); ++SubRegs)
3046  RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3047  }
3048 
3049  // Report an error if there was an attempt to return FP values via XMM
3050  // registers.
3051  if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3052  errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3053  if (VA.getLocReg() == X86::XMM1)
3054  VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3055  else
3056  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3057  } else if (!Subtarget.hasSSE2() &&
3058  X86::FR64XRegClass.contains(VA.getLocReg()) &&
3059  CopyVT == MVT::f64) {
3060  errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3061  if (VA.getLocReg() == X86::XMM1)
3062  VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3063  else
3064  VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3065  }
3066 
3067  // If we prefer to use the value in xmm registers, copy it out as f80 and
3068  // use a truncate to move it from fp stack reg to xmm reg.
3069  bool RoundAfterCopy = false;
3070  if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3071  isScalarFPTypeInSSEReg(VA.getValVT())) {
3072  if (!Subtarget.hasX87())
3073  report_fatal_error("X87 register return with X87 disabled");
3074  CopyVT = MVT::f80;
3075  RoundAfterCopy = (CopyVT != VA.getLocVT());
3076  }
3077 
3078  SDValue Val;
3079  if (VA.needsCustom()) {
3080  assert(VA.getValVT() == MVT::v64i1 &&
3081  "Currently the only custom case is when we split v64i1 to 2 regs");
3082  Val =
3083  getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3084  } else {
3085  Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3086  .getValue(1);
3087  Val = Chain.getValue(0);
3088  InFlag = Chain.getValue(2);
3089  }
3090 
3091  if (RoundAfterCopy)
3092  Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3093  // This truncation won't change the value.
3094  DAG.getIntPtrConstant(1, dl));
3095 
3096  if (VA.isExtInLoc()) {
3097  if (VA.getValVT().isVector() &&
3098  VA.getValVT().getScalarType() == MVT::i1 &&
3099  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3100  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3101  // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3102  Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3103  } else
3104  Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3105  }
3106 
3107  if (VA.getLocInfo() == CCValAssign::BCvt)
3108  Val = DAG.getBitcast(VA.getValVT(), Val);
3109 
3110  InVals.push_back(Val);
3111  }
3112 
3113  return Chain;
3114 }
3115 
3116 //===----------------------------------------------------------------------===//
3117 // C & StdCall & Fast Calling Convention implementation
3118 //===----------------------------------------------------------------------===//
3119 // The StdCall calling convention is the standard for many Windows API
3120 // routines. It differs from the C calling convention just a little: the
3121 // callee cleans up the stack rather than the caller, and symbols are also
3122 // decorated in some fancy way :) It doesn't support any vector arguments.
3123 // For info on the fast calling convention see the Fast Calling Convention
3124 // (tail call) implementation in LowerX86_32FastCCCallTo.
3125 
3126 /// CallIsStructReturn - Determines whether a call uses struct return
3127 /// semantics.
3128 enum StructReturnType {
3129  NotStructReturn,
3130  RegStructReturn,
3131  StackStructReturn
3132 };
3133 static StructReturnType
3134 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
3135  if (Outs.empty())
3136  return NotStructReturn;
3137 
3138  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
3139  if (!Flags.isSRet())
3140  return NotStructReturn;
3141  if (Flags.isInReg() || IsMCU)
3142  return RegStructReturn;
3143  return StackStructReturn;
3144 }
3145 
3146 /// Determines whether a function uses struct return semantics.
3147 static StructReturnType
3148 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
3149  if (Ins.empty())
3150  return NotStructReturn;
3151 
3152  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
3153  if (!Flags.isSRet())
3154  return NotStructReturn;
3155  if (Flags.isInReg() || IsMCU)
3156  return RegStructReturn;
3157  return StackStructReturn;
3158 }
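// Note: an sret argument marked 'inreg' (or any sret argument on MCU targets)
// is classified as RegStructReturn; otherwise it is StackStructReturn.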
3159 
3160 /// Make a copy of an aggregate at address specified by "Src" to address
3161 /// "Dst" with size and alignment information specified by the specific
3162 /// parameter attribute. The copy will be passed as a byval function parameter.
3163 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3164  SDValue Chain, ISD::ArgFlagsTy Flags,
3165  SelectionDAG &DAG, const SDLoc &dl) {
3166  SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3167 
3168  return DAG.getMemcpy(
3169  Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3170  /*isVolatile*/ false, /*AlwaysInline=*/true,
3171  /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3172 }
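// Note: the byval copy is emitted as a non-volatile, always-inline memcpy, so
// it is expanded inline rather than lowered to a libcall.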
3173 
3174 /// Return true if the calling convention is one that we can guarantee TCO for.
3175 static bool canGuaranteeTCO(CallingConv::ID CC) {
3176  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3177  CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3178  CC == CallingConv::HHVM || CC == CallingConv::Tail);
3179 }
3180 
3181 /// Return true if we might ever do TCO for calls with this calling convention.
3182 static bool mayTailCallThisCC(CallingConv::ID CC) {
3183  switch (CC) {
3184  // C calling conventions:
3185  case CallingConv::C:
3186  case CallingConv::Win64:
3187  case CallingConv::X86_64_SysV:
3188  // Callee pop conventions:
3189  case CallingConv::X86_ThisCall:
3190  case CallingConv::X86_StdCall:
3191  case CallingConv::X86_VectorCall:
3192  case CallingConv::X86_FastCall:
3193  // Swift:
3194  case CallingConv::Swift:
3195  return true;
3196  default:
3197  return canGuaranteeTCO(CC);
3198  }
3199 }
3200 
3201 /// Return true if the function is being made into a tailcall target by
3202 /// changing its ABI.
3203 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3204  return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || CC == CallingConv::Tail;
3205 }
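// Note: CallingConv::Tail requests the tail-call ABI even when
// GuaranteedTailCallOpt is off; the conventions accepted by canGuaranteeTCO
// only get it when that option is enabled.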
3206 
3207 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3208  if (!CI->isTailCall())
3209  return false;
3210 
3211  CallingConv::ID CalleeCC = CI->getCallingConv();
3212  if (!mayTailCallThisCC(CalleeCC))
3213  return false;
3214 
3215  return true;
3216 }
3217 
3218 SDValue
3219 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3220  const SmallVectorImpl<ISD::InputArg> &Ins,
3221  const SDLoc &dl, SelectionDAG &DAG,
3222  const CCValAssign &VA,
3223  MachineFrameInfo &MFI, unsigned i) const {
3224  // Create the nodes corresponding to a load from this parameter slot.
3225  ISD::ArgFlagsTy Flags = Ins[i].Flags;
3226  bool AlwaysUseMutable = shouldGuaranteeTCO(
3227  CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3228  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3229  EVT ValVT;
3230  MVT PtrVT = getPointerTy(DAG.getDataLayout());
3231 
3232  // If value is passed by pointer