LLVM 14.0.0git
X86InstrInfo.cpp
1 //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the X86 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "X86InstrInfo.h"
14 #include "X86.h"
15 #include "X86InstrBuilder.h"
16 #include "X86InstrFoldTables.h"
17 #include "X86MachineFunctionInfo.h"
18 #include "X86Subtarget.h"
19 #include "X86TargetMachine.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/Sequence.h"
30 #include "llvm/CodeGen/StackMaps.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/Function.h"
34 #include "llvm/MC/MCAsmInfo.h"
35 #include "llvm/MC/MCExpr.h"
36 #include "llvm/MC/MCInst.h"
38 #include "llvm/Support/Debug.h"
42 
43 using namespace llvm;
44 
45 #define DEBUG_TYPE "x86-instr-info"
46 
47 #define GET_INSTRINFO_CTOR_DTOR
48 #include "X86GenInstrInfo.inc"
49 
50 static cl::opt<bool>
51  NoFusing("disable-spill-fusing",
52  cl::desc("Disable fusing of spill code into instructions"),
53  cl::Hidden);
54 static cl::opt<bool>
55 PrintFailedFusing("print-failed-fuse-candidates",
56  cl::desc("Print instructions that the allocator wants to"
57  " fuse, but the X86 backend currently can't"),
58  cl::Hidden);
59 static cl::opt<bool>
60 ReMatPICStubLoad("remat-pic-stub-load",
61  cl::desc("Re-materialize load from stub in PIC mode"),
62  cl::init(false), cl::Hidden);
63 static cl::opt<unsigned>
64 PartialRegUpdateClearance("partial-reg-update-clearance",
65  cl::desc("Clearance between two register writes "
66  "for inserting XOR to avoid partial "
67  "register update"),
68  cl::init(64), cl::Hidden);
69 static cl::opt<unsigned>
70 UndefRegClearance("undef-reg-clearance",
71  cl::desc("How many idle instructions we would like before "
72  "certain undef register reads"),
73  cl::init(128), cl::Hidden);
74 
75 
76 // Pin the vtable to this file.
77 void X86InstrInfo::anchor() {}
78 
79 X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
80  : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
81  : X86::ADJCALLSTACKDOWN32),
82  (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
83  : X86::ADJCALLSTACKUP32),
84  X86::CATCHRET,
85  (STI.is64Bit() ? X86::RETQ : X86::RETL)),
86  Subtarget(STI), RI(STI.getTargetTriple()) {
87 }
88 
89 bool
90 X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
91  Register &SrcReg, Register &DstReg,
92  unsigned &SubIdx) const {
93  switch (MI.getOpcode()) {
94  default: break;
95  case X86::MOVSX16rr8:
96  case X86::MOVZX16rr8:
97  case X86::MOVSX32rr8:
98  case X86::MOVZX32rr8:
99  case X86::MOVSX64rr8:
100  if (!Subtarget.is64Bit())
101  // It's not always legal to reference the low 8 bits of the larger
102  // register in 32-bit mode.
103  return false;
104  LLVM_FALLTHROUGH;
105  case X86::MOVSX32rr16:
106  case X86::MOVZX32rr16:
107  case X86::MOVSX64rr16:
108  case X86::MOVSX64rr32: {
109  if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
110  // Be conservative.
111  return false;
112  SrcReg = MI.getOperand(1).getReg();
113  DstReg = MI.getOperand(0).getReg();
114  switch (MI.getOpcode()) {
115  default: llvm_unreachable("Unreachable!");
116  case X86::MOVSX16rr8:
117  case X86::MOVZX16rr8:
118  case X86::MOVSX32rr8:
119  case X86::MOVZX32rr8:
120  case X86::MOVSX64rr8:
121  SubIdx = X86::sub_8bit;
122  break;
123  case X86::MOVSX32rr16:
124  case X86::MOVZX32rr16:
125  case X86::MOVSX64rr16:
126  SubIdx = X86::sub_16bit;
127  break;
128  case X86::MOVSX64rr32:
129  SubIdx = X86::sub_32bit;
130  break;
131  }
132  return true;
133  }
134  }
135  return false;
136 }
137 
138 bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
139  switch (MI.getOpcode()) {
140  default:
141  // By default, assume that the instruction is not data invariant.
142  return false;
143 
144  // Some target-independent operations that trivially lower to data-invariant
145  // instructions.
146  case TargetOpcode::COPY:
147  case TargetOpcode::INSERT_SUBREG:
148  case TargetOpcode::SUBREG_TO_REG:
149  return true;
150 
151  // On x86 it is believed that imul is constant time w.r.t. its inputs.
152  // However, it sets flags and is perhaps the most surprisingly constant-time
153  // operation, so we call it out here separately.
154  case X86::IMUL16rr:
155  case X86::IMUL16rri8:
156  case X86::IMUL16rri:
157  case X86::IMUL32rr:
158  case X86::IMUL32rri8:
159  case X86::IMUL32rri:
160  case X86::IMUL64rr:
161  case X86::IMUL64rri32:
162  case X86::IMUL64rri8:
163 
164  // Bit scanning and counting instructions that are somewhat surprisingly
165  // constant time as they scan across bits and do other fairly complex
166  // operations like popcnt, but are believed to be constant time on x86.
167  // However, these set flags.
168  case X86::BSF16rr:
169  case X86::BSF32rr:
170  case X86::BSF64rr:
171  case X86::BSR16rr:
172  case X86::BSR32rr:
173  case X86::BSR64rr:
174  case X86::LZCNT16rr:
175  case X86::LZCNT32rr:
176  case X86::LZCNT64rr:
177  case X86::POPCNT16rr:
178  case X86::POPCNT32rr:
179  case X86::POPCNT64rr:
180  case X86::TZCNT16rr:
181  case X86::TZCNT32rr:
182  case X86::TZCNT64rr:
183 
184  // Bit manipulation instructions are effectively combinations of basic
185  // arithmetic ops, and should still execute in constant time. These also
186  // set flags.
187  case X86::BLCFILL32rr:
188  case X86::BLCFILL64rr:
189  case X86::BLCI32rr:
190  case X86::BLCI64rr:
191  case X86::BLCIC32rr:
192  case X86::BLCIC64rr:
193  case X86::BLCMSK32rr:
194  case X86::BLCMSK64rr:
195  case X86::BLCS32rr:
196  case X86::BLCS64rr:
197  case X86::BLSFILL32rr:
198  case X86::BLSFILL64rr:
199  case X86::BLSI32rr:
200  case X86::BLSI64rr:
201  case X86::BLSIC32rr:
202  case X86::BLSIC64rr:
203  case X86::BLSMSK32rr:
204  case X86::BLSMSK64rr:
205  case X86::BLSR32rr:
206  case X86::BLSR64rr:
207  case X86::TZMSK32rr:
208  case X86::TZMSK64rr:
209 
210  // Bit extracting and clearing instructions should execute in constant time,
211  // and set flags.
212  case X86::BEXTR32rr:
213  case X86::BEXTR64rr:
214  case X86::BEXTRI32ri:
215  case X86::BEXTRI64ri:
216  case X86::BZHI32rr:
217  case X86::BZHI64rr:
218 
219  // Shift and rotate.
220  case X86::ROL8r1:
221  case X86::ROL16r1:
222  case X86::ROL32r1:
223  case X86::ROL64r1:
224  case X86::ROL8rCL:
225  case X86::ROL16rCL:
226  case X86::ROL32rCL:
227  case X86::ROL64rCL:
228  case X86::ROL8ri:
229  case X86::ROL16ri:
230  case X86::ROL32ri:
231  case X86::ROL64ri:
232  case X86::ROR8r1:
233  case X86::ROR16r1:
234  case X86::ROR32r1:
235  case X86::ROR64r1:
236  case X86::ROR8rCL:
237  case X86::ROR16rCL:
238  case X86::ROR32rCL:
239  case X86::ROR64rCL:
240  case X86::ROR8ri:
241  case X86::ROR16ri:
242  case X86::ROR32ri:
243  case X86::ROR64ri:
244  case X86::SAR8r1:
245  case X86::SAR16r1:
246  case X86::SAR32r1:
247  case X86::SAR64r1:
248  case X86::SAR8rCL:
249  case X86::SAR16rCL:
250  case X86::SAR32rCL:
251  case X86::SAR64rCL:
252  case X86::SAR8ri:
253  case X86::SAR16ri:
254  case X86::SAR32ri:
255  case X86::SAR64ri:
256  case X86::SHL8r1:
257  case X86::SHL16r1:
258  case X86::SHL32r1:
259  case X86::SHL64r1:
260  case X86::SHL8rCL:
261  case X86::SHL16rCL:
262  case X86::SHL32rCL:
263  case X86::SHL64rCL:
264  case X86::SHL8ri:
265  case X86::SHL16ri:
266  case X86::SHL32ri:
267  case X86::SHL64ri:
268  case X86::SHR8r1:
269  case X86::SHR16r1:
270  case X86::SHR32r1:
271  case X86::SHR64r1:
272  case X86::SHR8rCL:
273  case X86::SHR16rCL:
274  case X86::SHR32rCL:
275  case X86::SHR64rCL:
276  case X86::SHR8ri:
277  case X86::SHR16ri:
278  case X86::SHR32ri:
279  case X86::SHR64ri:
280  case X86::SHLD16rrCL:
281  case X86::SHLD32rrCL:
282  case X86::SHLD64rrCL:
283  case X86::SHLD16rri8:
284  case X86::SHLD32rri8:
285  case X86::SHLD64rri8:
286  case X86::SHRD16rrCL:
287  case X86::SHRD32rrCL:
288  case X86::SHRD64rrCL:
289  case X86::SHRD16rri8:
290  case X86::SHRD32rri8:
291  case X86::SHRD64rri8:
292 
293  // Basic arithmetic is constant time on the input but does set flags.
294  case X86::ADC8rr:
295  case X86::ADC8ri:
296  case X86::ADC16rr:
297  case X86::ADC16ri:
298  case X86::ADC16ri8:
299  case X86::ADC32rr:
300  case X86::ADC32ri:
301  case X86::ADC32ri8:
302  case X86::ADC64rr:
303  case X86::ADC64ri8:
304  case X86::ADC64ri32:
305  case X86::ADD8rr:
306  case X86::ADD8ri:
307  case X86::ADD16rr:
308  case X86::ADD16ri:
309  case X86::ADD16ri8:
310  case X86::ADD32rr:
311  case X86::ADD32ri:
312  case X86::ADD32ri8:
313  case X86::ADD64rr:
314  case X86::ADD64ri8:
315  case X86::ADD64ri32:
316  case X86::AND8rr:
317  case X86::AND8ri:
318  case X86::AND16rr:
319  case X86::AND16ri:
320  case X86::AND16ri8:
321  case X86::AND32rr:
322  case X86::AND32ri:
323  case X86::AND32ri8:
324  case X86::AND64rr:
325  case X86::AND64ri8:
326  case X86::AND64ri32:
327  case X86::OR8rr:
328  case X86::OR8ri:
329  case X86::OR16rr:
330  case X86::OR16ri:
331  case X86::OR16ri8:
332  case X86::OR32rr:
333  case X86::OR32ri:
334  case X86::OR32ri8:
335  case X86::OR64rr:
336  case X86::OR64ri8:
337  case X86::OR64ri32:
338  case X86::SBB8rr:
339  case X86::SBB8ri:
340  case X86::SBB16rr:
341  case X86::SBB16ri:
342  case X86::SBB16ri8:
343  case X86::SBB32rr:
344  case X86::SBB32ri:
345  case X86::SBB32ri8:
346  case X86::SBB64rr:
347  case X86::SBB64ri8:
348  case X86::SBB64ri32:
349  case X86::SUB8rr:
350  case X86::SUB8ri:
351  case X86::SUB16rr:
352  case X86::SUB16ri:
353  case X86::SUB16ri8:
354  case X86::SUB32rr:
355  case X86::SUB32ri:
356  case X86::SUB32ri8:
357  case X86::SUB64rr:
358  case X86::SUB64ri8:
359  case X86::SUB64ri32:
360  case X86::XOR8rr:
361  case X86::XOR8ri:
362  case X86::XOR16rr:
363  case X86::XOR16ri:
364  case X86::XOR16ri8:
365  case X86::XOR32rr:
366  case X86::XOR32ri:
367  case X86::XOR32ri8:
368  case X86::XOR64rr:
369  case X86::XOR64ri8:
370  case X86::XOR64ri32:
371  // Arithmetic with just 32-bit and 64-bit variants and no immediates.
372  case X86::ADCX32rr:
373  case X86::ADCX64rr:
374  case X86::ADOX32rr:
375  case X86::ADOX64rr:
376  case X86::ANDN32rr:
377  case X86::ANDN64rr:
378  // Unary arithmetic operations.
379  case X86::DEC8r:
380  case X86::DEC16r:
381  case X86::DEC32r:
382  case X86::DEC64r:
383  case X86::INC8r:
384  case X86::INC16r:
385  case X86::INC32r:
386  case X86::INC64r:
387  case X86::NEG8r:
388  case X86::NEG16r:
389  case X86::NEG32r:
390  case X86::NEG64r:
391 
392  // Unlike other arithmetic, NOT doesn't set EFLAGS.
393  case X86::NOT8r:
394  case X86::NOT16r:
395  case X86::NOT32r:
396  case X86::NOT64r:
397 
398  // Various move instructions used to zero or sign extend things. Note that we
399  // intentionally don't support the _NOREX variants as we can't handle that
400  // register constraint anyways.
401  case X86::MOVSX16rr8:
402  case X86::MOVSX32rr8:
403  case X86::MOVSX32rr16:
404  case X86::MOVSX64rr8:
405  case X86::MOVSX64rr16:
406  case X86::MOVSX64rr32:
407  case X86::MOVZX16rr8:
408  case X86::MOVZX32rr8:
409  case X86::MOVZX32rr16:
410  case X86::MOVZX64rr8:
411  case X86::MOVZX64rr16:
412  case X86::MOV32rr:
413 
414  // Arithmetic instructions that are both constant time and don't set flags.
415  case X86::RORX32ri:
416  case X86::RORX64ri:
417  case X86::SARX32rr:
418  case X86::SARX64rr:
419  case X86::SHLX32rr:
420  case X86::SHLX64rr:
421  case X86::SHRX32rr:
422  case X86::SHRX64rr:
423 
424  // LEA doesn't actually access memory, and its arithmetic is constant time.
425  case X86::LEA16r:
426  case X86::LEA32r:
427  case X86::LEA64_32r:
428  case X86::LEA64r:
429  return true;
430  }
431 }
432 
433 bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
434  switch (MI.getOpcode()) {
435  default:
436  // By default, assume that the load will immediately leak.
437  return false;
438 
439  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
440  // However, they set flags and are perhaps the most surprisingly constant
441  // time operations so we call them out here separately.
442  case X86::IMUL16rm:
443  case X86::IMUL16rmi8:
444  case X86::IMUL16rmi:
445  case X86::IMUL32rm:
446  case X86::IMUL32rmi8:
447  case X86::IMUL32rmi:
448  case X86::IMUL64rm:
449  case X86::IMUL64rmi32:
450  case X86::IMUL64rmi8:
451 
452  // Bit scanning and counting instructions that are somewhat surprisingly
453  // constant time as they scan across bits and do other fairly complex
454  // operations like popcnt, but are believed to be constant time on x86.
455  // However, these set flags.
456  case X86::BSF16rm:
457  case X86::BSF32rm:
458  case X86::BSF64rm:
459  case X86::BSR16rm:
460  case X86::BSR32rm:
461  case X86::BSR64rm:
462  case X86::LZCNT16rm:
463  case X86::LZCNT32rm:
464  case X86::LZCNT64rm:
465  case X86::POPCNT16rm:
466  case X86::POPCNT32rm:
467  case X86::POPCNT64rm:
468  case X86::TZCNT16rm:
469  case X86::TZCNT32rm:
470  case X86::TZCNT64rm:
471 
472  // Bit manipulation instructions are effectively combinations of basic
473  // arithmetic ops, and should still execute in constant time. These also
474  // set flags.
475  case X86::BLCFILL32rm:
476  case X86::BLCFILL64rm:
477  case X86::BLCI32rm:
478  case X86::BLCI64rm:
479  case X86::BLCIC32rm:
480  case X86::BLCIC64rm:
481  case X86::BLCMSK32rm:
482  case X86::BLCMSK64rm:
483  case X86::BLCS32rm:
484  case X86::BLCS64rm:
485  case X86::BLSFILL32rm:
486  case X86::BLSFILL64rm:
487  case X86::BLSI32rm:
488  case X86::BLSI64rm:
489  case X86::BLSIC32rm:
490  case X86::BLSIC64rm:
491  case X86::BLSMSK32rm:
492  case X86::BLSMSK64rm:
493  case X86::BLSR32rm:
494  case X86::BLSR64rm:
495  case X86::TZMSK32rm:
496  case X86::TZMSK64rm:
497 
498  // Bit extracting and clearing instructions should execute in constant time,
499  // and set flags.
500  case X86::BEXTR32rm:
501  case X86::BEXTR64rm:
502  case X86::BEXTRI32mi:
503  case X86::BEXTRI64mi:
504  case X86::BZHI32rm:
505  case X86::BZHI64rm:
506 
507  // Basic arithmetic is constant time on the input but does set flags.
508  case X86::ADC8rm:
509  case X86::ADC16rm:
510  case X86::ADC32rm:
511  case X86::ADC64rm:
512  case X86::ADCX32rm:
513  case X86::ADCX64rm:
514  case X86::ADD8rm:
515  case X86::ADD16rm:
516  case X86::ADD32rm:
517  case X86::ADD64rm:
518  case X86::ADOX32rm:
519  case X86::ADOX64rm:
520  case X86::AND8rm:
521  case X86::AND16rm:
522  case X86::AND32rm:
523  case X86::AND64rm:
524  case X86::ANDN32rm:
525  case X86::ANDN64rm:
526  case X86::OR8rm:
527  case X86::OR16rm:
528  case X86::OR32rm:
529  case X86::OR64rm:
530  case X86::SBB8rm:
531  case X86::SBB16rm:
532  case X86::SBB32rm:
533  case X86::SBB64rm:
534  case X86::SUB8rm:
535  case X86::SUB16rm:
536  case X86::SUB32rm:
537  case X86::SUB64rm:
538  case X86::XOR8rm:
539  case X86::XOR16rm:
540  case X86::XOR32rm:
541  case X86::XOR64rm:
542 
543  // Integer multiply w/o affecting flags is still believed to be constant
544  // time on x86. Called out separately as this is among the most surprising
545  // instructions to exhibit that behavior.
546  case X86::MULX32rm:
547  case X86::MULX64rm:
548 
549  // Arithmetic instructions that are both constant time and don't set flags.
550  case X86::RORX32mi:
551  case X86::RORX64mi:
552  case X86::SARX32rm:
553  case X86::SARX64rm:
554  case X86::SHLX32rm:
555  case X86::SHLX64rm:
556  case X86::SHRX32rm:
557  case X86::SHRX64rm:
558 
559  // Conversions are believed to be constant time and don't set flags.
560  case X86::CVTTSD2SI64rm:
561  case X86::VCVTTSD2SI64rm:
562  case X86::VCVTTSD2SI64Zrm:
563  case X86::CVTTSD2SIrm:
564  case X86::VCVTTSD2SIrm:
565  case X86::VCVTTSD2SIZrm:
566  case X86::CVTTSS2SI64rm:
567  case X86::VCVTTSS2SI64rm:
568  case X86::VCVTTSS2SI64Zrm:
569  case X86::CVTTSS2SIrm:
570  case X86::VCVTTSS2SIrm:
571  case X86::VCVTTSS2SIZrm:
572  case X86::CVTSI2SDrm:
573  case X86::VCVTSI2SDrm:
574  case X86::VCVTSI2SDZrm:
575  case X86::CVTSI2SSrm:
576  case X86::VCVTSI2SSrm:
577  case X86::VCVTSI2SSZrm:
578  case X86::CVTSI642SDrm:
579  case X86::VCVTSI642SDrm:
580  case X86::VCVTSI642SDZrm:
581  case X86::CVTSI642SSrm:
582  case X86::VCVTSI642SSrm:
583  case X86::VCVTSI642SSZrm:
584  case X86::CVTSS2SDrm:
585  case X86::VCVTSS2SDrm:
586  case X86::VCVTSS2SDZrm:
587  case X86::CVTSD2SSrm:
588  case X86::VCVTSD2SSrm:
589  case X86::VCVTSD2SSZrm:
590  // AVX512 added unsigned integer conversions.
591  case X86::VCVTTSD2USI64Zrm:
592  case X86::VCVTTSD2USIZrm:
593  case X86::VCVTTSS2USI64Zrm:
594  case X86::VCVTTSS2USIZrm:
595  case X86::VCVTUSI2SDZrm:
596  case X86::VCVTUSI642SDZrm:
597  case X86::VCVTUSI2SSZrm:
598  case X86::VCVTUSI642SSZrm:
599 
600  // Loads to register don't set flags.
601  case X86::MOV8rm:
602  case X86::MOV8rm_NOREX:
603  case X86::MOV16rm:
604  case X86::MOV32rm:
605  case X86::MOV64rm:
606  case X86::MOVSX16rm8:
607  case X86::MOVSX32rm16:
608  case X86::MOVSX32rm8:
609  case X86::MOVSX32rm8_NOREX:
610  case X86::MOVSX64rm16:
611  case X86::MOVSX64rm32:
612  case X86::MOVSX64rm8:
613  case X86::MOVZX16rm8:
614  case X86::MOVZX32rm16:
615  case X86::MOVZX32rm8:
616  case X86::MOVZX32rm8_NOREX:
617  case X86::MOVZX64rm16:
618  case X86::MOVZX64rm8:
619  return true;
620  }
621 }
622 
623 int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
624  const MachineFunction *MF = MI.getParent()->getParent();
625  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
626 
627  if (isFrameInstr(MI)) {
628  int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
629  SPAdj -= getFrameAdjustment(MI);
630  if (!isFrameSetup(MI))
631  SPAdj = -SPAdj;
632  return SPAdj;
633  }
634 
635  // To know whether a call adjusts the stack, we need information
636  // that is bound to the following ADJCALLSTACKUP pseudo.
637  // Look for the next ADJCALLSTACKUP that follows the call.
638  if (MI.isCall()) {
639  const MachineBasicBlock *MBB = MI.getParent();
640  auto I = ++MachineBasicBlock::const_iterator(MI);
641  for (auto E = MBB->end(); I != E; ++I) {
642  if (I->getOpcode() == getCallFrameDestroyOpcode() ||
643  I->isCall())
644  break;
645  }
646 
647  // If we could not find a frame destroy opcode, then it has already
648  // been simplified, so we don't care.
649  if (I->getOpcode() != getCallFrameDestroyOpcode())
650  return 0;
651 
652  return -(I->getOperand(1).getImm());
653  }
654 
655  // Currently handle only PUSHes we can reasonably expect to see
656  // in call sequences
657  switch (MI.getOpcode()) {
658  default:
659  return 0;
660  case X86::PUSH32i8:
661  case X86::PUSH32r:
662  case X86::PUSH32rmm:
663  case X86::PUSH32rmr:
664  case X86::PUSHi32:
665  return 4;
666  case X86::PUSH64i8:
667  case X86::PUSH64r:
668  case X86::PUSH64rmm:
669  case X86::PUSH64rmr:
670  case X86::PUSH64i32:
671  return 8;
672  }
673 }
674 
675 /// Return true and the FrameIndex if the specified
676 /// operand and the following operands form a reference to the stack frame.
677 bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
678  int &FrameIndex) const {
679  if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
680  MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
681  MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
682  MI.getOperand(Op + X86::AddrDisp).isImm() &&
683  MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
684  MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
685  MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
686  FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
687  return true;
688  }
689  return false;
690 }
691 
692 static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
693  switch (Opcode) {
694  default:
695  return false;
696  case X86::MOV8rm:
697  case X86::KMOVBkm:
698  MemBytes = 1;
699  return true;
700  case X86::MOV16rm:
701  case X86::KMOVWkm:
702  case X86::VMOVSHZrm:
703  case X86::VMOVSHZrm_alt:
704  MemBytes = 2;
705  return true;
706  case X86::MOV32rm:
707  case X86::MOVSSrm:
708  case X86::MOVSSrm_alt:
709  case X86::VMOVSSrm:
710  case X86::VMOVSSrm_alt:
711  case X86::VMOVSSZrm:
712  case X86::VMOVSSZrm_alt:
713  case X86::KMOVDkm:
714  MemBytes = 4;
715  return true;
716  case X86::MOV64rm:
717  case X86::LD_Fp64m:
718  case X86::MOVSDrm:
719  case X86::MOVSDrm_alt:
720  case X86::VMOVSDrm:
721  case X86::VMOVSDrm_alt:
722  case X86::VMOVSDZrm:
723  case X86::VMOVSDZrm_alt:
724  case X86::MMX_MOVD64rm:
725  case X86::MMX_MOVQ64rm:
726  case X86::KMOVQkm:
727  MemBytes = 8;
728  return true;
729  case X86::MOVAPSrm:
730  case X86::MOVUPSrm:
731  case X86::MOVAPDrm:
732  case X86::MOVUPDrm:
733  case X86::MOVDQArm:
734  case X86::MOVDQUrm:
735  case X86::VMOVAPSrm:
736  case X86::VMOVUPSrm:
737  case X86::VMOVAPDrm:
738  case X86::VMOVUPDrm:
739  case X86::VMOVDQArm:
740  case X86::VMOVDQUrm:
741  case X86::VMOVAPSZ128rm:
742  case X86::VMOVUPSZ128rm:
743  case X86::VMOVAPSZ128rm_NOVLX:
744  case X86::VMOVUPSZ128rm_NOVLX:
745  case X86::VMOVAPDZ128rm:
746  case X86::VMOVUPDZ128rm:
747  case X86::VMOVDQU8Z128rm:
748  case X86::VMOVDQU16Z128rm:
749  case X86::VMOVDQA32Z128rm:
750  case X86::VMOVDQU32Z128rm:
751  case X86::VMOVDQA64Z128rm:
752  case X86::VMOVDQU64Z128rm:
753  MemBytes = 16;
754  return true;
755  case X86::VMOVAPSYrm:
756  case X86::VMOVUPSYrm:
757  case X86::VMOVAPDYrm:
758  case X86::VMOVUPDYrm:
759  case X86::VMOVDQAYrm:
760  case X86::VMOVDQUYrm:
761  case X86::VMOVAPSZ256rm:
762  case X86::VMOVUPSZ256rm:
763  case X86::VMOVAPSZ256rm_NOVLX:
764  case X86::VMOVUPSZ256rm_NOVLX:
765  case X86::VMOVAPDZ256rm:
766  case X86::VMOVUPDZ256rm:
767  case X86::VMOVDQU8Z256rm:
768  case X86::VMOVDQU16Z256rm:
769  case X86::VMOVDQA32Z256rm:
770  case X86::VMOVDQU32Z256rm:
771  case X86::VMOVDQA64Z256rm:
772  case X86::VMOVDQU64Z256rm:
773  MemBytes = 32;
774  return true;
775  case X86::VMOVAPSZrm:
776  case X86::VMOVUPSZrm:
777  case X86::VMOVAPDZrm:
778  case X86::VMOVUPDZrm:
779  case X86::VMOVDQU8Zrm:
780  case X86::VMOVDQU16Zrm:
781  case X86::VMOVDQA32Zrm:
782  case X86::VMOVDQU32Zrm:
783  case X86::VMOVDQA64Zrm:
784  case X86::VMOVDQU64Zrm:
785  MemBytes = 64;
786  return true;
787  }
788 }
789 
790 static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
791  switch (Opcode) {
792  default:
793  return false;
794  case X86::MOV8mr:
795  case X86::KMOVBmk:
796  MemBytes = 1;
797  return true;
798  case X86::MOV16mr:
799  case X86::KMOVWmk:
800  case X86::VMOVSHZmr:
801  MemBytes = 2;
802  return true;
803  case X86::MOV32mr:
804  case X86::MOVSSmr:
805  case X86::VMOVSSmr:
806  case X86::VMOVSSZmr:
807  case X86::KMOVDmk:
808  MemBytes = 4;
809  return true;
810  case X86::MOV64mr:
811  case X86::ST_FpP64m:
812  case X86::MOVSDmr:
813  case X86::VMOVSDmr:
814  case X86::VMOVSDZmr:
815  case X86::MMX_MOVD64mr:
816  case X86::MMX_MOVQ64mr:
817  case X86::MMX_MOVNTQmr:
818  case X86::KMOVQmk:
819  MemBytes = 8;
820  return true;
821  case X86::MOVAPSmr:
822  case X86::MOVUPSmr:
823  case X86::MOVAPDmr:
824  case X86::MOVUPDmr:
825  case X86::MOVDQAmr:
826  case X86::MOVDQUmr:
827  case X86::VMOVAPSmr:
828  case X86::VMOVUPSmr:
829  case X86::VMOVAPDmr:
830  case X86::VMOVUPDmr:
831  case X86::VMOVDQAmr:
832  case X86::VMOVDQUmr:
833  case X86::VMOVUPSZ128mr:
834  case X86::VMOVAPSZ128mr:
835  case X86::VMOVUPSZ128mr_NOVLX:
836  case X86::VMOVAPSZ128mr_NOVLX:
837  case X86::VMOVUPDZ128mr:
838  case X86::VMOVAPDZ128mr:
839  case X86::VMOVDQA32Z128mr:
840  case X86::VMOVDQU32Z128mr:
841  case X86::VMOVDQA64Z128mr:
842  case X86::VMOVDQU64Z128mr:
843  case X86::VMOVDQU8Z128mr:
844  case X86::VMOVDQU16Z128mr:
845  MemBytes = 16;
846  return true;
847  case X86::VMOVUPSYmr:
848  case X86::VMOVAPSYmr:
849  case X86::VMOVUPDYmr:
850  case X86::VMOVAPDYmr:
851  case X86::VMOVDQUYmr:
852  case X86::VMOVDQAYmr:
853  case X86::VMOVUPSZ256mr:
854  case X86::VMOVAPSZ256mr:
855  case X86::VMOVUPSZ256mr_NOVLX:
856  case X86::VMOVAPSZ256mr_NOVLX:
857  case X86::VMOVUPDZ256mr:
858  case X86::VMOVAPDZ256mr:
859  case X86::VMOVDQU8Z256mr:
860  case X86::VMOVDQU16Z256mr:
861  case X86::VMOVDQA32Z256mr:
862  case X86::VMOVDQU32Z256mr:
863  case X86::VMOVDQA64Z256mr:
864  case X86::VMOVDQU64Z256mr:
865  MemBytes = 32;
866  return true;
867  case X86::VMOVUPSZmr:
868  case X86::VMOVAPSZmr:
869  case X86::VMOVUPDZmr:
870  case X86::VMOVAPDZmr:
871  case X86::VMOVDQU8Zmr:
872  case X86::VMOVDQU16Zmr:
873  case X86::VMOVDQA32Zmr:
874  case X86::VMOVDQU32Zmr:
875  case X86::VMOVDQA64Zmr:
876  case X86::VMOVDQU64Zmr:
877  MemBytes = 64;
878  return true;
879  }
880  return false;
881 }
882 
883 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
884  int &FrameIndex) const {
885  unsigned Dummy;
886  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
887 }
888 
889 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
890  int &FrameIndex,
891  unsigned &MemBytes) const {
892  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
893  if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
894  return MI.getOperand(0).getReg();
895  return 0;
896 }
897 
898 unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
899  int &FrameIndex) const {
900  unsigned Dummy;
901  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
902  unsigned Reg;
903  if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
904  return Reg;
905  // Check for post-frame index elimination operations
906  SmallVector<const MachineMemOperand *, 1> Accesses;
907  if (hasLoadFromStackSlot(MI, Accesses)) {
908  FrameIndex =
909  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
910  ->getFrameIndex();
911  return MI.getOperand(0).getReg();
912  }
913  }
914  return 0;
915 }
916 
917 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
918  int &FrameIndex) const {
919  unsigned Dummy;
920  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
921 }
922 
923 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
924  int &FrameIndex,
925  unsigned &MemBytes) const {
926  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
927  if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
928  isFrameOperand(MI, 0, FrameIndex))
929  return MI.getOperand(X86::AddrNumOperands).getReg();
930  return 0;
931 }
932 
933 unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
934  int &FrameIndex) const {
935  unsigned Dummy;
936  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
937  unsigned Reg;
938  if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
939  return Reg;
940  // Check for post-frame index elimination operations
941  SmallVector<const MachineMemOperand *, 1> Accesses;
942  if (hasStoreToStackSlot(MI, Accesses)) {
943  FrameIndex =
944  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
945  ->getFrameIndex();
946  return MI.getOperand(X86::AddrNumOperands).getReg();
947  }
948  }
949  return 0;
950 }
951 
952 /// Return true if register is PIC base, i.e. defined by X86::MOVPC32r.
953 static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
954  // Don't waste compile time scanning use-def chains of physregs.
955  if (!BaseReg.isVirtual())
956  return false;
957  bool isPICBase = false;
958  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
959  E = MRI.def_instr_end(); I != E; ++I) {
960  MachineInstr *DefMI = &*I;
961  if (DefMI->getOpcode() != X86::MOVPC32r)
962  return false;
963  assert(!isPICBase && "More than one PIC base?");
964  isPICBase = true;
965  }
966  return isPICBase;
967 }
968 
969 bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
970  AAResults *AA) const {
971  switch (MI.getOpcode()) {
972  default:
973  // This function should only be called for opcodes with the ReMaterializable
974  // flag set.
975  llvm_unreachable("Unknown rematerializable operation!");
976  break;
977 
978  case X86::LOAD_STACK_GUARD:
979  case X86::AVX1_SETALLONES:
980  case X86::AVX2_SETALLONES:
981  case X86::AVX512_128_SET0:
982  case X86::AVX512_256_SET0:
983  case X86::AVX512_512_SET0:
984  case X86::AVX512_512_SETALLONES:
985  case X86::AVX512_FsFLD0SD:
986  case X86::AVX512_FsFLD0SH:
987  case X86::AVX512_FsFLD0SS:
988  case X86::AVX512_FsFLD0F128:
989  case X86::AVX_SET0:
990  case X86::FsFLD0SD:
991  case X86::FsFLD0SS:
992  case X86::FsFLD0F128:
993  case X86::KSET0D:
994  case X86::KSET0Q:
995  case X86::KSET0W:
996  case X86::KSET1D:
997  case X86::KSET1Q:
998  case X86::KSET1W:
999  case X86::MMX_SET0:
1000  case X86::MOV32ImmSExti8:
1001  case X86::MOV32r0:
1002  case X86::MOV32r1:
1003  case X86::MOV32r_1:
1004  case X86::MOV32ri64:
1005  case X86::MOV64ImmSExti8:
1006  case X86::V_SET0:
1007  case X86::V_SETALLONES:
1008  case X86::MOV16ri:
1009  case X86::MOV32ri:
1010  case X86::MOV64ri:
1011  case X86::MOV64ri32:
1012  case X86::MOV8ri:
1013  case X86::PTILEZEROV:
1014  return true;
1015 
1016  case X86::MOV8rm:
1017  case X86::MOV8rm_NOREX:
1018  case X86::MOV16rm:
1019  case X86::MOV32rm:
1020  case X86::MOV64rm:
1021  case X86::MOVSSrm:
1022  case X86::MOVSSrm_alt:
1023  case X86::MOVSDrm:
1024  case X86::MOVSDrm_alt:
1025  case X86::MOVAPSrm:
1026  case X86::MOVUPSrm:
1027  case X86::MOVAPDrm:
1028  case X86::MOVUPDrm:
1029  case X86::MOVDQArm:
1030  case X86::MOVDQUrm:
1031  case X86::VMOVSSrm:
1032  case X86::VMOVSSrm_alt:
1033  case X86::VMOVSDrm:
1034  case X86::VMOVSDrm_alt:
1035  case X86::VMOVAPSrm:
1036  case X86::VMOVUPSrm:
1037  case X86::VMOVAPDrm:
1038  case X86::VMOVUPDrm:
1039  case X86::VMOVDQArm:
1040  case X86::VMOVDQUrm:
1041  case X86::VMOVAPSYrm:
1042  case X86::VMOVUPSYrm:
1043  case X86::VMOVAPDYrm:
1044  case X86::VMOVUPDYrm:
1045  case X86::VMOVDQAYrm:
1046  case X86::VMOVDQUYrm:
1047  case X86::MMX_MOVD64rm:
1048  case X86::MMX_MOVQ64rm:
1049  // AVX-512
1050  case X86::VMOVSSZrm:
1051  case X86::VMOVSSZrm_alt:
1052  case X86::VMOVSDZrm:
1053  case X86::VMOVSDZrm_alt:
1054  case X86::VMOVSHZrm:
1055  case X86::VMOVSHZrm_alt:
1056  case X86::VMOVAPDZ128rm:
1057  case X86::VMOVAPDZ256rm:
1058  case X86::VMOVAPDZrm:
1059  case X86::VMOVAPSZ128rm:
1060  case X86::VMOVAPSZ256rm:
1061  case X86::VMOVAPSZ128rm_NOVLX:
1062  case X86::VMOVAPSZ256rm_NOVLX:
1063  case X86::VMOVAPSZrm:
1064  case X86::VMOVDQA32Z128rm:
1065  case X86::VMOVDQA32Z256rm:
1066  case X86::VMOVDQA32Zrm:
1067  case X86::VMOVDQA64Z128rm:
1068  case X86::VMOVDQA64Z256rm:
1069  case X86::VMOVDQA64Zrm:
1070  case X86::VMOVDQU16Z128rm:
1071  case X86::VMOVDQU16Z256rm:
1072  case X86::VMOVDQU16Zrm:
1073  case X86::VMOVDQU32Z128rm:
1074  case X86::VMOVDQU32Z256rm:
1075  case X86::VMOVDQU32Zrm:
1076  case X86::VMOVDQU64Z128rm:
1077  case X86::VMOVDQU64Z256rm:
1078  case X86::VMOVDQU64Zrm:
1079  case X86::VMOVDQU8Z128rm:
1080  case X86::VMOVDQU8Z256rm:
1081  case X86::VMOVDQU8Zrm:
1082  case X86::VMOVUPDZ128rm:
1083  case X86::VMOVUPDZ256rm:
1084  case X86::VMOVUPDZrm:
1085  case X86::VMOVUPSZ128rm:
1086  case X86::VMOVUPSZ256rm:
1087  case X86::VMOVUPSZ128rm_NOVLX:
1088  case X86::VMOVUPSZ256rm_NOVLX:
1089  case X86::VMOVUPSZrm: {
1090  // Loads from constant pools are trivially rematerializable.
1091  if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
1092  MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
1093  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
1094  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
1095  MI.isDereferenceableInvariantLoad(AA)) {
1096  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
1097  if (BaseReg == 0 || BaseReg == X86::RIP)
1098  return true;
1099  // Allow re-materialization of PIC load.
1100  if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
1101  return false;
1102  const MachineFunction &MF = *MI.getParent()->getParent();
1103  const MachineRegisterInfo &MRI = MF.getRegInfo();
1104  return regIsPICBase(BaseReg, MRI);
1105  }
1106  return false;
1107  }
1108 
1109  case X86::LEA32r:
1110  case X86::LEA64r: {
1111  if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
1112  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
1113  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
1114  !MI.getOperand(1 + X86::AddrDisp).isReg()) {
1115  // lea fi#, lea GV, etc. are all rematerializable.
1116  if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
1117  return true;
1118  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
1119  if (BaseReg == 0)
1120  return true;
1121  // Allow re-materialization of lea PICBase + x.
1122  const MachineFunction &MF = *MI.getParent()->getParent();
1123  const MachineRegisterInfo &MRI = MF.getRegInfo();
1124  return regIsPICBase(BaseReg, MRI);
1125  }
1126  return false;
1127  }
1128  }
1129 }
1130 
1131 void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
1132  MachineBasicBlock::iterator I,
1133  Register DestReg, unsigned SubIdx,
1134  const MachineInstr &Orig,
1135  const TargetRegisterInfo &TRI) const {
1136  bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
1137  if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
1138  MachineBasicBlock::LQR_Dead) {
1139  // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
1140  // effects.
1141  int Value;
1142  switch (Orig.getOpcode()) {
1143  case X86::MOV32r0: Value = 0; break;
1144  case X86::MOV32r1: Value = 1; break;
1145  case X86::MOV32r_1: Value = -1; break;
1146  default:
1147  llvm_unreachable("Unexpected instruction!");
1148  }
1149 
1150  const DebugLoc &DL = Orig.getDebugLoc();
1151  BuildMI(MBB, I, DL, get(X86::MOV32ri))
1152  .add(Orig.getOperand(0))
1153  .addImm(Value);
1154  } else {
1155  MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
1156  MBB.insert(I, MI);
1157  }
1158 
1159  MachineInstr &NewMI = *std::prev(I);
1160  NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
1161 }
1162 
1163 /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
1164 static bool hasLiveCondCodeDef(MachineInstr &MI) {
1165  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1166  MachineOperand &MO = MI.getOperand(i);
1167  if (MO.isReg() && MO.isDef() &&
1168  MO.getReg() == X86::EFLAGS && !MO.isDead()) {
1169  return true;
1170  }
1171  }
1172  return false;
1173 }
1174 
1175 /// Return the shift count of a machine operand, truncated to the width the instruction actually uses.
1176 inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1177  unsigned ShiftAmtOperandIdx) {
1178  // The shift count is six bits with the REX.W prefix and five bits without.
1179  unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1180  unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1181  return Imm & ShiftCountMask;
1182 }
1183 
1184 /// Check whether the given shift count can be represented by the scale
1185 /// field of a LEA instruction.
1186 inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
1187  // Left shift instructions can be transformed into load-effective-address
1188  // instructions if we can encode them appropriately.
1189  // A LEA instruction utilizes a SIB byte to encode its scale factor.
1190  // The SIB.scale field is two bits wide which means that we can encode any
1191  // shift amount less than 4.
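 // For example, "shlq $3, %rdi" scales its operand by 1 << 3 == 8, which an
 // LEA can express as "leaq (,%rdi,8), %rdi"; a shift by 4 would need a scale
 // of 16, which the SIB byte cannot encode.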
1192  return ShAmt < 4 && ShAmt > 0;
1193 }
1194 
1195 bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
1196  unsigned Opc, bool AllowSP, Register &NewSrc,
1197  bool &isKill, MachineOperand &ImplicitOp,
1198  LiveVariables *LV) const {
1199  MachineFunction &MF = *MI.getParent()->getParent();
1200  const TargetRegisterClass *RC;
1201  if (AllowSP) {
1202  RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1203  } else {
1204  RC = Opc != X86::LEA32r ?
1205  &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1206  }
1207  Register SrcReg = Src.getReg();
1208  isKill = MI.killsRegister(SrcReg);
1209 
1210  // For both LEA64 and LEA32 the register already has essentially the right
1211  // type (32-bit or 64-bit); we may just need to forbid SP.
1212  if (Opc != X86::LEA64_32r) {
1213  NewSrc = SrcReg;
1214  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1215 
1216  if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1217  return false;
1218 
1219  return true;
1220  }
1221 
1222  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1223  // another we need to add 64-bit registers to the final MI.
1224  if (SrcReg.isPhysical()) {
1225  ImplicitOp = Src;
1226  ImplicitOp.setImplicit();
1227 
1228  NewSrc = getX86SubSuperRegister(SrcReg, 64);
1229  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1230  } else {
1231  // Virtual register of the wrong class, we have to create a temporary 64-bit
1232  // vreg to feed into the LEA.
1233  NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1234  MachineInstr *Copy =
1235  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1236  .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1237  .addReg(SrcReg, getKillRegState(isKill));
1238 
1239  // Which is obviously going to be dead after we're done with it.
1240  isKill = true;
1241 
1242  if (LV)
1243  LV->replaceKillInstruction(SrcReg, MI, *Copy);
1244  }
1245 
1246  // We've set all the parameters without issue.
1247  return true;
1248 }
1249 
1250 MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1251  MachineInstr &MI,
1252  LiveVariables *LV,
1253  bool Is8BitOp) const {
1254  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1255  MachineBasicBlock &MBB = *MI.getParent();
1256  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
1257  assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1258  *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1259  "Unexpected type for LEA transform");
1260 
1261  // TODO: For a 32-bit target, we need to adjust the LEA variables with
1262  // something like this:
1263  // Opcode = X86::LEA32r;
1264  // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1265  // OutRegLEA =
1266  // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1267  // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1268  if (!Subtarget.is64Bit())
1269  return nullptr;
1270 
1271  unsigned Opcode = X86::LEA64_32r;
1272  Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1273  Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1274 
1275  // Build and insert into an implicit UNDEF value. This is OK because
1276  // we will be shifting and then extracting the lower 8/16-bits.
1277  // This has the potential to cause a partial register stall, e.g.
1278  // movw (%rbp,%rcx,2), %dx
1279  // leal -65(%rdx), %esi
1280  // But testing has shown this *does* help performance in 64-bit mode (at
1281  // least on modern x86 machines).
1282  MachineBasicBlock::iterator MBBI = MI.getIterator();
1283  Register Dest = MI.getOperand(0).getReg();
1284  Register Src = MI.getOperand(1).getReg();
1285  bool IsDead = MI.getOperand(0).isDead();
1286  bool IsKill = MI.getOperand(1).isKill();
1287  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1288  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1289  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1290  MachineInstr *InsMI =
1291  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1292  .addReg(InRegLEA, RegState::Define, SubReg)
1293  .addReg(Src, getKillRegState(IsKill));
1294 
1295  MachineInstrBuilder MIB =
1296  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1297  switch (MIOpc) {
1298  default: llvm_unreachable("Unreachable!");
1299  case X86::SHL8ri:
1300  case X86::SHL16ri: {
1301  unsigned ShAmt = MI.getOperand(2).getImm();
1302  MIB.addReg(0).addImm(1ULL << ShAmt)
1303  .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
1304  break;
1305  }
1306  case X86::INC8r:
1307  case X86::INC16r:
1308  addRegOffset(MIB, InRegLEA, true, 1);
1309  break;
1310  case X86::DEC8r:
1311  case X86::DEC16r:
1312  addRegOffset(MIB, InRegLEA, true, -1);
1313  break;
1314  case X86::ADD8ri:
1315  case X86::ADD8ri_DB:
1316  case X86::ADD16ri:
1317  case X86::ADD16ri8:
1318  case X86::ADD16ri_DB:
1319  case X86::ADD16ri8_DB:
1320  addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1321  break;
1322  case X86::ADD8rr:
1323  case X86::ADD8rr_DB:
1324  case X86::ADD16rr:
1325  case X86::ADD16rr_DB: {
1326  Register Src2 = MI.getOperand(2).getReg();
1327  bool IsKill2 = MI.getOperand(2).isKill();
1328  assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1329  unsigned InRegLEA2 = 0;
1330  MachineInstr *InsMI2 = nullptr;
1331  if (Src == Src2) {
1332  // ADD8rr/ADD16rr killed %reg1028, %reg1028
1333  // just a single insert_subreg.
1334  addRegReg(MIB, InRegLEA, true, InRegLEA, false);
1335  } else {
1336  if (Subtarget.is64Bit())
1337  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1338  else
1339  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1340  // Build and insert into an implicit UNDEF value. This is OK because
1341  // we will be shifting and then extracting the lower 8/16-bits.
1342  BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2);
1343  InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1344  .addReg(InRegLEA2, RegState::Define, SubReg)
1345  .addReg(Src2, getKillRegState(IsKill2));
1346  addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
1347  }
1348  if (LV && IsKill2 && InsMI2)
1349  LV->replaceKillInstruction(Src2, MI, *InsMI2);
1350  break;
1351  }
1352  }
1353 
1354  MachineInstr *NewMI = MIB;
1355  MachineInstr *ExtMI =
1356  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1357  .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
1358  .addReg(OutRegLEA, RegState::Kill, SubReg);
1359 
1360  if (LV) {
1361  // Update live variables.
1362  LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1363  LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1364  if (IsKill)
1365  LV->replaceKillInstruction(Src, MI, *InsMI);
1366  if (IsDead)
1367  LV->replaceKillInstruction(Dest, MI, *ExtMI);
1368  }
1369 
1370  return ExtMI;
1371 }
1372 
1373 /// This method must be implemented by targets that
1374 /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1375 /// may be able to convert a two-address instruction into a true
1376 /// three-address instruction on demand. This allows the X86 target (for
1377 /// example) to convert ADD and SHL instructions into LEA instructions if they
1378 /// would require register copies due to two-addressness.
1379 ///
1380 /// This method returns a null pointer if the transformation cannot be
1381 /// performed, otherwise it returns the new instruction.
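/// For example, rather than copying a source register and then adding into the
/// copy, a 32-bit register add can be emitted as "leal (%rdi,%rsi), %eax",
/// computing the sum into a third register while leaving both sources intact.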
1382 ///
1383 MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
1384  LiveVariables *LV) const {
1385  // The following opcodes also set the condition code register(s). Only
1386  // convert them to an equivalent LEA if the condition code register defs
1387  // are dead!
1388  if (hasLiveCondCodeDef(MI))
1389  return nullptr;
1390 
1391  MachineFunction &MF = *MI.getParent()->getParent();
1392  // All instructions input are two-addr instructions. Get the known operands.
1393  const MachineOperand &Dest = MI.getOperand(0);
1394  const MachineOperand &Src = MI.getOperand(1);
1395 
1396  // Ideally, operations with undef should be folded before we get here, but we
1397  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1398  // Without this, we have to forward undef state to new register operands to
1399  // avoid machine verifier errors.
1400  if (Src.isUndef())
1401  return nullptr;
1402  if (MI.getNumOperands() > 2)
1403  if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1404  return nullptr;
1405 
1406  MachineInstr *NewMI = nullptr;
1407  bool Is64Bit = Subtarget.is64Bit();
1408 
1409  bool Is8BitOp = false;
1410  unsigned MIOpc = MI.getOpcode();
1411  switch (MIOpc) {
1412  default: llvm_unreachable("Unreachable!");
1413  case X86::SHL64ri: {
1414  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1415  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1416  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1417 
1418  // LEA can't handle RSP.
1419  if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1420  Src.getReg(), &X86::GR64_NOSPRegClass))
1421  return nullptr;
1422 
1423  NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1424  .add(Dest)
1425  .addReg(0)
1426  .addImm(1ULL << ShAmt)
1427  .add(Src)
1428  .addImm(0)
1429  .addReg(0);
1430  break;
1431  }
1432  case X86::SHL32ri: {
1433  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1434  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1435  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1436 
1437  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1438 
1439  // LEA can't handle ESP.
1440  bool isKill;
1441  Register SrcReg;
1442  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1443  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
1444  SrcReg, isKill, ImplicitOp, LV))
1445  return nullptr;
1446 
1447  MachineInstrBuilder MIB =
1448  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1449  .add(Dest)
1450  .addReg(0)
1451  .addImm(1ULL << ShAmt)
1452  .addReg(SrcReg, getKillRegState(isKill))
1453  .addImm(0)
1454  .addReg(0);
1455  if (ImplicitOp.getReg() != 0)
1456  MIB.add(ImplicitOp);
1457  NewMI = MIB;
1458 
1459  break;
1460  }
1461  case X86::SHL8ri:
1462  Is8BitOp = true;
1463  LLVM_FALLTHROUGH;
1464  case X86::SHL16ri: {
1465  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1466  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1467  if (!isTruncatedShiftCountForLEA(ShAmt))
1468  return nullptr;
1469  return convertToThreeAddressWithLEA(MIOpc, MI, LV, Is8BitOp);
1470  }
1471  case X86::INC64r:
1472  case X86::INC32r: {
1473  assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1474  unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
1475  (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1476  bool isKill;
1477  Register SrcReg;
1478  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1479  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
1480  ImplicitOp, LV))
1481  return nullptr;
1482 
1483  MachineInstrBuilder MIB =
1484  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1485  .add(Dest)
1486  .addReg(SrcReg, getKillRegState(isKill));
1487  if (ImplicitOp.getReg() != 0)
1488  MIB.add(ImplicitOp);
1489 
1490  NewMI = addOffset(MIB, 1);
1491  break;
1492  }
1493  case X86::DEC64r:
1494  case X86::DEC32r: {
1495  assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1496  unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
1497  : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1498 
1499  bool isKill;
1500  Register SrcReg;
1501  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1502  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
1503  ImplicitOp, LV))
1504  return nullptr;
1505 
1506  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1507  .add(Dest)
1508  .addReg(SrcReg, getKillRegState(isKill));
1509  if (ImplicitOp.getReg() != 0)
1510  MIB.add(ImplicitOp);
1511 
1512  NewMI = addOffset(MIB, -1);
1513 
1514  break;
1515  }
1516  case X86::DEC8r:
1517  case X86::INC8r:
1518  Is8BitOp = true;
1519  LLVM_FALLTHROUGH;
1520  case X86::DEC16r:
1521  case X86::INC16r:
1522  return convertToThreeAddressWithLEA(MIOpc, MI, LV, Is8BitOp);
1523  case X86::ADD64rr:
1524  case X86::ADD64rr_DB:
1525  case X86::ADD32rr:
1526  case X86::ADD32rr_DB: {
1527  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1528  unsigned Opc;
1529  if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1530  Opc = X86::LEA64r;
1531  else
1532  Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1533 
1534  const MachineOperand &Src2 = MI.getOperand(2);
1535  bool isKill2;
1536  Register SrcReg2;
1537  MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1538  if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
1539  SrcReg2, isKill2, ImplicitOp2, LV))
1540  return nullptr;
1541 
1542  bool isKill;
1543  Register SrcReg;
1544  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1545  if (Src.getReg() == Src2.getReg()) {
1546  // Don't call classifyLEAReg a second time on the same register, in case
1547  // the first call inserted a COPY from Src2 and marked it as killed.
1548  isKill = isKill2;
1549  SrcReg = SrcReg2;
1550  } else {
1551  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true,
1552  SrcReg, isKill, ImplicitOp, LV))
1553  return nullptr;
1554  }
1555 
1556  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1557  if (ImplicitOp.getReg() != 0)
1558  MIB.add(ImplicitOp);
1559  if (ImplicitOp2.getReg() != 0)
1560  MIB.add(ImplicitOp2);
1561 
1562  NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1563  if (LV && Src2.isKill())
1564  LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
1565  break;
1566  }
1567  case X86::ADD8rr:
1568  case X86::ADD8rr_DB:
1569  Is8BitOp = true;
1570  LLVM_FALLTHROUGH;
1571  case X86::ADD16rr:
1572  case X86::ADD16rr_DB:
1573  return convertToThreeAddressWithLEA(MIOpc, MI, LV, Is8BitOp);
1574  case X86::ADD64ri32:
1575  case X86::ADD64ri8:
1576  case X86::ADD64ri32_DB:
1577  case X86::ADD64ri8_DB:
1578  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1579  NewMI = addOffset(
1580  BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1581  MI.getOperand(2));
1582  break;
1583  case X86::ADD32ri:
1584  case X86::ADD32ri8:
1585  case X86::ADD32ri_DB:
1586  case X86::ADD32ri8_DB: {
1587  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1588  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1589 
1590  bool isKill;
1591  Register SrcReg;
1592  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1593  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1594  SrcReg, isKill, ImplicitOp, LV))
1595  return nullptr;
1596 
1597  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1598  .add(Dest)
1599  .addReg(SrcReg, getKillRegState(isKill));
1600  if (ImplicitOp.getReg() != 0)
1601  MIB.add(ImplicitOp);
1602 
1603  NewMI = addOffset(MIB, MI.getOperand(2));
1604  break;
1605  }
1606  case X86::ADD8ri:
1607  case X86::ADD8ri_DB:
1608  Is8BitOp = true;
1609  LLVM_FALLTHROUGH;
1610  case X86::ADD16ri:
1611  case X86::ADD16ri8:
1612  case X86::ADD16ri_DB:
1613  case X86::ADD16ri8_DB:
1614  return convertToThreeAddressWithLEA(MIOpc, MI, LV, Is8BitOp);
1615  case X86::SUB8ri:
1616  case X86::SUB16ri8:
1617  case X86::SUB16ri:
1618  /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1619  return nullptr;
1620  case X86::SUB32ri8:
1621  case X86::SUB32ri: {
1622  if (!MI.getOperand(2).isImm())
1623  return nullptr;
1624  int64_t Imm = MI.getOperand(2).getImm();
1625  if (!isInt<32>(-Imm))
1626  return nullptr;
1627 
1628  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1629  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1630 
1631  bool isKill;
1632  Register SrcReg;
1633  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1634  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1635  SrcReg, isKill, ImplicitOp, LV))
1636  return nullptr;
1637 
1638  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1639  .add(Dest)
1640  .addReg(SrcReg, getKillRegState(isKill));
1641  if (ImplicitOp.getReg() != 0)
1642  MIB.add(ImplicitOp);
1643 
1644  NewMI = addOffset(MIB, -Imm);
1645  break;
1646  }
1647 
1648  case X86::SUB64ri8:
1649  case X86::SUB64ri32: {
1650  if (!MI.getOperand(2).isImm())
1651  return nullptr;
1652  int64_t Imm = MI.getOperand(2).getImm();
1653  if (!isInt<32>(-Imm))
1654  return nullptr;
1655 
1656  assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1657 
1658  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
1659  get(X86::LEA64r)).add(Dest).add(Src);
1660  NewMI = addOffset(MIB, -Imm);
1661  break;
1662  }
1663 
1664  case X86::VMOVDQU8Z128rmk:
1665  case X86::VMOVDQU8Z256rmk:
1666  case X86::VMOVDQU8Zrmk:
1667  case X86::VMOVDQU16Z128rmk:
1668  case X86::VMOVDQU16Z256rmk:
1669  case X86::VMOVDQU16Zrmk:
1670  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
1671  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
1672  case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
1673  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
1674  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
1675  case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
1676  case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
1677  case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
1678  case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
1679  case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
1680  case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
1681  case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
1682  case X86::VBROADCASTSDZ256rmk:
1683  case X86::VBROADCASTSDZrmk:
1684  case X86::VBROADCASTSSZ128rmk:
1685  case X86::VBROADCASTSSZ256rmk:
1686  case X86::VBROADCASTSSZrmk:
1687  case X86::VPBROADCASTDZ128rmk:
1688  case X86::VPBROADCASTDZ256rmk:
1689  case X86::VPBROADCASTDZrmk:
1690  case X86::VPBROADCASTQZ128rmk:
1691  case X86::VPBROADCASTQZ256rmk:
1692  case X86::VPBROADCASTQZrmk: {
1693  unsigned Opc;
1694  switch (MIOpc) {
1695  default: llvm_unreachable("Unreachable!");
1696  case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
1697  case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
1698  case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
1699  case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
1700  case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
1701  case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
1702  case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1703  case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1704  case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1705  case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1706  case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1707  case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1708  case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1709  case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1710  case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1711  case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1712  case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1713  case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1714  case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1715  case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1716  case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1717  case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1718  case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1719  case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1720  case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1721  case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1722  case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1723  case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1724  case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1725  case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1726  case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
1727  case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break;
1728  case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
1729  case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
1730  case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break;
1731  case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
1732  case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
1733  case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break;
1734  case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
1735  case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
1736  case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break;
1737  }
1738 
1739  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1740  .add(Dest)
1741  .add(MI.getOperand(2))
1742  .add(Src)
1743  .add(MI.getOperand(3))
1744  .add(MI.getOperand(4))
1745  .add(MI.getOperand(5))
1746  .add(MI.getOperand(6))
1747  .add(MI.getOperand(7));
1748  break;
1749  }
1750 
1751  case X86::VMOVDQU8Z128rrk:
1752  case X86::VMOVDQU8Z256rrk:
1753  case X86::VMOVDQU8Zrrk:
1754  case X86::VMOVDQU16Z128rrk:
1755  case X86::VMOVDQU16Z256rrk:
1756  case X86::VMOVDQU16Zrrk:
1757  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
1758  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
1759  case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
1760  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
1761  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
1762  case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
1763  case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
1764  case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
1765  case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
1766  case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
1767  case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
1768  case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
1769  unsigned Opc;
1770  switch (MIOpc) {
1771  default: llvm_unreachable("Unreachable!");
1772  case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
1773  case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
1774  case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
1775  case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
1776  case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
1777  case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
1778  case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1779  case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1780  case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1781  case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1782  case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1783  case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1784  case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1785  case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1786  case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1787  case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1788  case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1789  case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1790  case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1791  case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1792  case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1793  case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1794  case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1795  case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1796  case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1797  case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1798  case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1799  case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1800  case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1801  case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1802  }
1803 
1804  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1805  .add(Dest)
1806  .add(MI.getOperand(2))
1807  .add(Src)
1808  .add(MI.getOperand(3));
1809  break;
1810  }
1811  }
1812 
1813  if (!NewMI) return nullptr;
1814 
1815  if (LV) { // Update live variables
1816  if (Src.isKill())
1817  LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
1818  if (Dest.isDead())
1819  LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
1820  }
1821 
1822  MachineBasicBlock &MBB = *MI.getParent();
1823  MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
1824  return NewMI;
1825 }
1826 
1827 /// This determines which of three possible cases of a three source commute
1829 /// the source indexes correspond to, taking into account any mask operands.
1830 /// Commuting a passthru operand is not allowed; index pairs that do not match
1831 /// one of the cases below are unreachable.
1831 /// Case 0 - Possible to commute the first and second operands.
1832 /// Case 1 - Possible to commute the first and third operands.
1833 /// Case 2 - Possible to commute the second and third operands.
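/// For example, for an unmasked three-source instruction the sources sit at
/// operand indices 1, 2 and 3, so a commute of indices (1, 3) is Case 1; for a
/// k-masked form the mask at index 2 pushes the last two sources to indices 3
/// and 4, so the same commute is requested as (1, 4) and still maps to Case 1.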
1834 static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
1835  unsigned SrcOpIdx2) {
1836  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
1837  if (SrcOpIdx1 > SrcOpIdx2)
1838  std::swap(SrcOpIdx1, SrcOpIdx2);
1839 
1840  unsigned Op1 = 1, Op2 = 2, Op3 = 3;
1841  if (X86II::isKMasked(TSFlags)) {
1842  Op2++;
1843  Op3++;
1844  }
1845 
1846  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
1847  return 0;
1848  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
1849  return 1;
1850  if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
1851  return 2;
1852  llvm_unreachable("Unknown three src commute case.");
1853 }
1854 
1855 unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
1856  const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
1857  const X86InstrFMA3Group &FMA3Group) const {
1858 
1859  unsigned Opc = MI.getOpcode();
1860 
1861  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
1862  // analysis. The commute optimization is legal only if all users of FMA*_Int
1863  // use only the lowest element of the FMA*_Int instruction. Such an
1864  // analysis is not implemented yet, so that case is rejected by the assert
1865  // below. When the analysis becomes available, this will be the right
1866  // place to call it.
1867  assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
1868  "Intrinsic instructions can't commute operand 1");
1869 
1870  // Determine which case this commute is or if it can't be done.
1871  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1872  SrcOpIdx2);
1873  assert(Case < 3 && "Unexpected case number!");
1874 
1875  // Define the FMA forms mapping array that helps to map input FMA form
1876  // to output FMA form to preserve the operation semantics after
1877  // commuting the operands.
1878  const unsigned Form132Index = 0;
1879  const unsigned Form213Index = 1;
1880  const unsigned Form231Index = 2;
1881  static const unsigned FormMapping[][3] = {
1882  // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
1883  // FMA132 A, C, b; ==> FMA231 C, A, b;
1884  // FMA213 B, A, c; ==> FMA213 A, B, c;
1885  // FMA231 C, A, b; ==> FMA132 A, C, b;
1886  { Form231Index, Form213Index, Form132Index },
1887  // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
1888  // FMA132 A, c, B; ==> FMA132 B, c, A;
1889  // FMA213 B, a, C; ==> FMA231 C, a, B;
1890  // FMA231 C, a, B; ==> FMA213 B, a, C;
1891  { Form132Index, Form231Index, Form213Index },
1892  // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
1893  // FMA132 a, C, B; ==> FMA213 a, B, C;
1894  // FMA213 b, A, C; ==> FMA132 b, C, A;
1895  // FMA231 c, A, B; ==> FMA231 c, B, A;
1896  { Form213Index, Form132Index, Form231Index }
1897  };
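 // For example, commuting the 2nd and 3rd sources (Case 2) of a 213 form
 // gives a 132 form: FMA213 computes src2*src1 + src3, and with src2 and src3
 // swapped the same value is obtained from FMA132, which computes
 // src1*src3 + src2.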
1898 
1899  unsigned FMAForms[3];
1900  FMAForms[0] = FMA3Group.get132Opcode();
1901  FMAForms[1] = FMA3Group.get213Opcode();
1902  FMAForms[2] = FMA3Group.get231Opcode();
1903  unsigned FormIndex;
1904  for (FormIndex = 0; FormIndex < 3; FormIndex++)
1905  if (Opc == FMAForms[FormIndex])
1906  break;
1907 
1908  // Everything is ready, just adjust the FMA opcode and return it.
1909  FormIndex = FormMapping[Case][FormIndex];
1910  return FMAForms[FormIndex];
1911 }
1912 
1913 static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
1914  unsigned SrcOpIdx2) {
1915  // Determine which case this commute is or if it can't be done.
1916  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1917  SrcOpIdx2);
1918  assert(Case < 3 && "Unexpected case value!");
1919 
1920  // For each case we need to swap two pairs of bits in the final immediate.
1921  static const uint8_t SwapMasks[3][4] = {
1922  { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
1923  { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
1924  { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
1925  };
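 // For example, commuting the 1st and 2nd sources (Case 0) of a VPTERNLOG
 // with immediate 0xCA (src1 ? src2 : src3) swaps bits 2/4 and 3/5 and
 // produces 0xE2, the truth table of src2 ? src1 : src3.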
1926 
1927  uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
1928  // Clear out the bits we are swapping.
1929  uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
1930  SwapMasks[Case][2] | SwapMasks[Case][3]);
1931  // If the immediate had a bit of the pair set, then set the opposite bit.
1932  if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
1933  if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
1934  if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
1935  if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
1936  MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
1937 }
1938 
1939 // Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
1940 // commuted.
1941 static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
1942 #define VPERM_CASES(Suffix) \
1943  case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
1944  case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
1945  case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
1946  case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
1947  case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
1948  case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
1949  case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
1950  case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
1951  case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
1952  case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
1953  case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
1954  case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
1955 
1956 #define VPERM_CASES_BROADCAST(Suffix) \
1957  VPERM_CASES(Suffix) \
1958  case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
1959  case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
1960  case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
1961  case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
1962  case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
1963  case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
1964 
1965  switch (Opcode) {
1966  default: return false;
1967  VPERM_CASES(B)
1968  VPERM_CASES_BROADCAST(D)
1969  VPERM_CASES_BROADCAST(PD)
1970  VPERM_CASES_BROADCAST(PS)
1971  VPERM_CASES_BROADCAST(Q)
1972  VPERM_CASES(W)
1973  return true;
1974  }
1975 #undef VPERM_CASES_BROADCAST
1976 #undef VPERM_CASES
1977 }
1978 
1979 // Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
1980 // from the I opcode to the T opcode and vice versa.
1981 static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
1982 #define VPERM_CASES(Orig, New) \
1983  case X86::Orig##128rr: return X86::New##128rr; \
1984  case X86::Orig##128rrkz: return X86::New##128rrkz; \
1985  case X86::Orig##128rm: return X86::New##128rm; \
1986  case X86::Orig##128rmkz: return X86::New##128rmkz; \
1987  case X86::Orig##256rr: return X86::New##256rr; \
1988  case X86::Orig##256rrkz: return X86::New##256rrkz; \
1989  case X86::Orig##256rm: return X86::New##256rm; \
1990  case X86::Orig##256rmkz: return X86::New##256rmkz; \
1991  case X86::Orig##rr: return X86::New##rr; \
1992  case X86::Orig##rrkz: return X86::New##rrkz; \
1993  case X86::Orig##rm: return X86::New##rm; \
1994  case X86::Orig##rmkz: return X86::New##rmkz;
1995 
1996 #define VPERM_CASES_BROADCAST(Orig, New) \
1997  VPERM_CASES(Orig, New) \
1998  case X86::Orig##128rmb: return X86::New##128rmb; \
1999  case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
2000  case X86::Orig##256rmb: return X86::New##256rmb; \
2001  case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
2002  case X86::Orig##rmb: return X86::New##rmb; \
2003  case X86::Orig##rmbkz: return X86::New##rmbkz;
2004 
2005  switch (Opcode) {
2006  VPERM_CASES(VPERMI2B, VPERMT2B)
2007  VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2008  VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2009  VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2010  VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2011  VPERM_CASES(VPERMI2W, VPERMT2W)
2012  VPERM_CASES(VPERMT2B, VPERMI2B)
2013  VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2014  VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2015  VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2016  VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2017  VPERM_CASES(VPERMT2W, VPERMI2W)
2018  }
2019 
2020  llvm_unreachable("Unreachable!");
2021 #undef VPERM_CASES_BROADCAST
2022 #undef VPERM_CASES
2023 }
2024 
2025 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2026  unsigned OpIdx1,
2027  unsigned OpIdx2) const {
2028  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
2029  if (NewMI)
2030  return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
2031  return MI;
2032  };
2033 
2034  switch (MI.getOpcode()) {
2035  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
2036  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
2037  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
2038  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
2039  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
2040  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
2041  unsigned Opc;
2042  unsigned Size;
2043  switch (MI.getOpcode()) {
2044  default: llvm_unreachable("Unreachable!");
2045  case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
2046  case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
2047  case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
2048  case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
2049  case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
2050  case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
2051  }
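 // For example, with Size = 32 and an immediate of 8,
 // SHLD32rri8 B, C, 8 = (B << 8) | (C >> 24) = SHRD32rri8 C, B, 24,
 // so the commuted form uses the immediate Size - Amt.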
2052  unsigned Amt = MI.getOperand(3).getImm();
2053  auto &WorkingMI = cloneIfNew(MI);
2054  WorkingMI.setDesc(get(Opc));
2055  WorkingMI.getOperand(3).setImm(Size - Amt);
2056  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2057  OpIdx1, OpIdx2);
2058  }
2059  case X86::PFSUBrr:
2060  case X86::PFSUBRrr: {
2061  // PFSUB x, y: x = x - y
2062  // PFSUBR x, y: x = y - x
2063  unsigned Opc =
2064  (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
2065  auto &WorkingMI = cloneIfNew(MI);
2066  WorkingMI.setDesc(get(Opc));
2067  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2068  OpIdx1, OpIdx2);
2069  }
2070  case X86::BLENDPDrri:
2071  case X86::BLENDPSrri:
2072  case X86::VBLENDPDrri:
2073  case X86::VBLENDPSrri:
2074  // If we're optimizing for size, try to use MOVSD/MOVSS.
2075  if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
2076  unsigned Mask, Opc;
2077  switch (MI.getOpcode()) {
2078  default: llvm_unreachable("Unreachable!");
2079  case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
2080  case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
2081  case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
2082  case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
2083  }
2084  if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
2085  auto &WorkingMI = cloneIfNew(MI);
2086  WorkingMI.setDesc(get(Opc));
2087  WorkingMI.RemoveOperand(3);
2088  return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
2089  /*NewMI=*/false,
2090  OpIdx1, OpIdx2);
2091  }
2092  }
2093  LLVM_FALLTHROUGH;
2094  case X86::PBLENDWrri:
2095  case X86::VBLENDPDYrri:
2096  case X86::VBLENDPSYrri:
2097  case X86::VPBLENDDrri:
2098  case X86::VPBLENDWrri:
2099  case X86::VPBLENDDYrri:
2100  case X86::VPBLENDWYrri:{
2101  int8_t Mask;
2102  switch (MI.getOpcode()) {
2103  default: llvm_unreachable("Unreachable!");
2104  case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
2105  case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
2106  case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
2107  case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
2108  case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
2109  case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
2110  case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
2111  case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
2112  case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
2113  case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
2114  case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
2115  }
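 // For example, BLENDPSrri with immediate 0x02 takes lane 1 from the second
 // source; once the sources are swapped the equivalent immediate is
 // 0x0F ^ 0x02 = 0x0D, taking lanes 0, 2 and 3 from the new second source.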
2116  // Only the least significant bits of Imm are used.
2117  // Using int8_t to ensure it will be sign extended to the int64_t that
2118  // setImm takes in order to match isel behavior.
2119  int8_t Imm = MI.getOperand(3).getImm() & Mask;
2120  auto &WorkingMI = cloneIfNew(MI);
2121  WorkingMI.getOperand(3).setImm(Mask ^ Imm);
2122  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2123  OpIdx1, OpIdx2);
2124  }
2125  case X86::INSERTPSrr:
2126  case X86::VINSERTPSrr:
2127  case X86::VINSERTPSZrr: {
2128  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2129  unsigned ZMask = Imm & 15;
2130  unsigned DstIdx = (Imm >> 4) & 3;
2131  unsigned SrcIdx = (Imm >> 6) & 3;
2132 
2133  // We can commute insertps if we zero 2 of the elements, the insertion is
2134  // "inline" and we don't override the insertion with a zero.
2135  if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2136  countPopulation(ZMask) == 2) {
2137  unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
2138  assert(AltIdx < 4 && "Illegal insertion index");
2139  unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2140  auto &WorkingMI = cloneIfNew(MI);
2141  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2142  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2143  OpIdx1, OpIdx2);
2144  }
2145  return nullptr;
2146  }
2147  case X86::MOVSDrr:
2148  case X86::MOVSSrr:
2149  case X86::VMOVSDrr:
2150  case X86::VMOVSSrr:{
2151  // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2152  if (Subtarget.hasSSE41()) {
2153  unsigned Mask, Opc;
2154  switch (MI.getOpcode()) {
2155  default: llvm_unreachable("Unreachable!");
2156  case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
2157  case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
2158  case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
2159  case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
2160  }
2161 
2162  auto &WorkingMI = cloneIfNew(MI);
2163  WorkingMI.setDesc(get(Opc));
2164  WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
2165  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2166  OpIdx1, OpIdx2);
2167  }
2168 
2169  // Convert to SHUFPD.
2170  assert(MI.getOpcode() == X86::MOVSDrr &&
2171  "Can only commute MOVSDrr without SSE4.1");
2172 
2173  auto &WorkingMI = cloneIfNew(MI);
2174  WorkingMI.setDesc(get(X86::SHUFPDrri));
2175  WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
2176  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2177  OpIdx1, OpIdx2);
2178  }
2179  case X86::SHUFPDrri: {
2180  // Commute to MOVSD.
2181  assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2182  auto &WorkingMI = cloneIfNew(MI);
2183  WorkingMI.setDesc(get(X86::MOVSDrr));
2184  WorkingMI.RemoveOperand(3);
2185  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2186  OpIdx1, OpIdx2);
2187  }
2188  case X86::PCLMULQDQrr:
2189  case X86::VPCLMULQDQrr:
2190  case X86::VPCLMULQDQYrr:
2191  case X86::VPCLMULQDQZrr:
2192  case X86::VPCLMULQDQZ128rr:
2193  case X86::VPCLMULQDQZ256rr: {
2194  // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2195  // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
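 // For example, an immediate of 0x10 (multiply SRC1[63:0] by SRC2[127:64])
 // becomes 0x01 once the two sources are swapped.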
2196  unsigned Imm = MI.getOperand(3).getImm();
2197  unsigned Src1Hi = Imm & 0x01;
2198  unsigned Src2Hi = Imm & 0x10;
2199  auto &WorkingMI = cloneIfNew(MI);
2200  WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2201  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2202  OpIdx1, OpIdx2);
2203  }
2204  case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
2205  case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
2206  case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
2207  case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
2208  case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
2209  case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
2210  case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
2211  case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
2212  case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
2213  case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
2214  case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
2215  case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
2216  case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
2217  case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
2218  case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
2219  case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
2220  case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
2221  case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
2222  case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
2223  case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
2224  case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
2225  case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
2226  case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
2227  case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
2228  // Flip comparison mode immediate (if necessary).
2229  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
2230  Imm = X86::getSwappedVPCMPImm(Imm);
2231  auto &WorkingMI = cloneIfNew(MI);
2232  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
2233  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2234  OpIdx1, OpIdx2);
2235  }
2236  case X86::VPCOMBri: case X86::VPCOMUBri:
2237  case X86::VPCOMDri: case X86::VPCOMUDri:
2238  case X86::VPCOMQri: case X86::VPCOMUQri:
2239  case X86::VPCOMWri: case X86::VPCOMUWri: {
2240  // Flip comparison mode immediate (if necessary).
2241  unsigned Imm = MI.getOperand(3).getImm() & 0x7;
2242  Imm = X86::getSwappedVPCOMImm(Imm);
2243  auto &WorkingMI = cloneIfNew(MI);
2244  WorkingMI.getOperand(3).setImm(Imm);
2245  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2246  OpIdx1, OpIdx2);
2247  }
2248  case X86::VCMPSDZrr:
2249  case X86::VCMPSSZrr:
2250  case X86::VCMPPDZrri:
2251  case X86::VCMPPSZrri:
2252  case X86::VCMPSHZrr:
2253  case X86::VCMPPHZrri:
2254  case X86::VCMPPHZ128rri:
2255  case X86::VCMPPHZ256rri:
2256  case X86::VCMPPDZ128rri:
2257  case X86::VCMPPSZ128rri:
2258  case X86::VCMPPDZ256rri:
2259  case X86::VCMPPSZ256rri:
2260  case X86::VCMPPDZrrik:
2261  case X86::VCMPPSZrrik:
2262  case X86::VCMPPDZ128rrik:
2263  case X86::VCMPPSZ128rrik:
2264  case X86::VCMPPDZ256rrik:
2265  case X86::VCMPPSZ256rrik: {
2266  unsigned Imm =
2267  MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
2268  Imm = X86::getSwappedVCMPImm(Imm);
2269  auto &WorkingMI = cloneIfNew(MI);
2270  WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
2271  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2272  OpIdx1, OpIdx2);
2273  }
2274  case X86::VPERM2F128rr:
2275  case X86::VPERM2I128rr: {
2276  // Flip permute source immediate.
2277  // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2278  // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2279  int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
2280  auto &WorkingMI = cloneIfNew(MI);
2281  WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
2282  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2283  OpIdx1, OpIdx2);
2284  }
2285  case X86::MOVHLPSrr:
2286  case X86::UNPCKHPDrr:
2287  case X86::VMOVHLPSrr:
2288  case X86::VUNPCKHPDrr:
2289  case X86::VMOVHLPSZrr:
2290  case X86::VUNPCKHPDZ128rr: {
2291  assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2292 
2293  unsigned Opc = MI.getOpcode();
2294  switch (Opc) {
2295  default: llvm_unreachable("Unreachable!");
2296  case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
2297  case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
2298  case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
2299  case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
2300  case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
2301  case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
2302  }
2303  auto &WorkingMI = cloneIfNew(MI);
2304  WorkingMI.setDesc(get(Opc));
2305  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2306  OpIdx1, OpIdx2);
2307  }
2308  case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
2309  auto &WorkingMI = cloneIfNew(MI);
2310  unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2311  X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2312  WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2313  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2314  OpIdx1, OpIdx2);
2315  }
2316  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2317  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2318  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2319  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2320  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2321  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2322  case X86::VPTERNLOGDZrrik:
2323  case X86::VPTERNLOGDZ128rrik:
2324  case X86::VPTERNLOGDZ256rrik:
2325  case X86::VPTERNLOGQZrrik:
2326  case X86::VPTERNLOGQZ128rrik:
2327  case X86::VPTERNLOGQZ256rrik:
2328  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2329  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2330  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2331  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2332  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2333  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2334  case X86::VPTERNLOGDZ128rmbi:
2335  case X86::VPTERNLOGDZ256rmbi:
2336  case X86::VPTERNLOGDZrmbi:
2337  case X86::VPTERNLOGQZ128rmbi:
2338  case X86::VPTERNLOGQZ256rmbi:
2339  case X86::VPTERNLOGQZrmbi:
2340  case X86::VPTERNLOGDZ128rmbikz:
2341  case X86::VPTERNLOGDZ256rmbikz:
2342  case X86::VPTERNLOGDZrmbikz:
2343  case X86::VPTERNLOGQZ128rmbikz:
2344  case X86::VPTERNLOGQZ256rmbikz:
2345  case X86::VPTERNLOGQZrmbikz: {
2346  auto &WorkingMI = cloneIfNew(MI);
2347  commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
2348  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2349  OpIdx1, OpIdx2);
2350  }
2351  default: {
2352  if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
2353  unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
2354  auto &WorkingMI = cloneIfNew(MI);
2355  WorkingMI.setDesc(get(Opc));
2356  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2357  OpIdx1, OpIdx2);
2358  }
2359 
2360  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2361  MI.getDesc().TSFlags);
2362  if (FMA3Group) {
2363  unsigned Opc =
2364  getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
2365  auto &WorkingMI = cloneIfNew(MI);
2366  WorkingMI.setDesc(get(Opc));
2367  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2368  OpIdx1, OpIdx2);
2369  }
2370 
2371  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2372  }
2373  }
2374 }
2375 
2376 bool
2377 X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2378  unsigned &SrcOpIdx1,
2379  unsigned &SrcOpIdx2,
2380  bool IsIntrinsic) const {
2381  uint64_t TSFlags = MI.getDesc().TSFlags;
2382 
2383  unsigned FirstCommutableVecOp = 1;
2384  unsigned LastCommutableVecOp = 3;
2385  unsigned KMaskOp = -1U;
2386  if (X86II::isKMasked(TSFlags)) {
2387  // For k-zero-masked operations it is Ok to commute the first vector
2388  // operand, unless this is an intrinsic instruction.
2389  // For merge-masked operations a conservative choice is made and the first
2390  // vector operand is not commuted, because its elements for which the
2391  // corresponding bit in the k-mask operand is set to 0 are copied to the
2392  // result of the instruction.
2393  // TODO/FIXME: The commute still may be legal if it is known that the
2394  // k-mask operand is set to either all ones or all zeroes.
2395  // It is also Ok to commute the 1st operand if all users of MI use only
2396  // the elements enabled by the k-mask operand. For example,
2397  // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2398  // : v1[i];
2399  // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2400  // // Ok, to commute v1 in FMADD213PSZrk.
2401 
2402  // The k-mask operand has index = 2 for masked and zero-masked operations.
2403  KMaskOp = 2;
2404 
2405  // The operand with index = 1 is used as a source for those elements for
2406  // which the corresponding bit in the k-mask is set to 0.
2407  if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2408  FirstCommutableVecOp = 3;
2409 
2410  LastCommutableVecOp++;
2411  } else if (IsIntrinsic) {
2412  // Commuting the first operand of an intrinsic instruction isn't possible
2413  // unless we can prove that only the lowest element of the result is used.
2414  FirstCommutableVecOp = 2;
2415  }
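 // For example, for an unmasked 3-src FMA the commutable operands are 1-3;
 // for a merge-masked FMA (passthru at index 1, k-mask at index 2) only
 // operands 3 and 4 are commutable, and a trailing memory operand is
 // excluded just below.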
2416 
2417  if (isMem(MI, LastCommutableVecOp))
2418  LastCommutableVecOp--;
2419 
2420  // Only the operands from FirstCommutableVecOp to LastCommutableVecOp are commutable.
2421  // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2422  // that the operand is not specified/fixed.
2423  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2424  (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2425  SrcOpIdx1 == KMaskOp))
2426  return false;
2427  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2428  (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2429  SrcOpIdx2 == KMaskOp))
2430  return false;
2431 
2432  // Look for two different register operands assumed to be commutable
2433  // regardless of the FMA opcode. The FMA opcode is adjusted later.
2434  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2435  SrcOpIdx2 == CommuteAnyOperandIndex) {
2436  unsigned CommutableOpIdx2 = SrcOpIdx2;
2437 
2438  // At least one of operands to be commuted is not specified and
2439  // this method is free to choose appropriate commutable operands.
2440  if (SrcOpIdx1 == SrcOpIdx2)
2441  // Both of operands are not fixed. By default set one of commutable
2442  // operands to the last register operand of the instruction.
2443  CommutableOpIdx2 = LastCommutableVecOp;
2444  else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2445  // Only one of operands is not fixed.
2446  CommutableOpIdx2 = SrcOpIdx1;
2447 
2448  // CommutableOpIdx2 is well defined now. Let's choose another commutable
2449  // operand and assign its index to CommutableOpIdx1.
2450  Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2451 
2452  unsigned CommutableOpIdx1;
2453  for (CommutableOpIdx1 = LastCommutableVecOp;
2454  CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2455  // Just ignore and skip the k-mask operand.
2456  if (CommutableOpIdx1 == KMaskOp)
2457  continue;
2458 
2459  // The commuted operands must have different registers.
2460  // Otherwise, the commute transformation does not change anything and
2461  // is useless then.
2462  if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2463  break;
2464  }
2465 
2466  // No appropriate commutable operands were found.
2467  if (CommutableOpIdx1 < FirstCommutableVecOp)
2468  return false;
2469 
2470  // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
2471  // to return those values.
2472  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2473  CommutableOpIdx1, CommutableOpIdx2))
2474  return false;
2475  }
2476 
2477  return true;
2478 }
2479 
2480 bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2481  unsigned &SrcOpIdx1,
2482  unsigned &SrcOpIdx2) const {
2483  const MCInstrDesc &Desc = MI.getDesc();
2484  if (!Desc.isCommutable())
2485  return false;
2486 
2487  switch (MI.getOpcode()) {
2488  case X86::CMPSDrr:
2489  case X86::CMPSSrr:
2490  case X86::CMPPDrri:
2491  case X86::CMPPSrri:
2492  case X86::VCMPSDrr:
2493  case X86::VCMPSSrr:
2494  case X86::VCMPPDrri:
2495  case X86::VCMPPSrri:
2496  case X86::VCMPPDYrri:
2497  case X86::VCMPPSYrri:
2498  case X86::VCMPSDZrr:
2499  case X86::VCMPSSZrr:
2500  case X86::VCMPPDZrri:
2501  case X86::VCMPPSZrri:
2502  case X86::VCMPSHZrr:
2503  case X86::VCMPPHZrri:
2504  case X86::VCMPPHZ128rri:
2505  case X86::VCMPPHZ256rri:
2506  case X86::VCMPPDZ128rri:
2507  case X86::VCMPPSZ128rri:
2508  case X86::VCMPPDZ256rri:
2509  case X86::VCMPPSZ256rri:
2510  case X86::VCMPPDZrrik:
2511  case X86::VCMPPSZrrik:
2512  case X86::VCMPPDZ128rrik:
2513  case X86::VCMPPSZ128rrik:
2514  case X86::VCMPPDZ256rrik:
2515  case X86::VCMPPSZ256rrik: {
2516  unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2517 
2518  // Float comparison can be safely commuted for
2519  // Ordered/Unordered/Equal/NotEqual tests
2520  unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2521  switch (Imm) {
2522  default:
2523  // EVEX versions can be commuted.
2524  if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2525  break;
2526  return false;
2527  case 0x00: // EQUAL
2528  case 0x03: // UNORDERED
2529  case 0x04: // NOT EQUAL
2530  case 0x07: // ORDERED
2531  break;
2532  }
2533 
2534  // The indices of the commutable operands are 1 and 2 (or 2 and 3
2535  // when masked).
2536  // Assign them to the returned operand indices here.
2537  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2538  2 + OpOffset);
2539  }
2540  case X86::MOVSSrr:
2541  // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2542  // form an SSE4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable
2543  // since AVX implies SSE4.1.
2544  if (Subtarget.hasSSE41())
2545  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2546  return false;
2547  case X86::SHUFPDrri:
2548  // We can commute this to MOVSD.
2549  if (MI.getOperand(3).getImm() == 0x02)
2550  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2551  return false;
2552  case X86::MOVHLPSrr:
2553  case X86::UNPCKHPDrr:
2554  case X86::VMOVHLPSrr:
2555  case X86::VUNPCKHPDrr:
2556  case X86::VMOVHLPSZrr:
2557  case X86::VUNPCKHPDZ128rr:
2558  if (Subtarget.hasSSE2())
2559  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2560  return false;
2561  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2562  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2563  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2564  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2565  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2566  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2567  case X86::VPTERNLOGDZrrik:
2568  case X86::VPTERNLOGDZ128rrik:
2569  case X86::VPTERNLOGDZ256rrik:
2570  case X86::VPTERNLOGQZrrik:
2571  case X86::VPTERNLOGQZ128rrik:
2572  case X86::VPTERNLOGQZ256rrik:
2573  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2574  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2575  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2576  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2577  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2578  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2579  case X86::VPTERNLOGDZ128rmbi:
2580  case X86::VPTERNLOGDZ256rmbi:
2581  case X86::VPTERNLOGDZrmbi:
2582  case X86::VPTERNLOGQZ128rmbi:
2583  case X86::VPTERNLOGQZ256rmbi:
2584  case X86::VPTERNLOGQZrmbi:
2585  case X86::VPTERNLOGDZ128rmbikz:
2586  case X86::VPTERNLOGDZ256rmbikz:
2587  case X86::VPTERNLOGDZrmbikz:
2588  case X86::VPTERNLOGQZ128rmbikz:
2589  case X86::VPTERNLOGQZ256rmbikz:
2590  case X86::VPTERNLOGQZrmbikz:
2591  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2592  case X86::VPDPWSSDYrr:
2593  case X86::VPDPWSSDrr:
2594  case X86::VPDPWSSDSYrr:
2595  case X86::VPDPWSSDSrr:
2596  case X86::VPDPWSSDZ128r:
2597  case X86::VPDPWSSDZ128rk:
2598  case X86::VPDPWSSDZ128rkz:
2599  case X86::VPDPWSSDZ256r:
2600  case X86::VPDPWSSDZ256rk:
2601  case X86::VPDPWSSDZ256rkz:
2602  case X86::VPDPWSSDZr:
2603  case X86::VPDPWSSDZrk:
2604  case X86::VPDPWSSDZrkz:
2605  case X86::VPDPWSSDSZ128r:
2606  case X86::VPDPWSSDSZ128rk:
2607  case X86::VPDPWSSDSZ128rkz:
2608  case X86::VPDPWSSDSZ256r:
2609  case X86::VPDPWSSDSZ256rk:
2610  case X86::VPDPWSSDSZ256rkz:
2611  case X86::VPDPWSSDSZr:
2612  case X86::VPDPWSSDSZrk:
2613  case X86::VPDPWSSDSZrkz:
2614  case X86::VPMADD52HUQZ128r:
2615  case X86::VPMADD52HUQZ128rk:
2616  case X86::VPMADD52HUQZ128rkz:
2617  case X86::VPMADD52HUQZ256r:
2618  case X86::VPMADD52HUQZ256rk:
2619  case X86::VPMADD52HUQZ256rkz:
2620  case X86::VPMADD52HUQZr:
2621  case X86::VPMADD52HUQZrk:
2622  case X86::VPMADD52HUQZrkz:
2623  case X86::VPMADD52LUQZ128r:
2624  case X86::VPMADD52LUQZ128rk:
2625  case X86::VPMADD52LUQZ128rkz:
2626  case X86::VPMADD52LUQZ256r:
2627  case X86::VPMADD52LUQZ256rk:
2628  case X86::VPMADD52LUQZ256rkz:
2629  case X86::VPMADD52LUQZr:
2630  case X86::VPMADD52LUQZrk:
2631  case X86::VPMADD52LUQZrkz:
2632  case X86::VFMADDCPHZr:
2633  case X86::VFMADDCPHZrk:
2634  case X86::VFMADDCPHZrkz:
2635  case X86::VFMADDCPHZ128r:
2636  case X86::VFMADDCPHZ128rk:
2637  case X86::VFMADDCPHZ128rkz:
2638  case X86::VFMADDCPHZ256r:
2639  case X86::VFMADDCPHZ256rk:
2640  case X86::VFMADDCPHZ256rkz:
2641  case X86::VFMADDCSHZr:
2642  case X86::VFMADDCSHZrk:
2643  case X86::VFMADDCSHZrkz: {
2644  unsigned CommutableOpIdx1 = 2;
2645  unsigned CommutableOpIdx2 = 3;
2646  if (X86II::isKMasked(Desc.TSFlags)) {
2647  // Skip the mask register.
2648  ++CommutableOpIdx1;
2649  ++CommutableOpIdx2;
2650  }
2651  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2652  CommutableOpIdx1, CommutableOpIdx2))
2653  return false;
2654  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2655  !MI.getOperand(SrcOpIdx2).isReg())
2656  // No idea.
2657  return false;
2658  return true;
2659  }
2660 
2661  default:
2662  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2663  MI.getDesc().TSFlags);
2664  if (FMA3Group)
2665  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
2666  FMA3Group->isIntrinsic());
2667 
2668  // Handle masked instructions since we need to skip over the mask input
2669  // and the preserved input.
2670  if (X86II::isKMasked(Desc.TSFlags)) {
2671  // First assume that the first input is the mask operand and skip past it.
2672  unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
2673  unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
2674  // Check if the first input is tied. If it isn't, then we only need to
2675  // skip the mask operand, which we did above.
2676  if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
2677  MCOI::TIED_TO) != -1)) {
2678  // If this is zero masking instruction with a tied operand, we need to
2679  // move the first index back to the first input since this must
2680  // be a 3 input instruction and we want the first two non-mask inputs.
2681  // Otherwise this is a 2 input instruction with a preserved input and
2682  // mask, so we need to move the indices to skip one more input.
2683  if (X86II::isKMergeMasked(Desc.TSFlags)) {
2684  ++CommutableOpIdx1;
2685  ++CommutableOpIdx2;
2686  } else {
2687  --CommutableOpIdx1;
2688  }
2689  }
2690 
2691  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2692  CommutableOpIdx1, CommutableOpIdx2))
2693  return false;
2694 
2695  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2696  !MI.getOperand(SrcOpIdx2).isReg())
2697  // No idea.
2698  return false;
2699  return true;
2700  }
2701 
2702  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2703  }
2704  return false;
2705 }
2706 
2707 static bool isConvertibleLEA(MachineInstr *MI) {
2708  unsigned Opcode = MI->getOpcode();
2709  if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
2710  Opcode != X86::LEA64_32r)
2711  return false;
2712 
2713  const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
2714  const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
2715  const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
2716 
2717  if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
2718  Scale.getImm() > 1)
2719  return false;
2720 
2721  return true;
2722 }
2723 
2724 bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
2725  // Currently we're only interested in the following sequence.
2726  // r3 = lea r1, r2
2727  // r5 = add r3, r4
2728  // Both r3 and r4 are killed in the add; we hope the add instruction has the
2729  // operand order
2730  // r5 = add r4, r3
2731  // So later in X86FixupLEAs the lea instruction can be rewritten as add.
2732  unsigned Opcode = MI.getOpcode();
2733  if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
2734  return false;
2735 
2736  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2737  Register Reg1 = MI.getOperand(1).getReg();
2738  Register Reg2 = MI.getOperand(2).getReg();
2739 
2740  // Check if Reg1 comes from LEA in the same MBB.
2741  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
2742  if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2743  Commute = true;
2744  return true;
2745  }
2746  }
2747 
2748  // Check if Reg2 comes from LEA in the same MBB.
2749  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
2750  if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2751  Commute = false;
2752  return true;
2753  }
2754  }
2755 
2756  return false;
2757 }
2758 
2759 X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
2760  switch (MI.getOpcode()) {
2761  default: return X86::COND_INVALID;
2762  case X86::JCC_1:
2763  return static_cast<X86::CondCode>(
2764  MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
2765  }
2766 }
2767 
2768 /// Return condition code of a SETCC opcode.
2769 X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
2770  switch (MI.getOpcode()) {
2771  default: return X86::COND_INVALID;
2772  case X86::SETCCr: case X86::SETCCm:
2773  return static_cast<X86::CondCode>(
2774  MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
2775  }
2776 }
2777 
2778 /// Return condition code of a CMov opcode.
2779 X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
2780  switch (MI.getOpcode()) {
2781  default: return X86::COND_INVALID;
2782  case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr:
2783  case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm:
2784  return static_cast<X86::CondCode>(
2785  MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
2786  }
2787 }
2788 
2789 /// Return the inverse of the specified condition,
2790 /// e.g. turning COND_E to COND_NE.
2791 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
2792  switch (CC) {
2793  default: llvm_unreachable("Illegal condition code!");
2794  case X86::COND_E: return X86::COND_NE;
2795  case X86::COND_NE: return X86::COND_E;
2796  case X86::COND_L: return X86::COND_GE;
2797  case X86::COND_LE: return X86::COND_G;
2798  case X86::COND_G: return X86::COND_LE;
2799  case X86::COND_GE: return X86::COND_L;
2800  case X86::COND_B: return X86::COND_AE;
2801  case X86::COND_BE: return X86::COND_A;
2802  case X86::COND_A: return X86::COND_BE;
2803  case X86::COND_AE: return X86::COND_B;
2804  case X86::COND_S: return X86::COND_NS;
2805  case X86::COND_NS: return X86::COND_S;
2806  case X86::COND_P: return X86::COND_NP;
2807  case X86::COND_NP: return X86::COND_P;
2808  case X86::COND_O: return X86::COND_NO;
2809  case X86::COND_NO: return X86::COND_O;
2810  case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
2811  case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
2812  }
2813 }
2814 
2815 /// Assuming the flags are set by MI(a,b), return the condition code if we
2816 /// modify the instructions such that flags are set by MI(b,a).
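/// For example, a COND_L test of "CMP a, b" (a < b) has to become COND_G once
/// the compare is rewritten as "CMP b, a".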
2817 X86::CondCode X86::GetSwappedCondition(X86::CondCode CC) {
2818  switch (CC) {
2819  default: return X86::COND_INVALID;
2820  case X86::COND_E: return X86::COND_E;
2821  case X86::COND_NE: return X86::COND_NE;
2822  case X86::COND_L: return X86::COND_G;
2823  case X86::COND_LE: return X86::COND_GE;
2824  case X86::COND_G: return X86::COND_L;
2825  case X86::COND_GE: return X86::COND_LE;
2826  case X86::COND_B: return X86::COND_A;
2827  case X86::COND_BE: return X86::COND_AE;
2828  case X86::COND_A: return X86::COND_B;
2829  case X86::COND_AE: return X86::COND_BE;
2830  }
2831 }
2832 
2833 std::pair<X86::CondCode, bool>
2834 X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
2835  X86::CondCode CC = X86::COND_INVALID;
2836  bool NeedSwap = false;
2837  switch (Predicate) {
2838  default: break;
2839  // Floating-point Predicates
2840  case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
2841  case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
2842  case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
2843  case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
2844  case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
2845  case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
2846  case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
2847  case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
2848  case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
2849  case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
2850  case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
2851  case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
2852  case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
2853  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
2854 
2855  // Integer Predicates
2856  case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
2857  case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
2858  case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
2859  case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
2860  case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
2861  case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
2862  case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
2863  case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
2864  case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
2865  case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
2866  }
2867 
2868  return std::make_pair(CC, NeedSwap);
2869 }
2870 
2871 /// Return a setcc opcode based on whether it has a memory operand.
2872 unsigned X86::getSETOpc(bool HasMemoryOperand) {
2873  return HasMemoryOperand ? X86::SETCCm : X86::SETCCr;
2874 }
2875 
2876 /// Return a cmov opcode for the given register size in bytes, and operand type.
2877 unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
2878  switch(RegBytes) {
2879  default: llvm_unreachable("Illegal register size!");
2880  case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
2881  case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
2882  case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
2883  }
2884 }
2885 
2886 /// Get the VPCMP immediate for the given condition.
2887 unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
2888  switch (CC) {
2889  default: llvm_unreachable("Unexpected SETCC condition");
2890  case ISD::SETNE: return 4;
2891  case ISD::SETEQ: return 0;
2892  case ISD::SETULT:
2893  case ISD::SETLT: return 1;
2894  case ISD::SETUGT:
2895  case ISD::SETGT: return 6;
2896  case ISD::SETUGE:
2897  case ISD::SETGE: return 5;
2898  case ISD::SETULE:
2899  case ISD::SETLE: return 2;
2900  }
2901 }
2902 
2903 /// Get the VPCMP immediate if the operands are swapped.
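/// For example, immediate 1 (LT) tests src1 < src2; with the operands swapped
/// the equivalent predicate is NLE (immediate 6), while the symmetric
/// predicates EQ, NE, TRUE and FALSE are left unchanged.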
2904 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
2905  switch (Imm) {
2906  default: llvm_unreachable("Unreachable!");
2907  case 0x01: Imm = 0x06; break; // LT -> NLE
2908  case 0x02: Imm = 0x05; break; // LE -> NLT
2909  case 0x05: Imm = 0x02; break; // NLT -> LE
2910  case 0x06: Imm = 0x01; break; // NLE -> LT
2911  case 0x00: // EQ
2912  case 0x03: // FALSE
2913  case 0x04: // NE
2914  case 0x07: // TRUE
2915  break;
2916  }
2917 
2918  return Imm;
2919 }
2920 
2921 /// Get the VPCOM immediate if the operands are swapped.
2922 unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
2923  switch (Imm) {
2924  default: llvm_unreachable("Unreachable!");
2925  case 0x00: Imm = 0x02; break; // LT -> GT
2926  case 0x01: Imm = 0x03; break; // LE -> GE
2927  case 0x02: Imm = 0x00; break; // GT -> LT
2928  case 0x03: Imm = 0x01; break; // GE -> LE
2929  case 0x04: // EQ
2930  case 0x05: // NE
2931  case 0x06: // FALSE
2932  case 0x07: // TRUE
2933  break;
2934  }
2935 
2936  return Imm;
2937 }
2938 
2939 /// Get the VCMP immediate if the operands are swapped.
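/// For example, predicate 0x01 (LT_OS) becomes 0x0E (GT_OS) and 0x11 (LT_OQ)
/// becomes 0x1E (GT_OQ), while EQ/NE/ORD/UNORD style predicates are unchanged.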
2940 unsigned X86::getSwappedVCMPImm(unsigned Imm) {
2941  // Only need the lower 2 bits to distinguish.
2942  switch (Imm & 0x3) {
2943  default: llvm_unreachable("Unreachable!");
2944  case 0x00: case 0x03:
2945  // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
2946  break;
2947  case 0x01: case 0x02:
2948  // Need to toggle bits 3:0. Bit 4 stays the same.
2949  Imm ^= 0xf;
2950  break;
2951  }
2952 
2953  return Imm;
2954 }
2955 
2956 bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
2957  switch (MI.getOpcode()) {
2958  case X86::TCRETURNdi:
2959  case X86::TCRETURNri:
2960  case X86::TCRETURNmi:
2961  case X86::TCRETURNdi64:
2962  case X86::TCRETURNri64:
2963  case X86::TCRETURNmi64:
2964  return true;
2965  default:
2966  return false;
2967  }
2968 }
2969 
2970 bool X86InstrInfo::canMakeTailCallConditional(
2971  SmallVectorImpl<MachineOperand> &BranchCond,
2972  const MachineInstr &TailCall) const {
2973  if (TailCall.getOpcode() != X86::TCRETURNdi &&
2974  TailCall.getOpcode() != X86::TCRETURNdi64) {
2975  // Only direct calls can be done with a conditional branch.
2976  return false;
2977  }
2978 
2979  const MachineFunction *MF = TailCall.getParent()->getParent();
2980  if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
2981  // Conditional tail calls confuse the Win64 unwinder.
2982  return false;
2983  }
2984 
2985  assert(BranchCond.size() == 1);
2986  if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
2987  // Can't make a conditional tail call with this condition.
2988  return false;
2989  }
2990 
2991  const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
2992  if (X86FI->getTCReturnAddrDelta() != 0 ||
2993  TailCall.getOperand(1).getImm() != 0) {
2994  // A conditional tail call cannot do any stack adjustment.
2995  return false;
2996  }
2997 
2998  return true;
2999 }
3000 
3001 void X86InstrInfo::replaceBranchWithTailCall(
3002  MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
3003  const MachineInstr &TailCall) const {
3004  assert(canMakeTailCallConditional(BranchCond, TailCall));
3005 
3006  MachineBasicBlock::iterator I = MBB.end();
3007  while (I != MBB.begin()) {
3008  --I;
3009  if (I->isDebugInstr())
3010  continue;
3011  if (!I->isBranch())
3012  assert(0 && "Can't find the branch to replace!");
3013 
3013 
3014  X86::CondCode CC = X86::getCondFromBranch(*I);
3015  assert(BranchCond.size() == 1);
3016  if (CC != BranchCond[0].getImm())
3017  continue;
3018 
3019  break;
3020  }
3021 
3022  unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3023  : X86::TCRETURNdi64cc;
3024 
3025  auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3026  MIB->addOperand(TailCall.getOperand(0)); // Destination.
3027  MIB.addImm(0); // Stack offset (not used).
3028  MIB->addOperand(BranchCond[0]); // Condition.
3029  MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3030 
3031  // Add implicit uses and defs of all live regs potentially clobbered by the
3032  // call. This way they still appear live across the call.
3033  LivePhysRegs LiveRegs(getRegisterInfo());
3034  LiveRegs.addLiveOuts(MBB);
3035  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
3036  LiveRegs.stepForward(*MIB, Clobbers);
3037  for (const auto &C : Clobbers) {
3038  MIB.addReg(C.first, RegState::Implicit);
3039  MIB.addReg(C.first, RegState::Implicit | RegState::Define);
3040  }
3041 
3042  I->eraseFromParent();
3043 }
3044 
3045 // Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3046 // not be a fallthrough MBB now due to layout changes). Return nullptr if the
3047 // fallthrough MBB cannot be identified.
3048 static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
3049  MachineBasicBlock *TBB) {
3050  // Look for non-EHPad successors other than TBB. If we find exactly one, it
3051  // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3052  // and fallthrough MBB. If we find more than one, we cannot identify the
3053  // fallthrough MBB and should return nullptr.
3054  MachineBasicBlock *FallthroughBB = nullptr;
3055  for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
3056  if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
3057  continue;
3058  // Return a nullptr if we found more than one fallthrough successor.
3059  if (FallthroughBB && FallthroughBB != TBB)
3060  return nullptr;
3061  FallthroughBB = *SI;
3062  }
3063  return FallthroughBB;
3064 }
3065 
3066 bool X86InstrInfo::AnalyzeBranchImpl(
3067  MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
3068  SmallVectorImpl<MachineOperand> &Cond,
3069  SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3070 
3071  // Start from the bottom of the block and work up, examining the
3072  // terminator instructions.
3073  MachineBasicBlock::iterator I = MBB.end();
3074  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3075  while (I != MBB.begin()) {
3076  --I;
3077  if (I->isDebugInstr())
3078  continue;
3079 
3080  // Working from the bottom, when we see a non-terminator instruction, we're
3081  // done.
3082  if (!isUnpredicatedTerminator(*I))
3083  break;
3084 
3085  // A terminator that isn't a branch can't easily be handled by this
3086  // analysis.
3087  if (!I->isBranch())
3088  return true;
3089 
3090  // Handle unconditional branches.
3091  if (I->getOpcode() == X86::JMP_1) {
3092  UnCondBrIter = I;
3093 
3094  if (!AllowModify) {
3095  TBB = I->getOperand(0).getMBB();
3096  continue;
3097  }
3098 
3099  // If the block has any instructions after a JMP, delete them.
3100  while (std::next(I) != MBB.end())
3101  std::next(I)->eraseFromParent();
3102 
3103  Cond.clear();
3104  FBB = nullptr;
3105 
3106  // Delete the JMP if it's equivalent to a fall-through.
3107  if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3108  TBB = nullptr;
3109  I->eraseFromParent();
3110  I = MBB.end();
3111  UnCondBrIter = MBB.end();
3112  continue;
3113  }
3114 
3115  // TBB is used to indicate the unconditional destination.
3116  TBB = I->getOperand(0).getMBB();
3117  continue;
3118  }
3119 
3120  // Handle conditional branches.
3121  X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3122  if (BranchCode == X86::COND_INVALID)
3123  return true; // Can't handle indirect branch.
3124 
3125  // In practice we should never have an undef EFLAGS operand; if we do,
3126  // abort here, as we are not prepared to preserve the flags.
3127  if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
3128  return true;
3129 
3130  // Working from the bottom, handle the first conditional branch.
3131  if (Cond.empty()) {
3132  MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
3133  if (AllowModify && UnCondBrIter != MBB.end() &&
3134  MBB.isLayoutSuccessor(TargetBB)) {
3135  // If we can modify the code and it ends in something like:
3136  //
3137  // jCC L1
3138  // jmp L2
3139  // L1:
3140  // ...
3141  // L2:
3142  //
3143  // Then we can change this to:
3144  //
3145  // jnCC L2
3146  // L1:
3147  // ...
3148  // L2:
3149  //
3150  // Which is a bit more efficient.
3151  // We conditionally jump to the fall-through block.
3152  BranchCode = GetOppositeBranchCondition(BranchCode);
3153  MachineBasicBlock::iterator OldInst = I;
3154 
3155  BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1))
3156  .addMBB(UnCondBrIter->getOperand(0).getMBB())
3157  .addImm(BranchCode);
3158  BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
3159  .addMBB(TargetBB);
3160 
3161  OldInst->eraseFromParent();
3162  UnCondBrIter->eraseFromParent();
3163 
3164  // Restart the analysis.
3165  UnCondBrIter = MBB.end();
3166  I = MBB.end();
3167  continue;
3168  }
3169 
3170  FBB = TBB;
3171  TBB = I->getOperand(0).getMBB();
3172  Cond.push_back(MachineOperand::CreateImm(BranchCode));
3173  CondBranches.push_back(&*I);
3174  continue;
3175  }
3176 
3177  // Handle subsequent conditional branches. Only handle the case where all
3178  // conditional branches branch to the same destination and their condition
3179  // opcodes fit one of the special multi-branch idioms.
3180  assert(Cond.size() == 1);
3181  assert(TBB);
3182 
3183  // If the conditions are the same, we can leave them alone.
3184  X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3185  auto NewTBB = I->getOperand(0).getMBB();
3186  if (OldBranchCode == BranchCode && TBB == NewTBB)
3187  continue;
3188 
3189  // If they differ, see if they fit one of the known patterns. Theoretically,
3190  // we could handle more patterns here, but we shouldn't expect to see them
3191  // if instruction selection has done a reasonable job.
3192  if (TBB == NewTBB &&
3193  ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3194  (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3195  BranchCode = X86::COND_NE_OR_P;
3196  } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3197  (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3198  if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3199  return true;
3200 
3201  // X86::COND_E_AND_NP usually has two different branch destinations.
3202  //
3203  // JP B1
3204  // JE B2
3205  // JMP B1
3206  // B1:
3207  // B2:
3208  //
3209  // Here this condition branches to B2 only if NP && E. It has another
3210  // equivalent form:
3211  //
3212  // JNE B1
3213  // JNP B2
3214  // JMP B1
3215  // B1:
3216  // B2:
3217  //
3218  // Similarly it branches to B2 only if E && NP. That is why this condition
3219  // is named with COND_E_AND_NP.
3220  BranchCode = X86::COND_E_AND_NP;
3221  } else
3222  return true;
3223 
3224  // Update the MachineOperand.
3225  Cond[0].setImm(BranchCode);
3226  CondBranches.push_back(&*I);
3227  }
3228 
3229  return false;
3230 }
3231 
3232 bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
3233  MachineBasicBlock *&TBB,
3234  MachineBasicBlock *&FBB,
3235  SmallVectorImpl<MachineOperand> &Cond,
3236  bool AllowModify) const {
3237  SmallVector<MachineInstr *, 4> CondBranches;
3238  return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3239 }
3240 
3241 bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
3242  MachineBranchPredicate &MBP,
3243  bool AllowModify) const {
3244  using namespace std::placeholders;
3245 
3246  SmallVector<MachineOperand, 4> Cond;
3247  SmallVector<MachineInstr *, 4> CondBranches;
3248  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
3249  AllowModify))
3250  return true;
3251 
3252  if (Cond.size() != 1)
3253  return true;
3254 
3255  assert(MBP.TrueDest && "expected!");
3256 
3257  if (!MBP.FalseDest)
3258  MBP.FalseDest = MBB.getNextNode();
3259 
3260  const TargetRegisterInfo *TRI = &getRegisterInfo();
3261 
3262  MachineInstr *ConditionDef = nullptr;
3263  bool SingleUseCondition = true;
3264 
3265  for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
3266  if (I->modifiesRegister(X86::EFLAGS, TRI)) {
3267  ConditionDef = &*I;
3268  break;
3269  }
3270 
3271  if (I->readsRegister(X86::EFLAGS, TRI))
3272  SingleUseCondition = false;
3273  }
3274 
3275  if (!ConditionDef)
3276  return true;
3277 
3278  if (SingleUseCondition) {
3279  for (auto *Succ : MBB.successors())
3280  if (Succ->isLiveIn(X86::EFLAGS))
3281  SingleUseCondition = false;
3282  }
3283 
3284  MBP.ConditionDef = ConditionDef;
3285  MBP.SingleUseCondition = SingleUseCondition;
3286 
3287  // Currently we only recognize the simple pattern:
3288  //
3289  // test %reg, %reg
3290  // je %label
3291  //
3292  const unsigned TestOpcode =
3293  Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
3294 
3295  if (ConditionDef->getOpcode() == TestOpcode &&
3296  ConditionDef->getNumOperands() == 3 &&
3297  ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
3298  (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
3299  MBP.LHS = ConditionDef->getOperand(0);
3300  MBP.RHS = MachineOperand::CreateImm(0);
3301  MBP.Predicate = Cond[0].getImm() == X86::COND_NE
3302  ? MachineBranchPredicate::PRED_NE
3303  : MachineBranchPredicate::PRED_EQ;
3304  return false;
3305  }
3306 
3307  return true;
3308 }
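// Illustrative example (assumed MIR, for exposition only): for a block ending
// in
//   TEST64rr %cmp, %cmp, implicit-def $eflags
//   JCC_1 %bb.exit, COND_E, implicit $eflags
// analyzeBranchPredicate fills in LHS = %cmp, RHS = immediate 0 and an
// equality predicate, with TrueDest/FalseDest coming from the branch analysis
// performed above.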
3309 
3310 unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
3311  int *BytesRemoved) const {
3312  assert(!BytesRemoved && "code size not handled");
3313 
3314  MachineBasicBlock::iterator I = MBB.end();
3315  unsigned Count = 0;
3316 
3317  while (I != MBB.begin()) {
3318  --I;
3319  if (I->isDebugInstr())
3320  continue;
3321  if (I->getOpcode() != X86::JMP_1 &&
3322  X86::getCondFromBranch(*I) == X86::COND_INVALID)
3323  break;
3324  // Remove the branch.
3325  I->eraseFromParent();
3326  I = MBB.end();
3327  ++Count;
3328  }
3329 
3330  return Count;
3331 }
3332 
3333 unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
3334  MachineBasicBlock *TBB,
3335  MachineBasicBlock *FBB,
3336  ArrayRef<MachineOperand> Cond,
3337  const DebugLoc &DL,
3338  int *BytesAdded) const {
3339  // Shouldn't be a fall through.
3340  assert(TBB && "insertBranch must not be told to insert a fallthrough");
3341  assert((Cond.size() == 1 || Cond.size() == 0) &&
3342  "X86 branch conditions have one component!");
3343  assert(!BytesAdded && "code size not handled");
3344 
3345  if (Cond.empty()) {
3346  // Unconditional branch?
3347  assert(!FBB && "Unconditional branch with multiple successors!");
3348  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
3349  return 1;
3350  }
3351 
3352  // If FBB is null, it is implied to be a fall-through block.
3353  bool FallThru = FBB == nullptr;
3354 
3355  // Conditional branch.
3356  unsigned Count = 0;
3357  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
3358  switch (CC) {
3359  case X86::COND_NE_OR_P:
3360  // Synthesize NE_OR_P with two branches.
3361  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
3362  ++Count;
3363  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
3364  ++Count;
3365  break;
3366  case X86::COND_E_AND_NP:
3367  // Use the next block of MBB as FBB if it is null.
3368  if (FBB == nullptr) {
3369  FBB = getFallThroughMBB(&MBB, TBB);
3370  assert(FBB && "MBB cannot be the last block in function when the false "
3371  "body is a fall-through.");
3372  }
3373  // Synthesize COND_E_AND_NP with two branches.
3374  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
3375  ++Count;
3376  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
3377  ++Count;
3378  break;
3379  default: {
3380  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
3381  ++Count;
3382  }
3383  }
3384  if (!FallThru) {
3385  // Two-way Conditional branch. Insert the second branch.
3386  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
3387  ++Count;
3388  }
3389  return Count;
3390 }
3391 
3392 bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3393  ArrayRef<MachineOperand> Cond,
3394  Register DstReg, Register TrueReg,
3395  Register FalseReg, int &CondCycles,
3396  int &TrueCycles, int &FalseCycles) const {
3397  // Not all subtargets have cmov instructions.
3398  if (!Subtarget.hasCMov())
3399  return false;
3400  if (Cond.size() != 1)
3401  return false;
3402  // We cannot do the composite conditions, at least not in SSA form.
3403  if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
3404  return false;
3405 
3406  // Check register classes.
3407  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3408  const TargetRegisterClass *RC =
3409  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
3410  if (!RC)
3411  return false;
3412 
3413  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
3414  if (X86::GR16RegClass.hasSubClassEq(RC) ||
3415  X86::GR32RegClass.hasSubClassEq(RC) ||
3416  X86::GR64RegClass.hasSubClassEq(RC)) {
3417  // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
3418  // Bridge. Probably Ivy Bridge as well.
3419  CondCycles = 2;
3420  TrueCycles = 2;
3421  FalseCycles = 2;
3422  return true;
3423  }
3424 
3425  // Can't do vectors.
3426  return false;
3427 }
3428 
3429 void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
3430  MachineBasicBlock::iterator I,
3431  const DebugLoc &DL, Register DstReg,
3432  ArrayRef<MachineOperand> Cond, Register TrueReg,
3433  Register FalseReg) const {
3434  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3435  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
3436  const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
3437  assert(Cond.size() == 1 && "Invalid Cond array");
3438  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
3439  false /*HasMemoryOperand*/);
3440  BuildMI(MBB, I, DL, get(Opc), DstReg)
3441  .addReg(FalseReg)
3442  .addReg(TrueReg)
3443  .addImm(Cond[0].getImm());
3444 }
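// Sketch of the emitted select (hypothetical operands): for a GR32 DstReg,
// insertSelect produces
//   %dst = CMOV32rr %false, %true, <cond>, implicit $eflags
// where the tied first operand supplies the result when the condition does
// not hold and the second operand is copied in when it does.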
3445 
3446 /// Test if the given register is a physical H register.
3447 static bool isHReg(unsigned Reg) {
3448  return X86::GR8_ABCD_HRegClass.contains(Reg);
3449 }
3450 
3451 // Try and copy between VR128/VR64 and GR64 registers.
3452 static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
3453  const X86Subtarget &Subtarget) {
3454  bool HasAVX = Subtarget.hasAVX();
3455  bool HasAVX512 = Subtarget.hasAVX512();
3456 
3457  // SrcReg(MaskReg) -> DestReg(GR64)
3458  // SrcReg(MaskReg) -> DestReg(GR32)
3459 
3460  // All KMASK RegClasses hold the same k registers, so we can test against any of them.
3461  if (X86::VK16RegClass.contains(SrcReg)) {
3462  if (X86::GR64RegClass.contains(DestReg)) {
3463  assert(Subtarget.hasBWI());
3464  return X86::KMOVQrk;
3465  }
3466  if (X86::GR32RegClass.contains(DestReg))
3467  return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
3468  }
3469 
3470  // SrcReg(GR64) -> DestReg(MaskReg)
3471  // SrcReg(GR32) -> DestReg(MaskReg)
3472 
3473  // All KMASK RegClasses hold the same k registers, so we can test against any of them.
3474  if (X86::VK16RegClass.contains(DestReg)) {
3475  if (X86::GR64RegClass.contains(SrcReg)) {
3476  assert(Subtarget.hasBWI());
3477  return X86::KMOVQkr;
3478  }
3479  if (X86::GR32RegClass.contains(SrcReg))
3480  return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
3481  }
3482 
3483 
3484  // SrcReg(VR128) -> DestReg(GR64)
3485  // SrcReg(VR64) -> DestReg(GR64)
3486  // SrcReg(GR64) -> DestReg(VR128)
3487  // SrcReg(GR64) -> DestReg(VR64)
3488 
3489  if (X86::GR64RegClass.contains(DestReg)) {
3490  if (X86::VR128XRegClass.contains(SrcReg))
3491  // Copy from a VR128 register to a GR64 register.
3492  return HasAVX512 ? X86::VMOVPQIto64Zrr :
3493  HasAVX ? X86::VMOVPQIto64rr :
3494  X86::MOVPQIto64rr;
3495  if (X86::VR64RegClass.contains(SrcReg))
3496  // Copy from a VR64 register to a GR64 register.
3497  return X86::MMX_MOVD64from64rr;
3498  } else if (X86::GR64RegClass.contains(SrcReg)) {
3499  // Copy from a GR64 register to a VR128 register.
3500  if (X86::VR128XRegClass.contains(DestReg))
3501  return HasAVX512 ? X86::VMOV64toPQIZrr :
3502  HasAVX ? X86::VMOV64toPQIrr :
3503  X86::MOV64toPQIrr;
3504  // Copy from a GR64 register to a VR64 register.
3505  if (X86::VR64RegClass.contains(DestReg))
3506  return X86::MMX_MOVD64to64rr;
3507  }
3508 
3509  // SrcReg(VR128) -> DestReg(GR32)
3510  // SrcReg(GR32) -> DestReg(VR128)
3511 
3512  if (X86::GR32RegClass.contains(DestReg) &&
3513  X86::VR128XRegClass.contains(SrcReg))
3514  // Copy from a VR128 register to a GR32 register.
3515  return HasAVX512 ? X86::VMOVPDI2DIZrr :
3516  HasAVX ? X86::VMOVPDI2DIrr :
3517  X86::MOVPDI2DIrr;
3518 
3519  if (X86::VR128XRegClass.contains(DestReg) &&
3520  X86::GR32RegClass.contains(SrcReg))
3521  // Copy from a GR32 register to a VR128 register.
3522  return HasAVX512 ? X86::VMOVDI2PDIZrr :
3523  HasAVX ? X86::VMOVDI2PDIrr :
3524  X86::MOVDI2PDIrr;
3525  return 0;
3526 }
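// Example uses of the mapping above in CopyToFromAsymmetricReg (illustrative):
// a %k1 -> %eax copy on an AVX-512BW target selects KMOVDrk, while a
// %rax -> %xmm0 copy selects VMOV64toPQIZrr with AVX-512, VMOV64toPQIrr with
// plain AVX, and MOV64toPQIrr on SSE-only targets.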
3527 
3528 void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
3529  MachineBasicBlock::iterator MI,
3530  const DebugLoc &DL, MCRegister DestReg,
3531  MCRegister SrcReg, bool KillSrc) const {
3532  // First deal with the normal symmetric copies.
3533  bool HasAVX = Subtarget.hasAVX();
3534  bool HasVLX = Subtarget.hasVLX();
3535  unsigned Opc = 0;
3536  if (X86::GR64RegClass.contains(DestReg, SrcReg))
3537  Opc = X86::MOV64rr;
3538  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
3539  Opc = X86::MOV32rr;
3540  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
3541  Opc = X86::MOV16rr;
3542  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
3543  // Copying to or from a physical H register on x86-64 requires a NOREX
3544  // move. Otherwise use a normal move.
3545  if ((isHReg(DestReg) || isHReg(SrcReg)) &&
3546  Subtarget.is64Bit()) {
3547  Opc = X86::MOV8rr_NOREX;
3548  // Both operands must be encodable without an REX prefix.
3549  assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
3550  "8-bit H register can not be copied outside GR8_NOREX");
3551  } else
3552  Opc = X86::MOV8rr;
3553  }
3554  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
3555  Opc = X86::MMX_MOVQ64rr;
3556  else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
3557  if (HasVLX)
3558  Opc = X86::VMOVAPSZ128rr;
3559  else if (X86::VR128RegClass.contains(DestReg, SrcReg))
3560  Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
3561  else {
3562  // If this is an extended register and we don't have VLX, we need to use a
3563  // 512-bit move.
3564  Opc = X86::VMOVAPSZrr;
3565  const TargetRegisterInfo *TRI = &getRegisterInfo();
3566  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
3567  &X86::VR512RegClass);
3568  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
3569  &X86::VR512RegClass);
3570  }
3571  } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
3572  if (HasVLX)
3573  Opc = X86::VMOVAPSZ256rr;
3574  else if (X86::VR256RegClass.contains(DestReg, SrcReg))
3575  Opc = X86::VMOVAPSYrr;
3576  else {
3577  // If this is an extended register and we don't have VLX, we need to use a
3578  // 512-bit move.
3579  Opc = X86::VMOVAPSZrr;
3580  const TargetRegisterInfo *TRI = &getRegisterInfo();
3581  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
3582  &X86::VR512RegClass);
3583  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
3584  &X86::VR512RegClass);
3585  }
3586  } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
3587  Opc = X86::VMOVAPSZrr;
3588  // All KMASK RegClasses hold the same k registers, so we can test against any of them.
3589  else if (X86::VK16RegClass.contains(DestReg, SrcReg))
3590  Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
3591  if (!Opc)
3592  Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
3593 
3594  if (Opc) {
3595  BuildMI(MBB, MI, DL, get(Opc), DestReg)
3596  .addReg(SrcReg, getKillRegState(KillSrc));
3597  return;
3598  }
3599 
3600  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
3601  // FIXME: We use a fatal error here because historically LLVM has tried
3602  // to lower some of these physreg copies and we want to ensure we get
3603  // reasonable bug reports if someone encounters a case no other testing
3604  // found. This path should be removed after the LLVM 7 release.
3605  report_fatal_error("Unable to copy EFLAGS physical register!");
3606  }
3607 
3608  LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
3609  << RI.getName(DestReg) << '\n');
3610  report_fatal_error("Cannot emit physreg copy instruction");
3611 }
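// Example (illustrative): without AVX512VL a copy between the extended
// registers %xmm16 and %xmm17 cannot use a 128-bit VEX move, so copyPhysReg
// rewrites it as
//   %zmm17 = VMOVAPSZrr %zmm16
// using the containing 512-bit super-registers.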
3612 
3613 Optional<DestSourcePair>
3614 X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
3615  if (MI.isMoveReg())
3616  return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
3617  return None;
3618 }
3619 
3620 static unsigned getLoadStoreRegOpcode(Register Reg,
3621  const TargetRegisterClass *RC,
3622  bool IsStackAligned,
3623  const X86Subtarget &STI, bool load) {
3624  bool HasAVX = STI.hasAVX();
3625  bool HasAVX512 = STI.hasAVX512();
3626  bool HasVLX = STI.hasVLX();
3627 
3628  switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
3629  default:
3630  llvm_unreachable("Unknown spill size");
3631  case 1:
3632  assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
3633  if (STI.is64Bit())
3634  // Copying to or from a physical H register on x86-64 requires a NOREX
3635  // move. Otherwise use a normal move.
3636  if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
3637  return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
3638  return load ? X86::MOV8rm : X86::MOV8mr;
3639  case 2:
3640  if (X86::VK16RegClass.hasSubClassEq(RC))
3641  return load ? X86::KMOVWkm : X86::KMOVWmk;
3642  if (X86::FR16XRegClass.hasSubClassEq(RC)) {
3643  assert(STI.hasFP16());
3644  return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
3645  }
3646  assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
3647  return load ? X86::MOV16rm : X86::MOV16mr;
3648  case 4:
3649  if (X86::GR32RegClass.hasSubClassEq(RC))
3650  return load ? X86::MOV32rm : X86::MOV32mr;
3651  if (X86::FR32XRegClass.hasSubClassEq(RC))
3652  return load ?
3653  (HasAVX512 ? X86::VMOVSSZrm_alt :
3654  HasAVX ? X86::VMOVSSrm_alt :
3655  X86::MOVSSrm_alt) :
3656  (HasAVX512 ? X86::VMOVSSZmr :
3657  HasAVX ? X86::VMOVSSmr :
3658  X86::MOVSSmr);
3659  if (X86::RFP32RegClass.hasSubClassEq(RC))
3660  return load ? X86::LD_Fp32m : X86::ST_Fp32m;
3661  if (X86::VK32RegClass.hasSubClassEq(RC)) {
3662  assert(STI.hasBWI() && "KMOVD requires BWI");
3663  return load ? X86::KMOVDkm : X86::KMOVDmk;
3664  }
3665  // All of these mask pair classes have the same spill size, so the same kind
3666  // of kmov instructions can be used with all of them.
3667  if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
3668  X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
3669  X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
3670  X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
3671  X86::VK16PAIRRegClass.hasSubClassEq(RC))
3672  return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
3673  llvm_unreachable("Unknown 4-byte regclass");
3674  case 8:
3675  if (X86::GR64RegClass.hasSubClassEq(RC))
3676  return load ? X86::MOV64rm : X86::MOV64mr;
3677  if (X86::FR64XRegClass.hasSubClassEq(RC))
3678  return load ?
3679  (HasAVX512 ? X86::VMOVSDZrm_alt :
3680  HasAVX ? X86::VMOVSDrm_alt :
3681  X86::MOVSDrm_alt) :
3682  (HasAVX512 ? X86::VMOVSDZmr :
3683  HasAVX ? X86::VMOVSDmr :
3684  X86::MOVSDmr);
3685  if (X86::VR64RegClass.hasSubClassEq(RC))
3686  return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
3687  if (X86::RFP64RegClass.hasSubClassEq(RC))
3688  return load ? X86::LD_Fp64m : X86::ST_Fp64m;
3689  if (X86::VK64RegClass.hasSubClassEq(RC)) {
3690  assert(STI.hasBWI() && "KMOVQ requires BWI");
3691  return load ? X86::KMOVQkm : X86::KMOVQmk;
3692  }
3693  llvm_unreachable("Unknown 8-byte regclass");
3694  case 10:
3695  assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
3696  return load ? X86::LD_Fp80m : X86::ST_FpP80m;
3697  case 16: {
3698  if (X86::VR128XRegClass.hasSubClassEq(RC)) {
3699  // If stack is realigned we can use aligned stores.
3700  if (IsStackAligned)
3701  return load ?
3702  (HasVLX ? X86::VMOVAPSZ128rm :
3703  HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
3704  HasAVX ? X86::VMOVAPSrm :
3705  X86::MOVAPSrm):
3706  (HasVLX ? X86::VMOVAPSZ128mr :
3707  HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
3708  HasAVX ? X86::VMOVAPSmr :
3709  X86::MOVAPSmr);
3710  else
3711  return load ?
3712  (HasVLX ? X86::VMOVUPSZ128rm :
3713  HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
3714  HasAVX ? X86::VMOVUPSrm :
3715  X86::MOVUPSrm):
3716  (HasVLX ? X86::VMOVUPSZ128mr :
3717  HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
3718  HasAVX ? X86::VMOVUPSmr :
3719  X86::MOVUPSmr);
3720  }
3721  llvm_unreachable("Unknown 16-byte regclass");
3722  }
3723  case 32:
3724  assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
3725  // If stack is realigned we can use aligned stores.
3726  if (IsStackAligned)
3727  return load ?
3728  (HasVLX ? X86::VMOVAPSZ256rm :
3729  HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
3730  X86::VMOVAPSYrm) :
3731  (HasVLX ? X86::VMOVAPSZ256mr :
3732  HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
3733  X86::VMOVAPSYmr);
3734  else
3735  return load ?
3736  (HasVLX ? X86::VMOVUPSZ256rm :
3737  HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
3738  X86::VMOVUPSYrm) :
3739  (HasVLX ? X86::VMOVUPSZ256mr :
3740  HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
3741  X86::VMOVUPSYmr);
3742  case 64:
3743  assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
3744  assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
3745  if (IsStackAligned)
3746  return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
3747  else
3748  return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
3749  }
3750 }
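// Worked example (assumed subtarget features): spilling a YMM register on an
// AVX2-only target to a 32-byte-aligned slot makes getLoadStoreRegOpcode pick
// VMOVAPSYmr for the store and VMOVAPSYrm for the reload; without the
// alignment guarantee the unaligned VMOVUPSY* forms are returned instead.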
3751 
3752 Optional<ExtAddrMode>
3753 X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3754  const TargetRegisterInfo *TRI) const {
3755  const MCInstrDesc &Desc = MemI.getDesc();
3756  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3757  if (MemRefBegin < 0)
3758  return None;
3759 
3760  MemRefBegin += X86II::getOperandBias(Desc);
3761 
3762  auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
3763  if (!BaseOp.isReg()) // Can be an MO_FrameIndex
3764  return None;
3765 
3766  const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
3767  // Displacement can be symbolic
3768  if (!DispMO.isImm())
3769  return None;
3770 
3771  ExtAddrMode AM;
3772  AM.BaseReg = BaseOp.getReg();
3773  AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
3774  AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
3775  AM.Displacement = DispMO.getImm();
3776  return AM;
3777 }
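// Illustration (hypothetical operands): for a load such as
//   %eax = MOV32rm %rdi, 4, %rcx, 20, $noreg
// getAddrModeFromMemoryOp returns an ExtAddrMode with BaseReg = %rdi,
// ScaledReg = %rcx, Scale = 4 and Displacement = 20, i.e. the address
// 20(%rdi,%rcx,4).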
3778 
3779 bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
3780  const Register Reg,
3781  int64_t &ImmVal) const {
3782  if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
3783  return false;
3784  // Mov Src can be a global address.
3785  if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
3786  return false;
3787  ImmVal = MI.getOperand(1).getImm();
3788  return true;
3789 }
3790 
3791 bool X86InstrInfo::preservesZeroValueInReg(
3792  const MachineInstr *MI, const Register NullValueReg,
3793  const TargetRegisterInfo *TRI) const {
3794  if (!MI->modifiesRegister(NullValueReg, TRI))
3795  return true;
3796  switch (MI->getOpcode()) {
3797  // Shifting a null value right or left by an immediate leaves it null, i.e.
3798  // rax = shl rax, X.
3799  case X86::SHR64ri:
3800  case X86::SHR32ri:
3801  case X86::SHL64ri:
3802  case X86::SHL32ri:
3803  assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
3804  "expected for shift opcode!");
3805  return MI->getOperand(0).getReg() == NullValueReg &&
3806  MI->getOperand(1).getReg() == NullValueReg;
3807  // Zero extend of a sub-reg of NullValueReg into itself does not change the
3808  // null value.
3809  case X86::MOV32rr:
3810  return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
3811  return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
3812  });
3813  default:
3814  return false;
3815  }
3816  llvm_unreachable("Should be handled above!");
3817 }
3818 
3819 bool X86InstrInfo::getMemOperandsWithOffsetWidth(
3820  const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
3821  int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
3822  const TargetRegisterInfo *TRI) const {
3823  const MCInstrDesc &Desc = MemOp.getDesc();
3824  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3825  if (MemRefBegin < 0)
3826  return false;
3827 
3828  MemRefBegin += X86II::getOperandBias(Desc);
3829 
3830  const MachineOperand *BaseOp =
3831  &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
3832  if (!BaseOp->isReg()) // Can be an MO_FrameIndex
3833  return false;
3834 
3835  if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
3836  return false;
3837 
3838  if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
3839  X86::NoRegister)
3840  return false;
3841 
3842  const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
3843 
3844  // Displacement can be symbolic
3845  if (!DispMO.isImm())
3846  return false;
3847 
3848  Offset = DispMO.getImm();
3849 
3850  if (!BaseOp->isReg())
3851  return false;
3852 
3853  OffsetIsScalable = false;
3854  // FIXME: Relying on memoperands() may not be the right thing to do here. Check
3855  // with X86 maintainers, and fix it accordingly. For now, it is ok, since
3856  // there is no use of `Width` for X86 back-end at the moment.
3857  Width =
3858  !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
3859  BaseOps.push_back(BaseOp);
3860  return true;
3861 }
3862 
3863 static unsigned getStoreRegOpcode(Register SrcReg,
3864  const TargetRegisterClass *RC,
3865  bool IsStackAligned,
3866  const X86Subtarget &STI) {
3867  return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
3868 }
3869 
3870 static unsigned getLoadRegOpcode(Register DestReg,
3871  const TargetRegisterClass *RC,
3872  bool IsStackAligned, const X86Subtarget &STI) {
3873  return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
3874 }
3875 
3876 void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
3877  MachineBasicBlock::iterator MI,
3878  Register SrcReg, bool isKill, int FrameIdx,
3879  const TargetRegisterClass *RC,
3880  const TargetRegisterInfo *TRI) const {
3881  const MachineFunction &MF = *MBB.getParent();
3882  const MachineFrameInfo &MFI = MF.getFrameInfo();
3883  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
3884  "Stack slot too small for store");
3885  if (RC->getID() == X86::TILERegClassID) {
3886  unsigned Opc = X86::TILESTORED;
3887  // tilestored %tmm, (%sp, %idx)
3888  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3889  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3890  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3891  MachineInstr *NewMI =
3892  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3893  .addReg(SrcReg, getKillRegState(isKill));
3894  MachineOperand &MO = NewMI->getOperand(2);
3895  MO.setReg(VirtReg);
3896  MO.setIsKill(true);
3897  } else {
3898  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3899  bool isAligned =
3900  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3901  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3902  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
3903  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3904  .addReg(SrcReg, getKillRegState(isKill));
3905  }
3906 }
3907 
3908 void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
3909  MachineBasicBlock::iterator MI,
3910  Register DestReg, int FrameIdx,
3911  const TargetRegisterClass *RC,
3912  const TargetRegisterInfo *TRI) const {
3913  if (RC->getID() == X86::TILERegClassID) {
3914  unsigned Opc = X86::TILELOADD;
3915  // tileloadd (%sp, %idx), %tmm
3916  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3917  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3918  MachineInstr *NewMI =
3919  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3920  NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3921  FrameIdx);
3922  MachineOperand &MO = NewMI->getOperand(3);
3923  MO.setReg(VirtReg);
3924  MO.setIsKill(true);
3925  } else {
3926  const MachineFunction &MF = *MBB.getParent();
3927  const MachineFrameInfo &MFI = MF.getFrameInfo();
3928  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3929  bool isAligned =
3930  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3931  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3932  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
3933  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3934  FrameIdx);
3935  }
3936 }
3937 
3938 bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
3939  Register &SrcReg2, int64_t &CmpMask,
3940  int64_t &CmpValue) const {
3941  switch (MI.getOpcode()) {
3942  default: break;
3943  case X86::CMP64ri32:
3944  case X86::CMP64ri8:
3945  case X86::CMP32ri:
3946  case X86::CMP32ri8:
3947  case X86::CMP16ri:
3948  case X86::CMP16ri8:
3949  case X86::CMP8ri:
3950  SrcReg = MI.getOperand(0).getReg();
3951  SrcReg2 = 0;
3952  if (MI.getOperand(1).isImm()) {
3953  CmpMask = ~0;
3954  CmpValue = MI.getOperand(1).getImm();
3955  } else {
3956  CmpMask = CmpValue = 0;
3957  }
3958  return true;
3959  // A SUB can be used to perform comparison.
3960  case X86::SUB64rm:
3961  case X86::SUB32rm:
3962  case X86::SUB16rm:
3963  case X86::SUB8rm:
3964  SrcReg = MI.getOperand(1).getReg();
3965  SrcReg2 = 0;
3966  CmpMask = 0;
3967  CmpValue = 0;
3968  return true;
3969  case X86::SUB64rr:
3970  case X86::SUB32rr:
3971  case X86::SUB16rr:
3972  case X86::SUB8rr:
3973  SrcReg = MI.getOperand(1).getReg();
3974  SrcReg2 = MI.getOperand(2).getReg();
3975  CmpMask = 0;
3976  CmpValue = 0;
3977  return true;
3978  case X86::SUB64ri32:
3979  case X86::SUB64ri8:
3980  case X86::SUB32ri:
3981  case X86::SUB32ri8:
3982  case X86::SUB16ri:
3983  case X86::SUB16ri8:
3984  case X86::SUB8ri:
3985  SrcReg = MI.getOperand(1).getReg();
3986  SrcReg2 = 0;
3987  if (MI.getOperand(2).isImm()) {
3988  CmpMask = ~0;
3989  CmpValue = MI.getOperand(2).getImm();
3990  } else {
3991  CmpMask = CmpValue = 0;
3992  }
3993  return true;
3994  case X86::CMP64rr:
3995  case X86::CMP32rr:
3996  case X86::CMP16rr:
3997  case X86::CMP8rr:
3998  SrcReg = MI.getOperand(0).getReg();
3999  SrcReg2 = MI.getOperand(1).getReg();
4000  CmpMask = 0;
4001  CmpValue = 0;
4002  return true;
4003  case X86::TEST8rr:
4004  case X86::TEST16rr:
4005  case X86::TEST32rr:
4006  case X86::TEST64rr:
4007  SrcReg = MI.getOperand(0).getReg();
4008  if (MI.getOperand(1).getReg() != SrcReg)
4009  return false;
4010  // Compare against zero.
4011  SrcReg2 = 0;
4012  CmpMask = ~0;
4013  CmpValue = 0;
4014  return true;
4015  }
4016  return false;
4017 }
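// Example results (illustrative): "CMP32ri %eax, 42" is reported by
// analyzeCompare with SrcReg = %eax, SrcReg2 = 0, CmpMask = ~0 and
// CmpValue = 42, while "SUB32rr %a, %b" is reported with its two source
// registers and CmpMask = CmpValue = 0 because no immediate is involved.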
4018 
4019 /// Check whether the first instruction, whose only
4020 /// purpose is to update flags, can be made redundant.
4021 /// CMPrr can be made redundant by SUBrr if the operands are the same.
4022 /// This function can be extended later on.
4023 /// SrcReg, SrcReg2: register operands for FlagI.
4024 /// ImmValue: immediate for FlagI if it takes an immediate.
4025 inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
4026  Register SrcReg, Register SrcReg2,
4027  int64_t ImmMask, int64_t ImmValue,
4028  const MachineInstr &OI) {
4029  if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
4030  (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
4031  (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
4032  (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) &&
4033  ((OI.getOperand(1).getReg() == SrcReg &&
4034  OI.getOperand(2).getReg() == SrcReg2) ||
4035  (OI.getOperand(1).getReg() == SrcReg2 &&
4036  OI.getOperand(2).getReg() == SrcReg)))
4037  return true;
4038 
4039  if (ImmMask != 0 &&
4040  ((FlagI.getOpcode() == X86::CMP64ri32 &&
4041  OI.getOpcode() == X86::SUB64ri32) ||
4042  (FlagI.getOpcode() == X86::CMP64ri8 &&
4043  OI.getOpcode() == X86::SUB64ri8) ||
4044  (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) ||
4045  (FlagI.getOpcode() == X86::CMP32ri8 &&
4046  OI.getOpcode() == X86::SUB32ri8) ||
4047  (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) ||
4048  (FlagI.getOpcode() == X86::CMP16ri8 &&
4049  OI.getOpcode() == X86::SUB16ri8) ||
4050  (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) &&
4051  OI.getOperand(1).getReg() == SrcReg &&
4052  OI.getOperand(2).getImm() == ImmValue)
4053  return true;
4054  return false;
4055 }
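// Example (illustrative): isRedundantFlagInstr treats "CMP32rr %a, %b" as
// redundant after "%d = SUB32rr %a, %b"; it also matches the swapped form
// "%d = SUB32rr %b, %a", in which case the caller must swap the condition
// codes of the EFLAGS users.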
4056 
4057 /// Check whether the definition can be converted
4058 /// to remove a comparison against zero.
4059 inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4060  bool &ClearsOverflowFlag) {
4061  NoSignFlag = false;
4062  ClearsOverflowFlag = false;
4063 
4064  switch (MI.getOpcode()) {
4065  default: return false;
4066 
4067  // The shift instructions only modify ZF if their shift count is non-zero.
4068  // N.B.: The processor truncates the shift count depending on the encoding.
4069  case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
4070  case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
4071  return getTruncatedShiftCount(MI, 2) != 0;
4072 
4073  // Some left shift instructions can be turned into LEA instructions but only
4074  // if their flags aren't used. Avoid transforming such instructions.
4075  case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
4076  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
4077  if (isTruncatedShiftCountForLEA(ShAmt)) return false;
4078  return ShAmt != 0;
4079  }
4080 
4081  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
4082  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
4083  return getTruncatedShiftCount(MI, 3) != 0;
4084 
4085  case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
4086  case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
4087  case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
4088  case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
4089  case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
4090  case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
4091  case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
4092  case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
4093  case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
4094  case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
4095  case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
4096  case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
4097  case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
4098  case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8:
4099  case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr:
4100  case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm:
4101  case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm:
4102  case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri:
4103  case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8:
4104  case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr:
4105  case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm:
4106  case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm:
4107  case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
4108  case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
4109  case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
4110  case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
4111  case X86::LZCNT16rr: case X86::LZCNT16rm:
4112  case X86::LZCNT32rr: case X86::LZCNT32rm:
4113  case X86::LZCNT64rr: case X86::LZCNT64rm:
4114  case X86::POPCNT16rr:case X86::POPCNT16rm:
4115  case X86::POPCNT32rr:case X86::POPCNT32rm:
4116  case X86::POPCNT64rr:case X86::POPCNT64rm:
4117  case X86::TZCNT16rr: case X86::TZCNT16rm:
4118  case X86::TZCNT32rr: case X86::TZCNT32rm:
4119  case X86::TZCNT64rr: case X86::TZCNT64rm:
4120  return true;
4121  case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
4122  case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
4123  case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
4124  case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
4125  case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
4126  case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
4127  case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
4128  case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
4129  case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
4130  case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
4131  case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
4132  case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
4133  case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
4134  case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
4135  case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
4136  case X86::ANDN32rr: case X86::ANDN32rm:
4137  case X86::ANDN64rr: case X86::ANDN64rm:
4138  case X86::BLSI32rr: case X86::BLSI32rm:
4139  case X86::BLSI64rr: case X86::BLSI64rm:
4140  case X86::BLSMSK32rr: case X86::BLSMSK32rm:
4141  case X86::BLSMSK64rr: case X86::BLSMSK64rm:
4142  case X86::BLSR32rr: case X86::BLSR32rm:
4143  case X86::BLSR64rr: case X86::BLSR64rm:
4144  case X86::BLCFILL32rr: case X86::BLCFILL32rm:
4145  case X86::BLCFILL64rr: case X86::BLCFILL64rm:
4146  case X86::BLCI32rr: case X86::BLCI32rm:
4147  case X86::BLCI64rr: case X86::BLCI64rm:
4148  case X86::BLCIC32rr: case X86::BLCIC32rm:
4149  case X86::BLCIC64rr: case X86::BLCIC64rm:
4150  case X86::BLCMSK32rr: case X86::BLCMSK32rm:
4151  case X86::BLCMSK64rr: case X86::BLCMSK64rm:
4152  case X86::BLCS32rr: case X86::BLCS32rm:
4153  case X86::BLCS64rr: case X86::BLCS64rm:
4154  case X86::BLSFILL32rr: case X86::BLSFILL32rm:
4155  case X86::BLSFILL64rr: case X86::BLSFILL64rm:
4156  case X86::BLSIC32rr: case X86::BLSIC32rm:
4157  case X86::BLSIC64rr: case X86::BLSIC64rm:
4158  case X86::BZHI32rr: case X86::BZHI32rm:
4159  case X86::BZHI64rr: case X86::BZHI64rm:
4160  case X86::T1MSKC32rr: case X86::T1MSKC32rm:
4161  case X86::T1MSKC64rr: case X86::T1MSKC64rm:
4162  case X86::TZMSK32rr: case X86::TZMSK32rm:
4163  case X86::TZMSK64rr: case X86::TZMSK64rm:
4164  // These instructions clear the overflow flag just like TEST.
4165  // FIXME: These are not the only instructions in this switch that clear the
4166  // overflow flag.
4167  ClearsOverflowFlag = true;
4168  return true;
4169  case X86::BEXTR32rr: case X86::BEXTR64rr:
4170  case X86::BEXTR32rm: case X86::BEXTR64rm:
4171  case X86::BEXTRI32ri: case X86::BEXTRI32mi:
4172  case X86::BEXTRI64ri: case X86::BEXTRI64mi:
4173  // BEXTR doesn't update the sign flag so we can't use it. It does clear
4174  // the overflow flag, but that's not useful without the sign flag.
4175  NoSignFlag = true;
4176  return true;
4177  }
4178 }
4179 
4180 /// Check whether the use can be converted to remove a comparison against zero.
4181 static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
4182  switch (MI.getOpcode()) {
4183  default: return X86::COND_INVALID;
4184  case X86::NEG8r:
4185  case X86::NEG16r:
4186  case X86::NEG32r:
4187  case X86::NEG64r:
4188  return X86::COND_AE;
4189  case X86::LZCNT16rr:
4190  case X86::LZCNT32rr:
4191  case X86::LZCNT64rr:
4192  return X86::COND_B;
4193  case X86::POPCNT16rr:
4194  case X86::POPCNT32rr:
4195  case X86::POPCNT64rr:
4196  return X86::COND_E;
4197  case X86::TZCNT16rr:
4198  case X86::TZCNT32rr:
4199  case X86::TZCNT64rr:
4200  return X86::COND_B;
4201  case X86::BSF16rr:
4202  case X86::BSF32rr:
4203  case X86::BSF64rr:
4204  case X86::BSR16rr:
4205  case X86::BSR32rr:
4206  case X86::BSR64rr:
4207  return X86::COND_E;
4208  case X86::BLSI32rr:
4209  case X86::BLSI64rr:
4210  return X86::COND_AE;
4211  case X86::BLSR32rr:
4212  case X86::BLSR64rr:
4213  case X86::BLSMSK32rr:
4214  case X86::BLSMSK64rr:
4215  return X86::COND_B;
4216  // TODO: TBM instructions.
4217  }
4218 }
4219 
4220 /// Check if there exists an earlier instruction that
4221 /// operates on the same source operands and sets flags in the same way as
4222 /// Compare; remove Compare if possible.
4223 bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
4224  Register SrcReg2, int64_t CmpMask,
4225  int64_t CmpValue,
4226  const MachineRegisterInfo *MRI) const {
4227  // Check whether we can replace SUB with CMP.
4228  switch (CmpInstr.getOpcode()) {
4229  default: break;
4230  case X86::SUB64ri32:
4231  case X86::SUB64ri8:
4232  case X86::SUB32ri:
4233  case X86::SUB32ri8:
4234  case X86::SUB16ri:
4235  case X86::SUB16ri8:
4236  case X86::SUB8ri:
4237  case X86::SUB64rm:
4238  case X86::SUB32rm:
4239  case X86::SUB16rm:
4240  case X86::SUB8rm:
4241  case X86::SUB64rr:
4242  case X86::SUB32rr:
4243  case X86::SUB16rr:
4244  case X86::SUB8rr: {
4245  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
4246  return false;
4247  // There is no use of the destination register; we can replace SUB with CMP.
4248  unsigned NewOpcode = 0;
4249  switch (CmpInstr.getOpcode()) {
4250  default: llvm_unreachable("Unreachable!");
4251  case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
4252  case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
4253  case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
4254  case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
4255  case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
4256  case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
4257  case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
4258  case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
4259  case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
4260  case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
4261  case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
4262  case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
4263  case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
4264  case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
4265  case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
4266  }
4267  CmpInstr.setDesc(get(NewOpcode));
4268  CmpInstr.RemoveOperand(0);
4269  // Mutating this instruction invalidates any debug data associated with it.
4270  CmpInstr.dropDebugNumber();
4271  // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
4272  if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
4273  NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
4274  return false;
4275  }
4276  }
4277 
4278  // Get the unique definition of SrcReg.
4279  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
4280  if (!MI) return false;
4281 
4282  // CmpInstr is the first instruction of the BB.
4283  MachineBasicBlock::iterator I = CmpInstr, Def = MI;
4284 
4285  // If we are comparing against zero, check whether we can use MI to update
4286  // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
4287  bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
4288  if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
4289  return false;
4290 
4291  // If we have a use of the source register between the def and our compare
4292  // instruction we can eliminate the compare iff the use sets EFLAGS in the
4293  // right way.
4294  bool ShouldUpdateCC = false;
4295  bool NoSignFlag = false;
4296  bool ClearsOverflowFlag = false;
4297  X86::CondCode NewCC = X86::COND_INVALID;
4298  if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag, ClearsOverflowFlag)) {
4299  // Scan forward from the use until we hit the use we're looking for or the
4300  // compare instruction.
4301  for (MachineBasicBlock::iterator J = MI;; ++J) {
4302  // Do we have a convertible instruction?
4303  NewCC = isUseDefConvertible(*J);
4304  if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
4305  J->getOperand(1).getReg() == SrcReg) {
4306  assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
4307  ShouldUpdateCC = true; // Update CC later on.
4308  // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
4309  // with the new def.
4310  Def = J;
4311  MI = &*Def;
4312  break;
4313  }
4314 
4315  if (J == I)
4316  return false;
4317  }
4318  }
4319 
4320  // We are searching for an earlier instruction that can make CmpInstr
4321  // redundant and that instruction will be saved in Sub.
4322  MachineInstr *Sub = nullptr;
4323  const TargetRegisterInfo *TRI = &getRegisterInfo();
4324 
4325  // We iterate backward, starting from the instruction before CmpInstr and
4326  // stop when reaching the definition of a source register or done with the BB.
4327  // RI points to the instruction before CmpInstr.
4328  // If the definition is in this basic block, RE points to the definition;
4329  // otherwise, RE is the rend of the basic block.
4330  MachineBasicBlock::reverse_iterator
4331  RI = ++I.getReverse(),
4332  RE = CmpInstr.getParent() == MI->getParent()
4333  ? Def.getReverse() /* points to MI */
4334  : CmpInstr.getParent()->rend();
4335  MachineInstr *Movr0Inst = nullptr;
4336  for (; RI != RE; ++RI) {
4337  MachineInstr &Instr = *RI;
4338  // Check whether CmpInstr can be made redundant by the current instruction.
4339  if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
4340  CmpValue, Instr)) {
4341  Sub = &Instr;
4342  break;
4343  }
4344 
4345  if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
4346  Instr.readsRegister(X86::EFLAGS, TRI)) {
4347  // This instruction modifies or uses EFLAGS.
4348 
4349  // MOV32r0 etc. are implemented with xor which clobbers condition code.
4350  // They are safe to move up if the definition of EFLAGS is dead and
4351  // earlier instructions do not read or write EFLAGS.
4352  if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
4353  Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
4354  Movr0Inst = &Instr;
4355  continue;
4356  }
4357 
4358  // We can't remove CmpInstr.
4359  return false;
4360  }
4361  }
4362 
4363  // Return false if no candidates exist.
4364  if (!IsCmpZero && !Sub)
4365  return false;
4366 
4367  bool IsSwapped =
4368  (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 &&
4369  Sub->getOperand(2).getReg() == SrcReg);
4370 
4371  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
4372  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
4373  // If we are done with the basic block, we need to check whether EFLAGS is
4374  // live-out.
4375  bool IsSafe = false;
4376  SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate;
4377  MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
4378  for (++I; I != E; ++I) {
4379  const MachineInstr &Instr = *I;
4380  bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
4381  bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
4382  // We should check the usage if this instruction uses and updates EFLAGS.
4383  if (!UseEFLAGS && ModifyEFLAGS) {
4384  // It is safe to remove CmpInstr if EFLAGS is updated again.
4385  IsSafe = true;
4386  break;
4387  }
4388  if (!UseEFLAGS && !ModifyEFLAGS)
4389  continue;
4390 
4391  // EFLAGS is used by this instruction.
4392  X86::CondCode OldCC = X86::COND_INVALID;
4393  if (IsCmpZero || IsSwapped) {
4394  // We decode the condition code from opcode.
4395  if (Instr.isBranch())
4396  OldCC = X86::getCondFromBranch(Instr);
4397  else {
4398  OldCC = X86::getCondFromSETCC(Instr);
4399  if (OldCC == X86::COND_INVALID)
4400  OldCC = X86::getCondFromCMov(Instr);
4401  }
4402  if (OldCC == X86::COND_INVALID) return false;
4403  }
4404  X86::CondCode ReplacementCC = X86::COND_INVALID;
4405  if (IsCmpZero) {
4406  switch (OldCC) {
4407  default: break;
4408  case X86::COND_A: case X86::COND_AE:
4409  case X86::COND_B: case X86::COND_BE:
4410  // CF is used, we can't perform this optimization.
4411  return false;
4412  case X86::COND_G: case X86::COND_GE:
4413  case X86::COND_L: case X86::COND_LE:
4414  case X86::COND_O: case X86::COND_NO:
4415  // If OF is used, the instruction needs to clear it like CmpZero does.
4416  if (!ClearsOverflowFlag)
4417  return false;
4418  break;
4419  case X86::COND_S: case X86::COND_NS:
4420  // If SF is used, but the instruction doesn't update the SF, then we
4421  // can't do the optimization.
4422  if (NoSignFlag)
4423  return false;
4424  break;
4425  }
4426 
4427  // If we're updating the condition code check if we have to reverse the
4428  // condition.
4429  if (ShouldUpdateCC)
4430  switch (OldCC) {
4431  default:
4432  return false;
4433  case X86::COND_E:
4434  ReplacementCC = NewCC;
4435  break;
4436  case X86::COND_NE:
4437  ReplacementCC = GetOppositeBranchCondition(NewCC);
4438  break;
4439  }
4440  } else if (IsSwapped) {
4441  // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
4442  // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
4443  // We swap the condition code and synthesize the new opcode.
4444  ReplacementCC = getSwappedCondition(OldCC);
4445  if (ReplacementCC == X86::COND_INVALID) return false;
4446  }
4447 
4448  if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
4449  // Push the MachineInstr to OpsToUpdate.
4450  // If it is safe to remove CmpInstr, the condition code of these
4451  // instructions will be modified.
4452  OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC));
4453  }
4454  if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
4455  // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
4456  IsSafe = true;
4457  break;
4458  }
4459  }
4460 
4461  // If EFLAGS is not killed nor re-defined, we should check whether it is
4462  // live-out. If it is live-out, do not optimize.
4463  if ((IsCmpZero || IsSwapped) && !IsSafe) {
4464  MachineBasicBlock *MBB = CmpInstr.getParent();
4465  for (MachineBasicBlock *Successor : MBB->successors())
4466  if (Successor->isLiveIn(X86::EFLAGS))
4467  return false;
4468  }
4469 
4470  // The instruction to be updated is either Sub or MI.
4471  Sub = IsCmpZero ? MI : Sub;
4472  // Move Movr0Inst to the appropriate place before Sub.
4473  if (Movr0Inst) {
4474  // Look backwards until we find a def that doesn't use the current EFLAGS.
4475  Def = Sub;
4476  MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
4477  InsertE = Sub->getParent()->rend();
4478  for (; InsertI != InsertE; ++InsertI) {
4479  MachineInstr *Instr = &*InsertI;
4480  if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
4481  Instr->modifiesRegister(X86::EFLAGS, TRI)) {
4482  Sub->getParent()->remove(Movr0Inst);
4483  Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
4484  Movr0Inst);
4485  break;
4486  }
4487  }
4488  if (InsertI == InsertE)
4489  return false;
4490  }
4491 
4492  // Make sure Sub instruction defines EFLAGS and mark the def live.
4493  MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
4494  assert(FlagDef && "Unable to locate a def EFLAGS operand");
4495  FlagDef->setIsDead(false);
4496 
4497  CmpInstr.eraseFromParent();
4498 
4499  // Modify the condition code of instructions in OpsToUpdate.
4500  for (auto &Op : OpsToUpdate) {
4501  Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
4502  .setImm(Op.second);
4503  }
4504  return true;
4505 }
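// Before/after sketch (hypothetical virtual registers):
//   %2 = SUB32rr %0, %1, implicit-def $eflags
//   TEST32rr %2, %2, implicit-def $eflags
//   JCC_1 %bb.1, COND_NE, implicit $eflags
// The TEST is a compare against zero whose ZF is already produced by the SUB
// (isDefConvertible), so optimizeCompareInstr erases the TEST and the branch
// keeps reading the SUB's $eflags.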
4506 
4507 /// Try to remove the load by folding it to a register
4508 /// operand at the use. We fold the load if it defines a virtual register,
4509 /// the virtual register is used once in the same BB, and the instructions
4510 /// in between do not load or store and have no side effects.
4511 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
4512  const MachineRegisterInfo *MRI,
4513  Register &FoldAsLoadDefReg,
4514  MachineInstr *&DefMI) const {
4515  // Check whether we can move DefMI here.
4516  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
4517  assert(DefMI);
4518  bool SawStore = false;
4519  if (!DefMI->isSafeToMove(nullptr, SawStore))
4520  return nullptr;
4521 
4522  // Collect information about virtual register operands of MI.
4523  SmallVector<unsigned, 1> SrcOperandIds;
4524  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4525  MachineOperand &MO = MI.getOperand(i);
4526  if (!MO.isReg())
4527  continue;
4528  Register Reg = MO.getReg();
4529  if (Reg != FoldAsLoadDefReg)
4530  continue;
4531  // Do not fold if we have a subreg use or a def.
4532  if (MO.getSubReg() || MO.isDef())
4533  return nullptr;
4534  SrcOperandIds.push_back(i);
4535  }
4536  if (SrcOperandIds.empty())
4537  return nullptr;
4538 
4539  // Check whether we can fold the def into SrcOperandId.
4540  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
4541  FoldAsLoadDefReg = 0;
4542  return FoldMI;
4543  }
4544 
4545  return nullptr;
4546 }
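// Folding sketch (hypothetical registers): if %1 is defined by
//   %1 = MOV32rm %rdi, 1, $noreg, 0, $noreg
// and its only use is "%2 = ADD32rr %0, %1", foldMemoryOperand can rewrite
// the use to "%2 = ADD32rm %0, %rdi, 1, $noreg, 0, $noreg", after which the
// now-unused load can be deleted.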
4547 
4548 /// Expand a single-def pseudo instruction to a two-addr
4549 /// instruction with two undef reads of the register being defined.
4550 /// This is used for mapping:
4551 /// %xmm4 = V_SET0
4552 /// to:
4553 /// %xmm4 = PXORrr undef %xmm4, undef %xmm4
4554 ///
4555 static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
4556  const MCInstrDesc &Desc) {
4557  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4558  Register Reg = MIB.getReg(0);
4559  MIB->setDesc(Desc);
4560 
4561  // MachineInstr::addOperand() will insert explicit operands before any
4562  // implicit operands.
4563  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4564  // But we don't trust that.
4565  assert(MIB.getReg(1) == Reg &&
4566  MIB.getReg(2) == Reg && "Misplaced operand");
4567  return true;
4568 }
4569 
4570 /// Expand a single-def pseudo instruction to a two-addr
4571 /// instruction with two %k0 reads.
4572 /// This is used for mapping:
4573 /// %k4 = K_SET1
4574 /// to:
4575 /// %k4 = KXNORrr %k0, %k0
4576 static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
4577  Register Reg) {
4578  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4579  MIB->setDesc(Desc);
4580  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4581  return true;
4582 }
4583 
4584 static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
4585  bool MinusOne) {
4586  MachineBasicBlock &MBB = *MIB->getParent();
4587  const DebugLoc &DL = MIB->getDebugLoc();
4588  Register Reg = MIB.getReg(0);
4589 
4590  // Insert the XOR.
4591  BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
4592  .addReg(Reg, RegState::Undef)
4593  .addReg(Reg, RegState::Undef);
4594 
4595  // Turn the pseudo into an INC or DEC.
4596  MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
4597  MIB.addReg(Reg);
4598 
4599  return true;
4600 }
4601 
4602 static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
4603  const TargetInstrInfo &TII,
4604  const X86Subtarget &Subtarget) {
4605  MachineBasicBlock &MBB = *MIB->getParent();
4606  const DebugLoc &DL = MIB->getDebugLoc();
4607  int64_t Imm = MIB->getOperand(1).getImm();
4608  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
4609  MachineBasicBlock::iterator I = MIB.getInstr();
4610 
4611  int StackAdjustment;
4612 
4613  if (Subtarget.is64Bit()) {
4614  assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
4615  MIB->getOpcode() == X86::MOV32ImmSExti8);
4616 
4617  // Can't use push/pop lowering if the function might write to the red zone.
4618  X86MachineFunctionInfo *X86FI =
4619  MBB.getParent()->getInfo<X86MachineFunctionInfo>();
4620  if (X86FI->getUsesRedZone()) {
4621  MIB->setDesc(TII.get(MIB->getOpcode() ==
4622  X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
4623  return true;
4624  }
4625 
4626  // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
4627  // widen the register if necessary.
4628  StackAdjustment = 8;
4629  BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
4630  MIB->setDesc(TII.get(X86::POP64r));
4631  MIB->getOperand(0)
4632  .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
4633  } else {
4634  assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
4635  StackAdjustment = 4;
4636  BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
4637  MIB->setDesc(TII.get(X86::POP32r));
4638  }
4639  MIB->RemoveOperand(1);
4640  MIB->addImplicitDefUseOperands(*MBB.getParent());
4641 
4642  // Build CFI if necessary.
4643  MachineFunction &MF = *MBB.getParent();
4644  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
4645  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
4646  bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
4647  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
4648  if (EmitCFI) {
4649  TFL->BuildCFI(MBB, I, DL,
4650  MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
4651  TFL->BuildCFI(MBB, std::next(I), DL,
4652  MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
4653  }
4654 
4655  return true;
4656 }
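// Size rationale (informal): on x86-64 "push $imm8" plus "pop %reg" is three
// bytes, versus five bytes for a 32-bit mov-immediate and seven for the
// sign-extended 64-bit form, which is why the push/pop lowering performed by
// ExpandMOVImmSExti8 is used at minsize when the red zone is not in use and
// the stack adjustment can be described to the unwinder.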
4657 
4658 // LoadStackGuard has so far only been implemented for 64-bit MachO. Different
4659 // code sequence is needed for other targets.
4660 static void expandLoadStackGuard(MachineInstrBuilder &MIB,
4661  const TargetInstrInfo &TII) {
4662  MachineBasicBlock &MBB = *MIB->getParent();
4663  const DebugLoc &DL = MIB->getDebugLoc();
4664  Register Reg = MIB.getReg(0);
4665  const GlobalValue *GV =
4666  cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
4667  auto Flags = MachineMemOperand::MOLoad |
4668  MachineMemOperand::MODereferenceable |
4669  MachineMemOperand::MOInvariant;
4670  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
4671  MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
4672  MachineBasicBlock::iterator I = MIB.getInstr();
4673 
4674  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
4675  .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
4676  .addMemOperand(MMO);
4677  MIB->setDebugLoc(DL);
4678  MIB->setDesc(TII.get(X86::MOV64rm));
4679  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
4680 }
4681 
4682 static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
4683  MachineBasicBlock &MBB = *MIB->getParent();
4684  MachineFunction &MF = *MBB.getParent();
4685  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
4686  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4687  unsigned XorOp =
4688  MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
4689  MIB->setDesc(TII.get(XorOp));
4690  MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
4691  return true;
4692 }
4693 
4694 // This is used to handle spills for 128/256-bit registers when we have AVX512,
4695 // but not VLX. If it uses an extended register we need to use an instruction
4696 // that loads the lower 128/256-bit, but is available with only AVX512F.
4697 static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
4698  const TargetRegisterInfo *TRI,
4699  const MCInstrDesc &LoadDesc,
4700  const MCInstrDesc &BroadcastDesc,
4701  unsigned SubIdx) {
4702  Register DestReg = MIB.getReg(0);
4703  // Check if DestReg is XMM16-31 or YMM16-31.
4704  if (TRI->getEncodingValue(DestReg) < 16) {
4705  // We can use a normal VEX encoded load.
4706  MIB->setDesc(LoadDesc);
4707  } else {
4708  // Use a 128/256-bit VBROADCAST instruction.
4709  MIB->setDesc(BroadcastDesc);
4710  // Change the destination to a 512-bit register.
4711  DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
4712  MIB->getOperand(0).setReg(DestReg);
4713  }
4714  return true;
4715 }
4716 
4717 // This is used to handle spills for 128/256-bit registers when we have AVX512,
4718 // but not VLX. If it uses an extended register we need to use an instruction
4719 // that stores the lower 128/256-bit, but is available with only AVX512F.
4720 static bool expandNOVLXStore(MachineInstrBuilder &MIB,
4721  const TargetRegisterInfo *TRI,
4722  const MCInstrDesc &StoreDesc,
4723  const MCInstrDesc &ExtractDesc,
4724  unsigned SubIdx) {
4725  Register SrcReg = MIB.getReg(X86::AddrNumOperands);
4726  // Check if DestReg is XMM16-31 or YMM16-31.
4727  if (TRI->getEncodingValue(SrcReg) < 16) {
4728  // We can use a normal VEX encoded store.
4729  MIB->setDesc(StoreDesc);
4730  } else {
4731  // Use a VEXTRACTF instruction.
4732  MIB->setDesc(ExtractDesc);
4733  // Change the destination to a 512-bit register.
4734  SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
4735  MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
4736  MIB.addImm(0x0); // Append immediate to extract from the lower bits.
4737  }
4738 
4739  return true;
4740 }
4741 
4742 static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
4743  MIB->setDesc(Desc);
4744  int64_t ShiftAmt = MIB->getOperand(2).getImm();
4745  // Temporarily remove the immediate so we can add another source register.
4746  MIB->RemoveOperand(2);
4747  // Add the register. Don't copy the kill flag if there is one.
4748  MIB.addReg(MIB.getReg(1),
4749  getUndefRegState(MIB->getOperand(1).isUndef()));
4750  // Add back the immediate.
4751  MIB.addImm(ShiftAmt);
4752  return true;
4753 }
4754 
4755 bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
4756  bool HasAVX = Subtarget.hasAVX();
4757  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
4758  switch (MI.getOpcode()) {
4759  case X86::MOV32r0:
4760  return Expand2AddrUndef(MIB, get(X86::XOR32rr));
4761  case X86::MOV32r1:
4762  return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
4763  case X86::MOV32r_1:
4764  return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
4765  case X86::MOV32ImmSExti8:
4766  case X86::MOV64ImmSExti8:
4767  return ExpandMOVImmSExti8(MIB, *this, Subtarget);
4768  case X86::SETB_C32r:
4769  return Expand2AddrUndef(MIB, get(X86::SBB32rr));
4770  case X86::SETB_C64r:
4771  return Expand2AddrUndef(MIB, get(X86::SBB64rr));
4772  case X86::MMX_SET0:
4773  return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr));
4774  case X86::V_SET0:
4775  case X86::FsFLD0SS:
4776  case X86::FsFLD0SD:
4777  case X86::FsFLD0F128:
4778  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
4779  case X86::AVX_SET0: {
4780  assert(HasAVX && "AVX not supported");
4781  const TargetRegisterInfo *TRI = &getRegisterInfo();
4782  Register SrcReg = MIB.getReg(0);
4783  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4784  MIB->getOperand(0).setReg(XReg);
4785  Expand2AddrUndef(MIB, get(X86::VXORPSrr));
4786  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4787  return true;
4788  }
4789  case X86::AVX512_128_SET0:
4790  case X86::AVX512_FsFLD0SH:
4791  case X86::AVX512_FsFLD0SS:
4792  case X86::AVX512_FsFLD0SD:
4793  case X86::AVX512_FsFLD0F128: {
4794  bool HasVLX = Subtarget.hasVLX();
4795  Register SrcReg = MIB.getReg(0);
4796  const TargetRegisterInfo *TRI = &getRegisterInfo();
4797  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
4798  return Expand2AddrUndef(MIB,
4799  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4800  // Extended register without VLX. Use a larger XOR.
4801  SrcReg =
4802  TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4803  MIB->getOperand(0).setReg(SrcReg);
4804  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4805  }
4806  case X86::AVX512_256_SET0:
4807  case X86::AVX512_512_SET0: {
4808  bool HasVLX = Subtarget.hasVLX();
4809  Register SrcReg = MIB.getReg(0);
4810  const TargetRegisterInfo *TRI = &getRegisterInfo();
4811  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
4812  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4813  MIB->getOperand(0).setReg(XReg);
4814  Expand2AddrUndef(MIB,
4815  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4816  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4817  return true;
4818  }
4819  if (MI.getOpcode() == X86::AVX512_256_SET0) {
4820  // No VLX so we must reference a zmm.
4821  unsigned ZReg =
4822  TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4823  MIB->getOperand(0).setReg(ZReg);
4824  }
4825  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4826  }
4827  case X86::V_SETALLONES:
4828  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
4829  case X86::AVX2_SETALLONES:
4830  return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
4831  case X86::AVX1_SETALLONES: {
4832  Register Reg = MIB.getReg(0);
4833  // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
4834  MIB->setDesc(get(X86::VCMPPSYrri));
4835  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
4836  return true;
4837  }
4838  case X86::AVX512_512_SETALLONES: {
4839  Register Reg = MIB.getReg(0);
4840  MIB->setDesc(get(X86::VPTERNLOGDZrri));
4841  // VPTERNLOGD needs 3 register inputs and an immediate.
4842  // 0xff will return 1s for any input.
4843  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
4843  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
4844  .addReg(Reg, RegState::Undef).addImm(0xff);
4845  return true;
4846  }
4847  case X86::AVX512_512_SEXT_MASK_32:
4848  case X86::AVX512_512_SEXT_MASK_64: {
4849  Register Reg = MIB.getReg(0);
4850  Register MaskReg = MIB.getReg(1);
4851  unsigned MaskState = getRegState(MIB->getOperand(1));
4852  unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
4853  X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
4854  MI.RemoveOperand(1);
4855  MIB->setDesc(get(Opc));
4856  // VPTERNLOG needs 3 register inputs and an immediate.
4857  // 0xff will return 1s for any input.
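  // With the zero-masking (rrikz) form, elements whose mask bit is set become
  // all-ones and the rest are zeroed, i.e. the mask sign-extended into lanes.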
4858  MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
4859  .addReg(Reg, RegState::Undef).addImm(0xff);
4860  return true;
4861  }
4862  case X86::VMOVAPSZ128rm_NOVLX:
4863  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
4864  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4865  case X86::VMOVUPSZ128rm_NOVLX:
4866  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
4867  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4868  case X86::VMOVAPSZ256rm_NOVLX:
4869  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
4870  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4871  case X86::VMOVUPSZ256rm_NOVLX:
4872  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
4873  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4874  case X86::VMOVAPSZ128mr_NOVLX:
4875  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
4876  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4877  case X86::VMOVUPSZ128mr_NOVLX:
4878  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
4879  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4880  case X86::VMOVAPSZ256mr_NOVLX:
4881  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
4882  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4883  case X86::VMOVUPSZ256mr_NOVLX:
4884  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
4885  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4886  case X86::MOV32ri64: {
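  // Writing a 32-bit GPR implicitly zeroes the upper 32 bits, so the
  // zero-extending 64-bit immediate move can be lowered to a plain MOV32ri on
  // the 32-bit subregister; the implicit-def added below keeps the 64-bit
  // register defined for liveness purposes.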
4887  Register Reg = MIB.getReg(0);
4888  Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
4889  MI.setDesc(get(X86::MOV32ri));
4890  MIB->getOperand(0).setReg(Reg32);
4891  MIB.addReg(Reg, RegState::ImplicitDefine);
4892  return true;
4893  }
4894 
4895  // KNL does not recognize dependency-breaking idioms for mask registers,
4896  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
4897  // Using %k0 as the undef input register is a performance heuristic based
4898  // on the assumption that %k0 is used less frequently than the other mask
4899  // registers, since it is not usable as a write mask.
4900  // FIXME: A more advanced approach would be to choose the best input mask
4901  // register based on context.
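  // For example, KSET0W expands to "kxorw %k0, %k0, %kN" (all zeros) and
  // KSET1W expands to "kxnorw %k0, %k0, %kN" (all ones).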
4902  case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
4903  case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
4904  case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
4905  case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
4906  case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
4907  case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
4908  case TargetOpcode::LOAD_STACK_GUARD:
4909  expandLoadStackGuard(MIB, *this);
4910  return true;
4911  case X86::XOR64_FP:
4912  case X86::XOR32_FP:
4913  return expandXorFP(MIB, *this);
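  // The *ROT pseudos below are realized as SHLD/SHRD with the source register
  // equal to the destination, which performs a rotate.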
4914  case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
4915  case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
4916  case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
4917  case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
4918  case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break;
4919  case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break;
4920  case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break;
4921  case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break;
4922  case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break;
4923  case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break;
4924  case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break;
4925  case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
4926  case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break;
4927  case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break;
4928  case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break;
4929  }
4930  return false;
4931 }
4932 
4933 /// Return true for all instructions that only update
4934 /// the first 32 or 64 bits of the destination register and leave the rest
4935 /// unmodified. This can be used to avoid folding loads if the instructions
4936 /// only update part of the destination register, and the non-updated part is
4937 /// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
4938 /// instructions breaks the partial register dependency and it can improve
4939 /// performance. e.g.:
4940 ///
4941 /// movss (%rdi), %xmm0
4942 /// cvtss2sd %xmm0, %xmm0
4943 ///
4944 /// Instead of
4945 /// cvtss2sd (%rdi), %xmm0
4946 ///
4947 /// FIXME: This should be turned into a TSFlags.
4948 ///
4949 static bool hasPartialRegUpdate(unsigned Opcode,
4950  const X86Subtarget &Subtarget,
4951  bool ForLoadFold = false) {
4952  switch (Opcode) {
4953  case X86::CVTSI2SSrr:
4954  case X86::CVTSI2SSrm:
4955  case X86::CVTSI642SSrr:
4956  case X86::CVTSI642SSrm:
4957  case X86::CVTSI2SDrr:
4958  case X86::CVTSI2SDrm:
4959  case X86::CVTSI642SDrr:
4960  case X86::CVTSI642SDrm:
4961  // Load folding won't affect the undef register update since the input is
4962  // a GPR.
4963  return !ForLoadFold;
4964  case X86::CVTSD2SSrr:
4965  case X86::CVTSD2SSrm:
4966  case X86::CVTSS2SDrr:
4967  case X86::CVTSS2SDrm:
4968  case X86::MOVHPDrm:
4969  case X86::MOVHPSrm:
4970  case X86::MOVLPDrm:
4971  case X86::MOVLPSrm:
4972  case X86::RCPSSr:
4973  case X86::RCPSSm:
4974  case X86::RCPSSr_Int:
4975  case X86::RCPSSm_Int:
4976  case X86::ROUNDSDr:
4977  case X86::ROUNDSDm:
4978  case X86::ROUNDSSr:
4979  case X86::ROUNDSSm:
4980  case X86::RSQRTSSr:
4981  case X86::RSQRTSSm:
4982  case X86::RSQRTSSr_Int:
4983  case X86::RSQRTSSm_Int:
4984  case X86::SQRTSSr:
4985  case X86::SQRTSSm:
4986  case X86::SQRTSSr_Int:
4987  case X86::SQRTSSm_Int:
4988  case X86::SQRTSDr:
4989  case X86::SQRTSDm:
4990  case X86::SQRTSDr_Int:
4991  case X86::SQRTSDm_Int:
4992  return true;
4993  // GPR
4994  case X86::POPCNT32rm:
4995  case X86::POPCNT32rr:
4996  case X86::POPCNT64rm:
4997  case X86::POPCNT64rr:
4998  return Subtarget.hasPOPCNTFalseDeps();
4999  case X86::LZCNT32rm:
5000  case X86::LZCNT32rr:
5001  case X86::LZCNT64rm:
5002  case X86::LZCNT64rr:
5003  case X86::TZCNT32rm:
5004  case X86::TZCNT32rr:
5005  case X86::TZCNT64rm:
5006  case X86::TZCNT64rr:
5007  return Subtarget.hasLZCNTFalseDeps();
5008  }
5009 
5010  return false;
5011 }
5012 
5013 /// Inform the BreakFalseDeps pass how many idle
5014 /// instructions we would like before a partial register update.
5015 unsigned X86InstrInfo::getPartialRegUpdateClearance(
5016  const MachineInstr &MI, unsigned OpNum,
5017  const TargetRegisterInfo *TRI) const {
5018  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
5019  return 0;
5020 
5021  // If MI is marked as reading Reg, the partial register update is wanted.
5022  const MachineOperand &MO = MI.getOperand(0);
5023  Register Reg = MO.getReg();
5024  if (Reg.isVirtual()) {
5025  if (MO.readsReg() || MI.readsVirtualRegister(Reg))
5026  return 0;
5027  } else {
5028  if (MI.readsRegister(Reg, TRI))
5029  return 0;
5030  }
5031 
5032  // If any instructions in the clearance range are reading Reg, insert a
5033  // dependency breaking instruction, which is inexpensive and is likely to
5034  // be hidden in other instruction's cycles.
5035  return PartialRegUpdateClearance;
5036 }
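// For example, if %xmm0 was written shortly before "cvtsi2ss %eax, %xmm0",
// the conversion would carry a false dependence on the old %xmm0 value;
// returning a non-zero clearance lets BreakFalseDeps insert a dependency
// breaking "xorps %xmm0, %xmm0" ahead of it.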
5037 
5038 // Return true for any instruction that copies the high bits of the first source
5039 // operand into the unused high bits of the destination operand.
5040 // Also returns true for instructions that have two inputs where one may
5041 // be undef and we want it to use the same register as the other input.
5042 static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
5043  bool ForLoadFold = false) {
5044  // Set the OpNum parameter to the first source operand.
5045  switch (Opcode) {
5046  case X86::MMX_PUNPCKHBWirr:
5047  case X86::MMX_PUNPCKHWDirr:
5048  case X86::MMX_PUNPCKHDQirr:
5049  case X86::MMX_PUNPCKLBWirr:
5050  case X86::MMX_PUNPCKLWDirr:
5051  case X86::MMX_PUNPCKLDQirr:
5052  case X86::MOVHLPSrr:
5053  case X86::PACKSSWBrr:
5054  case X86::PACKUSWBrr:
5055  case X86::PACKSSDWrr:
5056  case X86::PACKUSDWrr:
5057  case X86::PUNPCKHBWrr:
5058  case X86::PUNPCKLBWrr:
5059  case X86::PUNPCKHWDrr:
5060  case X86::PUNPCKLWDrr:
5061  case X86::PUNPCKHDQrr:
5062  case X86::PUNPCKLDQrr:
5063  case X86::PUNPCKHQDQrr:
5064  case X86::PUNPCKLQDQrr:
5065  case X86::SHUFPDrri:
5066  case X86::SHUFPSrri:
5067  // These instructions are sometimes used with an undef first or second
5068  // source. Return true here so BreakFalseDeps will assign this source to the
5069  // same register as the first source to avoid a false dependency.
5070  // Operand 1 of these instructions is tied to the destination, which is why
5071  // they are listed separately from their VEX counterparts.
5072  return OpNum == 2 && !ForLoadFold;
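  // For example, "punpcklbw %xmmN, %xmm0" with an undef %xmmN would otherwise
  // depend on whatever last wrote %xmmN; reusing %xmm0 for the undef source
  // avoids that false dependency.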
5073 
5074  case X86::VMOVLHPSrr:
5075  case X86::VMOVLHPSZrr:
5076  case X86::VPACKSSWBrr:
5077  case X86::VPACKUSWBrr:
5078  case X86::VPACKSSDWrr:
5079  case X86::VPACKUSDWrr:
5080  case X86::VPACKSSWBZ128rr:
5081  case X86::VPACKUSWBZ128rr:
5082  case X86::VPACKSSDWZ128rr:
5083  case X86::VPACKUSDWZ128rr:
5084  case X86::VPERM2F128rr:
5085  case X86::VPERM2I128rr:
5086  case X86::VSHUFF32X4Z256rri:
5087  case X86::VSHUFF32X4Zrri:
5088  case X86::VSHUFF64X2Z256rri:
5089  case X86::VSHUFF64X2Zrri:
5090  case X86::VSHUFI32X4Z256rri:
5091  case X86::VSHUFI32X4Zrri:
5092  case X86::VSHUFI64X2Z256rri:
5093  case X86::VSHUFI64X2Zrri:
5094  case X86::VPUNPCKHBWrr:
5095  case X86::VPUNPCKLBWrr:
5096  case X86::VPUNPCKHBWYrr:
5097  case X86::VPUNPCKLBWYrr:
5098  case X86::VPUNPCKHBWZ128rr:
5099  case X86::VPUNPCKLBWZ128rr:
5100  case X86::VPUNPCKHBWZ256rr:
5101  case X86::VPUNPCKLBWZ256rr:
5102  case X86::VPUNPCKHBWZrr:
5103  case X86::VPUNPCKLBWZrr:
5104  case X86::VPUNPCKHWDrr:
5105  case X86::VPUNPCKLWDrr:
5106  case X86::VPUNPCKHWDYrr:
5107  case X86::VPUNPCKLWDYrr:
5108  case X86::VPUNPCKHWDZ128rr:
5109  case X86::VPUNPCKLWDZ128rr:
5110  case X86::VPUNPCKHWDZ256rr:
5111  case X86::VPUNPCKLWDZ256rr:
5112  case X86::VPUNPCKHWDZrr:
5113  case X86::VPUNPCKLWDZrr:
5114  case X86::VPUNPCKHDQrr:
5115  case X86::VPUNPCKLDQrr:
5116  case X86::VPUNPCKHDQYrr:
5117  case X86::VPUNPCKLDQYrr:
5118  case X86::VPUNPCKHDQZ128rr:
5119  case X86::VPUNPCKLDQZ128rr:
5120  case X86::VPUNPCKHDQZ256rr:
5121  case X86::VPUNPCKLDQZ256rr:
5122  case X86::VPUNPCKHDQZrr:
5123  case X86::VPUNPCKLDQZrr:
5124  case X86::VPUNPCKHQDQrr:
5125  case X86::VPUNPCKLQDQrr:
5126  case X86::VPUNPCKHQDQYrr:
5127  case X86::VPUNPCKLQDQYrr:
5128  case X86::VPUNPCKHQDQZ128rr:
5129  case X86::VPUNPCKLQDQZ128rr:
5130  case X86::VPUNPCKHQDQZ256rr:
5131  case X86::VPUNPCKLQDQZ256rr:
5132  case X86::VPUNPCKHQDQZrr:
5133  case X86::VPUNPCKLQDQZrr:
5134  // These instructions are sometimes used with an undef first or second
5135  // source. Return true here so BreakFalseDeps will assign this source to the
5136  // same register as the first source to avoid a false dependency.
5137  return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
5138 
5139  case X86::VCVTSI2SSrr:
5140  case X86::VCVTSI2SSrm:
5141  case X86::VCVTSI2SSrr_Int:
5142  case X86::VCVTSI2SSrm_Int:
5143  case X86::VCVTSI642SSrr:
5144  case X86::VCVTSI642SSrm:
5145  case X86::VCVTSI642SSrr_Int:
5146  case X86::VCVTSI642SSrm_Int:
5147  case X86::VCVTSI2SDrr:
5148  case X86::VCVTSI2SDrm:
5149  case X86::VCVTSI2SDrr_Int:
5150  case X86::VCVTSI2SDrm_Int:
5151  case X86::VCVTSI642SDrr:
5152  case X86::VCVTSI642SDrm:
5153  case X86::VCVTSI642SDrr_Int:
5154  case X86::VCVTSI642SDrm_Int:
5155  // AVX-512
5156  case X86::VCVTSI2SSZrr:
5157  case X86::VCVTSI2SSZrm:
5158  case X86::VCVTSI2SSZrr_Int:
5159  case X86::VCVTSI2SSZrrb_Int:
5160  case X86::VCVTSI2SSZrm_Int:
5161  case X86::VCVTSI642SSZrr:
5162  case X86::VCVTSI642SSZrm:
5163  case X86::VCVTSI642SSZrr_Int:
5164  case X86::VCVTSI642SSZrrb_Int:
5165  case X86::VCVTSI642SSZrm_Int:
5166  case X86::VCVTSI2SDZrr:
5167  case X86::VCVTSI2SDZrm:
5168  case X86::VCVTSI2SDZrr_Int:
5169  case X86::VCVTSI2SDZrm_Int:
5170  case X86::VCVTSI642SDZrr:
5171  case X86::VCVTSI642SDZrm:
5172  case X86::VCVTSI642SDZrr_Int:
5173  case X86::VCVTSI642SDZrrb_Int:
5174  case X86::VCVTSI642SDZrm_Int:
5175  case X86::VCVTUSI2SSZrr:
5176  case X86::VCVTUSI2SSZrm:
5177  case X86::VCVTUSI2SSZrr_Int:
5178  case X86::VCVTUSI2SSZrrb_Int:
5179  case X86::VCVTUSI2SSZrm_Int:
5180  case X86::VCVTUSI642SSZrr:
5181  case X86::VCVTUSI642SSZrm:
5182  case X86::VCVTUSI642SSZrr_Int:
5183  case X86::VCVTUSI642SSZrrb_Int:
5184  case X86::VCVTUSI642SSZrm_Int:
5185  case X86::VCVTUSI2SDZrr:
5186  case X86::VCVTUSI2SDZrm:
5187  case X86::VCVTUSI2SDZrr_Int:
5188  case X86::VCVTUSI2SDZrm_Int:
5189  case X86::VCVTUSI642SDZrr:
5190  case X86::VCVTUSI642SDZrm:
5191  case X86::VCVTUSI642SDZrr_Int:
5192  case X86::VCVTUSI642SDZrrb_Int:
5193  case X86::VCVTUSI642SDZrm_Int:
5194  case X86::VCVTSI2SHZrr:
5195  case X86::VCVTSI2SHZrm:
5196  case X86::VCVTSI2SHZrr_Int:
5197  case X86::VCVTSI2SHZrrb_Int:
5198  case X86::VCVTSI2SHZrm_Int:
5199  case X86::VCVTSI642SHZrr:
5200  case X86::VCVTSI642SHZrm:
5201  case X86::VCVTSI642SHZrr_Int:
5202  case X86::VCVTSI642SHZrrb_Int:
5203  case X86::VCVTSI642SHZrm_Int:
5204  case X86::VCVTUSI2SHZrr:
5205  case X86::VCVTUSI2SHZrm:
5206  case X86::VCVTUSI2SHZrr_Int:
5207  case X86::VCVTUSI2SHZrrb_Int:
5208  case X86::VCVTUSI2SHZrm_Int:
5209  case X86::VCVTUSI642SHZrr:
5210  case X86::VCVTUSI642SHZrm:
5211  case X86::VCVTUSI642SHZrr_Int:
5212  case X86::VCVTUSI642SHZrrb_Int:
5213  case X86::VCVTUSI642SHZrm_Int:
5214  // Load folding won't affect the undef register update since the input is
5215  // a GPR.
5216  return OpNum == 1 && !ForLoadFold;
5217  case X86::VCVTSD2SSrr:
5218  case X86::VCVTSD2SSrm:
5219  case X86::VCVTSD2SSrr_Int:
5220  case X86::VCVTSD2SSrm_Int:
5221  case X86::VCVTSS2SDrr:
5222  case X86::VCVTSS2SDrm:
5223  case X86::VCVTSS2SDrr_Int:
5224  case X86::VCVTSS2SDrm_Int:
5225  case X86::VRCPSSr:
5226  case X86::VRCPSSr_Int:
5227  case X86::VRCPSSm:
5228  case X86::VRCPSSm_Int:
5229  case X86::VROUNDSDr:
5230  case X86::VROUNDSDm:
5231  case X86::VROUNDSDr_Int:
5232  case X86::VROUNDSDm_Int:
5233  case X86::VROUNDSSr:
5234  case X86::VROUNDSSm:
5235  case X86::VROUNDSSr_Int:
5236  case X86::VROUNDSSm_Int:
5237  case X86::VRSQRTSSr:
5238  case X86::VRSQRTSSr_Int:
5239  case X86::VRSQRTSSm:
5240  case X86::VRSQRTSSm_Int:
5241  case X86::VSQRTSSr:
5242  case X86::VSQRTSSr_Int:
5243  case X86::VSQRTSSm:
5244  case X86::VSQRTSSm_Int:
5245  case X86::VSQRTSDr:
5246  case X86::VSQRTSDr_Int:
5247  case X86::VSQRTSDm:
5248  case X86::VSQRTSDm_Int:
5249  // AVX-512
5250  case X86::VCVTSD2SSZrr:
5251  case X86::VCVTSD2SSZrr_Int:
5252  case X86::VCVTSD2SSZrrb_Int:
5253  case X86::VCVTSD2SSZrm:
5254  case X86::VCVTSD2SSZrm_Int:
5255  case X86::VCVTSS2SDZrr:
5256  case X86::VCVTSS2SDZrr_Int:
5257  case X86::VCVTSS2SDZrrb_Int:
5258  case X86::VCVTSS2SDZrm:
5259  case X86::VCVTSS2SDZrm_Int:
5260  case X86::VGETEXPSDZr:
5261  case X86::VGETEXPSDZrb:
5262  case X86::VGETEXPSDZm:
5263  case X86::VGETEXPSSZr:
5264  case X86::VGETEXPSSZrb:
5265  case X86::VGETEXPSSZm:
5266  case X86::VGETMANTSDZrri:
5267  case X86::VGETMANTSDZrrib:
5268  case X86::VGETMANTSDZrmi:
5269  case X86::VGETMANTSSZrri:
5270  case X86::VGETMANTSSZrrib:
5271  case X86::VGETMANTSSZrmi:
5272  case X86::VRNDSCALESDZr:
5273  case X86::VRNDSCALESDZr_Int:
5274  case X86::VRNDSCALESDZrb_Int:
5275  case X86::VRNDSCALESDZm:
5276  case X86::VRNDSCALESDZm_Int:
5277  case X86::VRNDSCALESSZr:
5278  case X86::VRNDSCALESSZr_Int:
5279  case X86::VRNDSCALESSZrb_Int:
5280  case X86::VRNDSCALESSZm:
5281  case X86::VRNDSCALESSZm_Int:
5282  case X86::VRCP14SDZrr:
5283  case X86::VRCP14SDZrm:
5284  case X86::VRCP14SSZrr:
5285  case X86::VRCP14SSZrm:
5286  case X86::VRCPSHZrr:
5287  case X86::VRCPSHZrm:
5288  case X86::VRSQRTSHZrr:
5289  case X86::VRSQRTSHZrm:
5290  case X86::VREDUCESHZrmi:
5291  case X86::VREDUCESHZrri:
5292  case X86::VREDUCESHZrrib:
5293  case X86::VGETEXPSHZr:
5294  case X86::VGETEXPSHZrb:
5295  case X86::VGETEXPSHZm:
5296  case X86::VGETMANTSHZrri:
5297  case X86::VGETMANTSHZrrib:
5298  case X86::VGETMANTSHZrmi:
5299  case X86::VRNDSCALESHZr:
5300  case X86::VRNDSCALESHZr_Int:
5301  case X86::VRNDSCALESHZrb_Int:
5302  case X86::VRNDSCALESHZm:
5303  case X86::VRNDSCALESHZm_Int:
5304  case