1 //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the X86 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "X86InstrInfo.h"
14 #include "X86.h"
15 #include "X86InstrBuilder.h"
16 #include "X86InstrFoldTables.h"
17 #include "X86MachineFunctionInfo.h"
18 #include "X86Subtarget.h"
19 #include "X86TargetMachine.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/Sequence.h"
30 #include "llvm/CodeGen/StackMaps.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/Function.h"
34 #include "llvm/MC/MCAsmInfo.h"
35 #include "llvm/MC/MCExpr.h"
36 #include "llvm/MC/MCInst.h"
38 #include "llvm/Support/Debug.h"
42 
43 using namespace llvm;
44 
45 #define DEBUG_TYPE "x86-instr-info"
46 
47 #define GET_INSTRINFO_CTOR_DTOR
48 #include "X86GenInstrInfo.inc"
49 
50 static cl::opt<bool>
51  NoFusing("disable-spill-fusing",
52  cl::desc("Disable fusing of spill code into instructions"),
53  cl::Hidden);
54 static cl::opt<bool>
55 PrintFailedFusing("print-failed-fuse-candidates",
56  cl::desc("Print instructions that the allocator wants to"
57  " fuse, but the X86 backend currently can't"),
58  cl::Hidden);
59 static cl::opt<bool>
60 ReMatPICStubLoad("remat-pic-stub-load",
61  cl::desc("Re-materialize load from stub in PIC mode"),
62  cl::init(false), cl::Hidden);
63 static cl::opt<unsigned>
64 PartialRegUpdateClearance("partial-reg-update-clearance",
65  cl::desc("Clearance between two register writes "
66  "for inserting XOR to avoid partial "
67  "register update"),
68  cl::init(64), cl::Hidden);
69 static cl::opt<unsigned>
70 UndefRegClearance("undef-reg-clearance",
71  cl::desc("How many idle instructions we would like before "
72  "certain undef register reads"),
73  cl::init(128), cl::Hidden);
74 
75 
76 // Pin the vtable to this file.
77 void X86InstrInfo::anchor() {}
78 
79 X86InstrInfo::X86InstrInfo(const X86Subtarget &STI)
80  : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
81  : X86::ADJCALLSTACKDOWN32),
82  (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
83  : X86::ADJCALLSTACKUP32),
84  X86::CATCHRET,
85  (STI.is64Bit() ? X86::RETQ : X86::RETL)),
86  Subtarget(STI), RI(STI.getTargetTriple()) {
87 }
88 
89 bool
90 X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
91  Register &SrcReg, Register &DstReg,
92  unsigned &SubIdx) const {
93  switch (MI.getOpcode()) {
94  default: break;
95  case X86::MOVSX16rr8:
96  case X86::MOVZX16rr8:
97  case X86::MOVSX32rr8:
98  case X86::MOVZX32rr8:
99  case X86::MOVSX64rr8:
100  if (!Subtarget.is64Bit())
101  // It's not always legal to reference the low 8 bits of the larger
102  // register in 32-bit mode.
103  return false;
104  LLVM_FALLTHROUGH;
105  case X86::MOVSX32rr16:
106  case X86::MOVZX32rr16:
107  case X86::MOVSX64rr16:
108  case X86::MOVSX64rr32: {
109  if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
110  // Be conservative.
111  return false;
112  SrcReg = MI.getOperand(1).getReg();
113  DstReg = MI.getOperand(0).getReg();
114  switch (MI.getOpcode()) {
115  default: llvm_unreachable("Unreachable!");
116  case X86::MOVSX16rr8:
117  case X86::MOVZX16rr8:
118  case X86::MOVSX32rr8:
119  case X86::MOVZX32rr8:
120  case X86::MOVSX64rr8:
121  SubIdx = X86::sub_8bit;
122  break;
123  case X86::MOVSX32rr16:
124  case X86::MOVZX32rr16:
125  case X86::MOVSX64rr16:
126  SubIdx = X86::sub_16bit;
127  break;
128  case X86::MOVSX64rr32:
129  SubIdx = X86::sub_32bit;
130  break;
131  }
132  return true;
133  }
134  }
135  return false;
136 }
137 
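/// Return true if the instruction has no behavior that depends on the values
/// of its register operands, i.e. it executes in data-invariant (constant)
/// time. Used when hardening code against speculative execution attacks.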
138 bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
139  switch (MI.getOpcode()) {
140  default:
141  // By default, assume that the instruction is not data invariant.
142  return false;
143 
144  // Some target-independent operations that trivially lower to data-invariant
145  // instructions.
146  case TargetOpcode::COPY:
147  case TargetOpcode::INSERT_SUBREG:
148  case TargetOpcode::SUBREG_TO_REG:
149  return true;
150 
151  // On x86 it is believed that imul is constant time w.r.t. its inputs.
152  // However, these instructions set flags and are perhaps the most surprisingly
153  // constant-time operations, so we call them out here separately.
154  case X86::IMUL16rr:
155  case X86::IMUL16rri8:
156  case X86::IMUL16rri:
157  case X86::IMUL32rr:
158  case X86::IMUL32rri8:
159  case X86::IMUL32rri:
160  case X86::IMUL64rr:
161  case X86::IMUL64rri32:
162  case X86::IMUL64rri8:
163 
164  // Bit scanning and counting instructions that are somewhat surprisingly
165  // constant time as they scan across bits and do other fairly complex
166  // operations like popcnt, but are believed to be constant time on x86.
167  // However, these set flags.
168  case X86::BSF16rr:
169  case X86::BSF32rr:
170  case X86::BSF64rr:
171  case X86::BSR16rr:
172  case X86::BSR32rr:
173  case X86::BSR64rr:
174  case X86::LZCNT16rr:
175  case X86::LZCNT32rr:
176  case X86::LZCNT64rr:
177  case X86::POPCNT16rr:
178  case X86::POPCNT32rr:
179  case X86::POPCNT64rr:
180  case X86::TZCNT16rr:
181  case X86::TZCNT32rr:
182  case X86::TZCNT64rr:
183 
184  // Bit manipulation instructions are effectively combinations of basic
185  // arithmetic ops, and should still execute in constant time. These also
186  // set flags.
187  case X86::BLCFILL32rr:
188  case X86::BLCFILL64rr:
189  case X86::BLCI32rr:
190  case X86::BLCI64rr:
191  case X86::BLCIC32rr:
192  case X86::BLCIC64rr:
193  case X86::BLCMSK32rr:
194  case X86::BLCMSK64rr:
195  case X86::BLCS32rr:
196  case X86::BLCS64rr:
197  case X86::BLSFILL32rr:
198  case X86::BLSFILL64rr:
199  case X86::BLSI32rr:
200  case X86::BLSI64rr:
201  case X86::BLSIC32rr:
202  case X86::BLSIC64rr:
203  case X86::BLSMSK32rr:
204  case X86::BLSMSK64rr:
205  case X86::BLSR32rr:
206  case X86::BLSR64rr:
207  case X86::TZMSK32rr:
208  case X86::TZMSK64rr:
209 
210  // Bit extracting and clearing instructions should execute in constant time,
211  // and set flags.
212  case X86::BEXTR32rr:
213  case X86::BEXTR64rr:
214  case X86::BEXTRI32ri:
215  case X86::BEXTRI64ri:
216  case X86::BZHI32rr:
217  case X86::BZHI64rr:
218 
219  // Shift and rotate.
220  case X86::ROL8r1:
221  case X86::ROL16r1:
222  case X86::ROL32r1:
223  case X86::ROL64r1:
224  case X86::ROL8rCL:
225  case X86::ROL16rCL:
226  case X86::ROL32rCL:
227  case X86::ROL64rCL:
228  case X86::ROL8ri:
229  case X86::ROL16ri:
230  case X86::ROL32ri:
231  case X86::ROL64ri:
232  case X86::ROR8r1:
233  case X86::ROR16r1:
234  case X86::ROR32r1:
235  case X86::ROR64r1:
236  case X86::ROR8rCL:
237  case X86::ROR16rCL:
238  case X86::ROR32rCL:
239  case X86::ROR64rCL:
240  case X86::ROR8ri:
241  case X86::ROR16ri:
242  case X86::ROR32ri:
243  case X86::ROR64ri:
244  case X86::SAR8r1:
245  case X86::SAR16r1:
246  case X86::SAR32r1:
247  case X86::SAR64r1:
248  case X86::SAR8rCL:
249  case X86::SAR16rCL:
250  case X86::SAR32rCL:
251  case X86::SAR64rCL:
252  case X86::SAR8ri:
253  case X86::SAR16ri:
254  case X86::SAR32ri:
255  case X86::SAR64ri:
256  case X86::SHL8r1:
257  case X86::SHL16r1:
258  case X86::SHL32r1:
259  case X86::SHL64r1:
260  case X86::SHL8rCL:
261  case X86::SHL16rCL:
262  case X86::SHL32rCL:
263  case X86::SHL64rCL:
264  case X86::SHL8ri:
265  case X86::SHL16ri:
266  case X86::SHL32ri:
267  case X86::SHL64ri:
268  case X86::SHR8r1:
269  case X86::SHR16r1:
270  case X86::SHR32r1:
271  case X86::SHR64r1:
272  case X86::SHR8rCL:
273  case X86::SHR16rCL:
274  case X86::SHR32rCL:
275  case X86::SHR64rCL:
276  case X86::SHR8ri:
277  case X86::SHR16ri:
278  case X86::SHR32ri:
279  case X86::SHR64ri:
280  case X86::SHLD16rrCL:
281  case X86::SHLD32rrCL:
282  case X86::SHLD64rrCL:
283  case X86::SHLD16rri8:
284  case X86::SHLD32rri8:
285  case X86::SHLD64rri8:
286  case X86::SHRD16rrCL:
287  case X86::SHRD32rrCL:
288  case X86::SHRD64rrCL:
289  case X86::SHRD16rri8:
290  case X86::SHRD32rri8:
291  case X86::SHRD64rri8:
292 
293  // Basic arithmetic is constant time on the input but does set flags.
294  case X86::ADC8rr:
295  case X86::ADC8ri:
296  case X86::ADC16rr:
297  case X86::ADC16ri:
298  case X86::ADC16ri8:
299  case X86::ADC32rr:
300  case X86::ADC32ri:
301  case X86::ADC32ri8:
302  case X86::ADC64rr:
303  case X86::ADC64ri8:
304  case X86::ADC64ri32:
305  case X86::ADD8rr:
306  case X86::ADD8ri:
307  case X86::ADD16rr:
308  case X86::ADD16ri:
309  case X86::ADD16ri8:
310  case X86::ADD32rr:
311  case X86::ADD32ri:
312  case X86::ADD32ri8:
313  case X86::ADD64rr:
314  case X86::ADD64ri8:
315  case X86::ADD64ri32:
316  case X86::AND8rr:
317  case X86::AND8ri:
318  case X86::AND16rr:
319  case X86::AND16ri:
320  case X86::AND16ri8:
321  case X86::AND32rr:
322  case X86::AND32ri:
323  case X86::AND32ri8:
324  case X86::AND64rr:
325  case X86::AND64ri8:
326  case X86::AND64ri32:
327  case X86::OR8rr:
328  case X86::OR8ri:
329  case X86::OR16rr:
330  case X86::OR16ri:
331  case X86::OR16ri8:
332  case X86::OR32rr:
333  case X86::OR32ri:
334  case X86::OR32ri8:
335  case X86::OR64rr:
336  case X86::OR64ri8:
337  case X86::OR64ri32:
338  case X86::SBB8rr:
339  case X86::SBB8ri:
340  case X86::SBB16rr:
341  case X86::SBB16ri:
342  case X86::SBB16ri8:
343  case X86::SBB32rr:
344  case X86::SBB32ri:
345  case X86::SBB32ri8:
346  case X86::SBB64rr:
347  case X86::SBB64ri8:
348  case X86::SBB64ri32:
349  case X86::SUB8rr:
350  case X86::SUB8ri:
351  case X86::SUB16rr:
352  case X86::SUB16ri:
353  case X86::SUB16ri8:
354  case X86::SUB32rr:
355  case X86::SUB32ri:
356  case X86::SUB32ri8:
357  case X86::SUB64rr:
358  case X86::SUB64ri8:
359  case X86::SUB64ri32:
360  case X86::XOR8rr:
361  case X86::XOR8ri:
362  case X86::XOR16rr:
363  case X86::XOR16ri:
364  case X86::XOR16ri8:
365  case X86::XOR32rr:
366  case X86::XOR32ri:
367  case X86::XOR32ri8:
368  case X86::XOR64rr:
369  case X86::XOR64ri8:
370  case X86::XOR64ri32:
371  // Arithmetic with just 32-bit and 64-bit variants and no immediates.
372  case X86::ADCX32rr:
373  case X86::ADCX64rr:
374  case X86::ADOX32rr:
375  case X86::ADOX64rr:
376  case X86::ANDN32rr:
377  case X86::ANDN64rr:
378  // Unary arithmetic operations.
379  case X86::DEC8r:
380  case X86::DEC16r:
381  case X86::DEC32r:
382  case X86::DEC64r:
383  case X86::INC8r:
384  case X86::INC16r:
385  case X86::INC32r:
386  case X86::INC64r:
387  case X86::NEG8r:
388  case X86::NEG16r:
389  case X86::NEG32r:
390  case X86::NEG64r:
391 
392  // Unlike other arithmetic, NOT doesn't set EFLAGS.
393  case X86::NOT8r:
394  case X86::NOT16r:
395  case X86::NOT32r:
396  case X86::NOT64r:
397 
398  // Various move instructions used to zero or sign extend things. Note that we
399  // intentionally don't support the _NOREX variants as we can't handle that
400  // register constraint anyways.
401  case X86::MOVSX16rr8:
402  case X86::MOVSX32rr8:
403  case X86::MOVSX32rr16:
404  case X86::MOVSX64rr8:
405  case X86::MOVSX64rr16:
406  case X86::MOVSX64rr32:
407  case X86::MOVZX16rr8:
408  case X86::MOVZX32rr8:
409  case X86::MOVZX32rr16:
410  case X86::MOVZX64rr8:
411  case X86::MOVZX64rr16:
412  case X86::MOV32rr:
413 
414  // Arithmetic instructions that are both constant time and don't set flags.
415  case X86::RORX32ri:
416  case X86::RORX64ri:
417  case X86::SARX32rr:
418  case X86::SARX64rr:
419  case X86::SHLX32rr:
420  case X86::SHLX64rr:
421  case X86::SHRX32rr:
422  case X86::SHRX64rr:
423 
424  // LEA doesn't actually access memory, and its arithmetic is constant time.
425  case X86::LEA16r:
426  case X86::LEA32r:
427  case X86::LEA64_32r:
428  case X86::LEA64r:
429  return true;
430  }
431 }
432 
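/// Return true if the instruction's behavior does not depend on the value it
/// loads from memory (beyond producing that value), so the loaded data is not
/// immediately leaked. Used by speculative load hardening.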
433 bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
434  switch (MI.getOpcode()) {
435  default:
436  // By default, assume that the load will immediately leak.
437  return false;
438 
439  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
440  // However, they set flags and are perhaps the most surprisingly constant
441  // time operations so we call them out here separately.
442  case X86::IMUL16rm:
443  case X86::IMUL16rmi8:
444  case X86::IMUL16rmi:
445  case X86::IMUL32rm:
446  case X86::IMUL32rmi8:
447  case X86::IMUL32rmi:
448  case X86::IMUL64rm:
449  case X86::IMUL64rmi32:
450  case X86::IMUL64rmi8:
451 
452  // Bit scanning and counting instructions that are somewhat surprisingly
453  // constant time as they scan across bits and do other fairly complex
454  // operations like popcnt, but are believed to be constant time on x86.
455  // However, these set flags.
456  case X86::BSF16rm:
457  case X86::BSF32rm:
458  case X86::BSF64rm:
459  case X86::BSR16rm:
460  case X86::BSR32rm:
461  case X86::BSR64rm:
462  case X86::LZCNT16rm:
463  case X86::LZCNT32rm:
464  case X86::LZCNT64rm:
465  case X86::POPCNT16rm:
466  case X86::POPCNT32rm:
467  case X86::POPCNT64rm:
468  case X86::TZCNT16rm:
469  case X86::TZCNT32rm:
470  case X86::TZCNT64rm:
471 
472  // Bit manipulation instructions are effectively combinations of basic
473  // arithmetic ops, and should still execute in constant time. These also
474  // set flags.
475  case X86::BLCFILL32rm:
476  case X86::BLCFILL64rm:
477  case X86::BLCI32rm:
478  case X86::BLCI64rm:
479  case X86::BLCIC32rm:
480  case X86::BLCIC64rm:
481  case X86::BLCMSK32rm:
482  case X86::BLCMSK64rm:
483  case X86::BLCS32rm:
484  case X86::BLCS64rm:
485  case X86::BLSFILL32rm:
486  case X86::BLSFILL64rm:
487  case X86::BLSI32rm:
488  case X86::BLSI64rm:
489  case X86::BLSIC32rm:
490  case X86::BLSIC64rm:
491  case X86::BLSMSK32rm:
492  case X86::BLSMSK64rm:
493  case X86::BLSR32rm:
494  case X86::BLSR64rm:
495  case X86::TZMSK32rm:
496  case X86::TZMSK64rm:
497 
498  // Bit extracting and clearing instructions should execute in constant time,
499  // and set flags.
500  case X86::BEXTR32rm:
501  case X86::BEXTR64rm:
502  case X86::BEXTRI32mi:
503  case X86::BEXTRI64mi:
504  case X86::BZHI32rm:
505  case X86::BZHI64rm:
506 
507  // Basic arithmetic is constant time on the input but does set flags.
508  case X86::ADC8rm:
509  case X86::ADC16rm:
510  case X86::ADC32rm:
511  case X86::ADC64rm:
512  case X86::ADCX32rm:
513  case X86::ADCX64rm:
514  case X86::ADD8rm:
515  case X86::ADD16rm:
516  case X86::ADD32rm:
517  case X86::ADD64rm:
518  case X86::ADOX32rm:
519  case X86::ADOX64rm:
520  case X86::AND8rm:
521  case X86::AND16rm:
522  case X86::AND32rm:
523  case X86::AND64rm:
524  case X86::ANDN32rm:
525  case X86::ANDN64rm:
526  case X86::OR8rm:
527  case X86::OR16rm:
528  case X86::OR32rm:
529  case X86::OR64rm:
530  case X86::SBB8rm:
531  case X86::SBB16rm:
532  case X86::SBB32rm:
533  case X86::SBB64rm:
534  case X86::SUB8rm:
535  case X86::SUB16rm:
536  case X86::SUB32rm:
537  case X86::SUB64rm:
538  case X86::XOR8rm:
539  case X86::XOR16rm:
540  case X86::XOR32rm:
541  case X86::XOR64rm:
542 
543  // Integer multiply w/o affecting flags is still believed to be constant
544  // time on x86. Called out separately as this is among the most surprising
545  // instructions to exhibit that behavior.
546  case X86::MULX32rm:
547  case X86::MULX64rm:
548 
549  // Arithmetic instructions that are both constant time and don't set flags.
550  case X86::RORX32mi:
551  case X86::RORX64mi:
552  case X86::SARX32rm:
553  case X86::SARX64rm:
554  case X86::SHLX32rm:
555  case X86::SHLX64rm:
556  case X86::SHRX32rm:
557  case X86::SHRX64rm:
558 
559  // Conversions are believed to be constant time and don't set flags.
560  case X86::CVTTSD2SI64rm:
561  case X86::VCVTTSD2SI64rm:
562  case X86::VCVTTSD2SI64Zrm:
563  case X86::CVTTSD2SIrm:
564  case X86::VCVTTSD2SIrm:
565  case X86::VCVTTSD2SIZrm:
566  case X86::CVTTSS2SI64rm:
567  case X86::VCVTTSS2SI64rm:
568  case X86::VCVTTSS2SI64Zrm:
569  case X86::CVTTSS2SIrm:
570  case X86::VCVTTSS2SIrm:
571  case X86::VCVTTSS2SIZrm:
572  case X86::CVTSI2SDrm:
573  case X86::VCVTSI2SDrm:
574  case X86::VCVTSI2SDZrm:
575  case X86::CVTSI2SSrm:
576  case X86::VCVTSI2SSrm:
577  case X86::VCVTSI2SSZrm:
578  case X86::CVTSI642SDrm:
579  case X86::VCVTSI642SDrm:
580  case X86::VCVTSI642SDZrm:
581  case X86::CVTSI642SSrm:
582  case X86::VCVTSI642SSrm:
583  case X86::VCVTSI642SSZrm:
584  case X86::CVTSS2SDrm:
585  case X86::VCVTSS2SDrm:
586  case X86::VCVTSS2SDZrm:
587  case X86::CVTSD2SSrm:
588  case X86::VCVTSD2SSrm:
589  case X86::VCVTSD2SSZrm:
590  // AVX512 added unsigned integer conversions.
591  case X86::VCVTTSD2USI64Zrm:
592  case X86::VCVTTSD2USIZrm:
593  case X86::VCVTTSS2USI64Zrm:
594  case X86::VCVTTSS2USIZrm:
595  case X86::VCVTUSI2SDZrm:
596  case X86::VCVTUSI642SDZrm:
597  case X86::VCVTUSI2SSZrm:
598  case X86::VCVTUSI642SSZrm:
599 
600  // Loads to register don't set flags.
601  case X86::MOV8rm:
602  case X86::MOV8rm_NOREX:
603  case X86::MOV16rm:
604  case X86::MOV32rm:
605  case X86::MOV64rm:
606  case X86::MOVSX16rm8:
607  case X86::MOVSX32rm16:
608  case X86::MOVSX32rm8:
609  case X86::MOVSX32rm8_NOREX:
610  case X86::MOVSX64rm16:
611  case X86::MOVSX64rm32:
612  case X86::MOVSX64rm8:
613  case X86::MOVZX16rm8:
614  case X86::MOVZX32rm16:
615  case X86::MOVZX32rm8:
616  case X86::MOVZX32rm8_NOREX:
617  case X86::MOVZX64rm16:
618  case X86::MOVZX64rm8:
619  return true;
620  }
621 }
622 
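/// Return the amount by which this instruction adjusts the stack pointer:
/// frame setup/destroy pseudos report their (aligned) frame size, calls are
/// resolved via the following ADJCALLSTACKUP pseudo, and bare PUSHes report
/// the width of the pushed slot.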
623 int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
624  const MachineFunction *MF = MI.getParent()->getParent();
625  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
626 
627  if (isFrameInstr(MI)) {
628  int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
629  SPAdj -= getFrameAdjustment(MI);
630  if (!isFrameSetup(MI))
631  SPAdj = -SPAdj;
632  return SPAdj;
633  }
634 
635  // To know whether a call adjusts the stack, we need information
636  // that is bound to the following ADJCALLSTACKUP pseudo.
637  // Look for the next ADJCALLSTACKUP that follows the call.
638  if (MI.isCall()) {
639  const MachineBasicBlock *MBB = MI.getParent();
640  auto I = ++MachineBasicBlock::const_iterator(MI);
641  for (auto E = MBB->end(); I != E; ++I) {
642  if (I->getOpcode() == getCallFrameDestroyOpcode() ||
643  I->isCall())
644  break;
645  }
646 
647  // If we could not find a frame destroy opcode, then it has already
648  // been simplified, so we don't care.
649  if (I->getOpcode() != getCallFrameDestroyOpcode())
650  return 0;
651 
652  return -(I->getOperand(1).getImm());
653  }
654 
655  // Currently handle only PUSHes we can reasonably expect to see
656  // in call sequences
657  switch (MI.getOpcode()) {
658  default:
659  return 0;
660  case X86::PUSH32i8:
661  case X86::PUSH32r:
662  case X86::PUSH32rmm:
663  case X86::PUSH32rmr:
664  case X86::PUSHi32:
665  return 4;
666  case X86::PUSH64i8:
667  case X86::PUSH64r:
668  case X86::PUSH64rmm:
669  case X86::PUSH64rmr:
670  case X86::PUSH64i32:
671  return 8;
672  }
673 }
674 
675 /// Return true and the FrameIndex if the specified
676 /// operand and the operands that follow it form a reference to the stack frame.
677 bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
678  int &FrameIndex) const {
679  if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
680  MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
681  MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
682  MI.getOperand(Op + X86::AddrDisp).isImm() &&
683  MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
684  MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
685  MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
686  FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
687  return true;
688  }
689  return false;
690 }
691 
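/// Return true if Opcode is a plain register load, setting MemBytes to the
/// number of bytes read. Used to recognize reloads from stack slots.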
692 static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
693  switch (Opcode) {
694  default:
695  return false;
696  case X86::MOV8rm:
697  case X86::KMOVBkm:
698  MemBytes = 1;
699  return true;
700  case X86::MOV16rm:
701  case X86::KMOVWkm:
702  MemBytes = 2;
703  return true;
704  case X86::MOV32rm:
705  case X86::MOVSSrm:
706  case X86::MOVSSrm_alt:
707  case X86::VMOVSSrm:
708  case X86::VMOVSSrm_alt:
709  case X86::VMOVSSZrm:
710  case X86::VMOVSSZrm_alt:
711  case X86::KMOVDkm:
712  MemBytes = 4;
713  return true;
714  case X86::MOV64rm:
715  case X86::LD_Fp64m:
716  case X86::MOVSDrm:
717  case X86::MOVSDrm_alt:
718  case X86::VMOVSDrm:
719  case X86::VMOVSDrm_alt:
720  case X86::VMOVSDZrm:
721  case X86::VMOVSDZrm_alt:
722  case X86::MMX_MOVD64rm:
723  case X86::MMX_MOVQ64rm:
724  case X86::KMOVQkm:
725  MemBytes = 8;
726  return true;
727  case X86::MOVAPSrm:
728  case X86::MOVUPSrm:
729  case X86::MOVAPDrm:
730  case X86::MOVUPDrm:
731  case X86::MOVDQArm:
732  case X86::MOVDQUrm:
733  case X86::VMOVAPSrm:
734  case X86::VMOVUPSrm:
735  case X86::VMOVAPDrm:
736  case X86::VMOVUPDrm:
737  case X86::VMOVDQArm:
738  case X86::VMOVDQUrm:
739  case X86::VMOVAPSZ128rm:
740  case X86::VMOVUPSZ128rm:
741  case X86::VMOVAPSZ128rm_NOVLX:
742  case X86::VMOVUPSZ128rm_NOVLX:
743  case X86::VMOVAPDZ128rm:
744  case X86::VMOVUPDZ128rm:
745  case X86::VMOVDQU8Z128rm:
746  case X86::VMOVDQU16Z128rm:
747  case X86::VMOVDQA32Z128rm:
748  case X86::VMOVDQU32Z128rm:
749  case X86::VMOVDQA64Z128rm:
750  case X86::VMOVDQU64Z128rm:
751  MemBytes = 16;
752  return true;
753  case X86::VMOVAPSYrm:
754  case X86::VMOVUPSYrm:
755  case X86::VMOVAPDYrm:
756  case X86::VMOVUPDYrm:
757  case X86::VMOVDQAYrm:
758  case X86::VMOVDQUYrm:
759  case X86::VMOVAPSZ256rm:
760  case X86::VMOVUPSZ256rm:
761  case X86::VMOVAPSZ256rm_NOVLX:
762  case X86::VMOVUPSZ256rm_NOVLX:
763  case X86::VMOVAPDZ256rm:
764  case X86::VMOVUPDZ256rm:
765  case X86::VMOVDQU8Z256rm:
766  case X86::VMOVDQU16Z256rm:
767  case X86::VMOVDQA32Z256rm:
768  case X86::VMOVDQU32Z256rm:
769  case X86::VMOVDQA64Z256rm:
770  case X86::VMOVDQU64Z256rm:
771  MemBytes = 32;
772  return true;
773  case X86::VMOVAPSZrm:
774  case X86::VMOVUPSZrm:
775  case X86::VMOVAPDZrm:
776  case X86::VMOVUPDZrm:
777  case X86::VMOVDQU8Zrm:
778  case X86::VMOVDQU16Zrm:
779  case X86::VMOVDQA32Zrm:
780  case X86::VMOVDQU32Zrm:
781  case X86::VMOVDQA64Zrm:
782  case X86::VMOVDQU64Zrm:
783  MemBytes = 64;
784  return true;
785  }
786 }
787 
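/// Return true if Opcode is a plain register store, setting MemBytes to the
/// number of bytes written. Used to recognize spills to stack slots.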
788 static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
789  switch (Opcode) {
790  default:
791  return false;
792  case X86::MOV8mr:
793  case X86::KMOVBmk:
794  MemBytes = 1;
795  return true;
796  case X86::MOV16mr:
797  case X86::KMOVWmk:
798  MemBytes = 2;
799  return true;
800  case X86::MOV32mr:
801  case X86::MOVSSmr:
802  case X86::VMOVSSmr:
803  case X86::VMOVSSZmr:
804  case X86::KMOVDmk:
805  MemBytes = 4;
806  return true;
807  case X86::MOV64mr:
808  case X86::ST_FpP64m:
809  case X86::MOVSDmr:
810  case X86::VMOVSDmr:
811  case X86::VMOVSDZmr:
812  case X86::MMX_MOVD64mr:
813  case X86::MMX_MOVQ64mr:
814  case X86::MMX_MOVNTQmr:
815  case X86::KMOVQmk:
816  MemBytes = 8;
817  return true;
818  case X86::MOVAPSmr:
819  case X86::MOVUPSmr:
820  case X86::MOVAPDmr:
821  case X86::MOVUPDmr:
822  case X86::MOVDQAmr:
823  case X86::MOVDQUmr:
824  case X86::VMOVAPSmr:
825  case X86::VMOVUPSmr:
826  case X86::VMOVAPDmr:
827  case X86::VMOVUPDmr:
828  case X86::VMOVDQAmr:
829  case X86::VMOVDQUmr:
830  case X86::VMOVUPSZ128mr:
831  case X86::VMOVAPSZ128mr:
832  case X86::VMOVUPSZ128mr_NOVLX:
833  case X86::VMOVAPSZ128mr_NOVLX:
834  case X86::VMOVUPDZ128mr:
835  case X86::VMOVAPDZ128mr:
836  case X86::VMOVDQA32Z128mr:
837  case X86::VMOVDQU32Z128mr:
838  case X86::VMOVDQA64Z128mr:
839  case X86::VMOVDQU64Z128mr:
840  case X86::VMOVDQU8Z128mr:
841  case X86::VMOVDQU16Z128mr:
842  MemBytes = 16;
843  return true;
844  case X86::VMOVUPSYmr:
845  case X86::VMOVAPSYmr:
846  case X86::VMOVUPDYmr:
847  case X86::VMOVAPDYmr:
848  case X86::VMOVDQUYmr:
849  case X86::VMOVDQAYmr:
850  case X86::VMOVUPSZ256mr:
851  case X86::VMOVAPSZ256mr:
852  case X86::VMOVUPSZ256mr_NOVLX:
853  case X86::VMOVAPSZ256mr_NOVLX:
854  case X86::VMOVUPDZ256mr:
855  case X86::VMOVAPDZ256mr:
856  case X86::VMOVDQU8Z256mr:
857  case X86::VMOVDQU16Z256mr:
858  case X86::VMOVDQA32Z256mr:
859  case X86::VMOVDQU32Z256mr:
860  case X86::VMOVDQA64Z256mr:
861  case X86::VMOVDQU64Z256mr:
862  MemBytes = 32;
863  return true;
864  case X86::VMOVUPSZmr:
865  case X86::VMOVAPSZmr:
866  case X86::VMOVUPDZmr:
867  case X86::VMOVAPDZmr:
868  case X86::VMOVDQU8Zmr:
869  case X86::VMOVDQU16Zmr:
870  case X86::VMOVDQA32Zmr:
871  case X86::VMOVDQU32Zmr:
872  case X86::VMOVDQA64Zmr:
873  case X86::VMOVDQU64Zmr:
874  MemBytes = 64;
875  return true;
876  }
877  return false;
878 }
879 
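/// If MI is a direct load from a stack slot, return the register it defines
/// and set FrameIndex; the overload taking MemBytes also reports the access
/// size in bytes.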
880 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
881  int &FrameIndex) const {
882  unsigned Dummy;
883  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
884 }
885 
886 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
887  int &FrameIndex,
888  unsigned &MemBytes) const {
889  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
890  if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
891  return MI.getOperand(0).getReg();
892  return 0;
893 }
894 
895 unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
896  int &FrameIndex) const {
897  unsigned Dummy;
898  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
899  unsigned Reg;
900  if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
901  return Reg;
902  // Check for post-frame index elimination operations
903  SmallVector<const MachineMemOperand *, 1> Accesses;
904  if (hasLoadFromStackSlot(MI, Accesses)) {
905  FrameIndex =
906  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
907  ->getFrameIndex();
908  return 1;
909  }
910  }
911  return 0;
912 }
913 
914 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
915  int &FrameIndex) const {
916  unsigned Dummy;
917  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
918 }
919 
920 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
921  int &FrameIndex,
922  unsigned &MemBytes) const {
923  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
924  if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
925  isFrameOperand(MI, 0, FrameIndex))
926  return MI.getOperand(X86::AddrNumOperands).getReg();
927  return 0;
928 }
929 
930 unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
931  int &FrameIndex) const {
932  unsigned Dummy;
933  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
934  unsigned Reg;
935  if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
936  return Reg;
937  // Check for post-frame index elimination operations
939  if (hasStoreToStackSlot(MI, Accesses)) {
940  FrameIndex =
941  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
942  ->getFrameIndex();
943  return 1;
944  }
945  }
946  return 0;
947 }
948 
949 /// Return true if the register is a PIC base, i.e. defined by X86::MOVPC32r.
950 static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
951  // Don't waste compile time scanning use-def chains of physregs.
952  if (!BaseReg.isVirtual())
953  return false;
954  bool isPICBase = false;
955  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
956  E = MRI.def_instr_end(); I != E; ++I) {
957  MachineInstr *DefMI = &*I;
958  if (DefMI->getOpcode() != X86::MOVPC32r)
959  return false;
960  assert(!isPICBase && "More than one PIC base?");
961  isPICBase = true;
962  }
963  return isPICBase;
964 }
965 
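/// Return true if MI can be safely re-materialized at a new location:
/// constant materializations, invariant loads (e.g. from the constant pool),
/// and LEAs whose base register is absent, RIP, or the PIC base.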
966 bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
967  AAResults *AA) const {
968  switch (MI.getOpcode()) {
969  default:
970  // This function should only be called for opcodes with the ReMaterializable
971  // flag set.
972  llvm_unreachable("Unknown rematerializable operation!");
973  break;
974 
975  case X86::LOAD_STACK_GUARD:
976  case X86::AVX1_SETALLONES:
977  case X86::AVX2_SETALLONES:
978  case X86::AVX512_128_SET0:
979  case X86::AVX512_256_SET0:
980  case X86::AVX512_512_SET0:
981  case X86::AVX512_512_SETALLONES:
982  case X86::AVX512_FsFLD0SD:
983  case X86::AVX512_FsFLD0SS:
984  case X86::AVX512_FsFLD0F128:
985  case X86::AVX_SET0:
986  case X86::FsFLD0SD:
987  case X86::FsFLD0SS:
988  case X86::FsFLD0F128:
989  case X86::KSET0D:
990  case X86::KSET0Q:
991  case X86::KSET0W:
992  case X86::KSET1D:
993  case X86::KSET1Q:
994  case X86::KSET1W:
995  case X86::MMX_SET0:
996  case X86::MOV32ImmSExti8:
997  case X86::MOV32r0:
998  case X86::MOV32r1:
999  case X86::MOV32r_1:
1000  case X86::MOV32ri64:
1001  case X86::MOV64ImmSExti8:
1002  case X86::V_SET0:
1003  case X86::V_SETALLONES:
1004  case X86::MOV16ri:
1005  case X86::MOV32ri:
1006  case X86::MOV64ri:
1007  case X86::MOV64ri32:
1008  case X86::MOV8ri:
1009  case X86::PTILEZEROV:
1010  return true;
1011 
1012  case X86::MOV8rm:
1013  case X86::MOV8rm_NOREX:
1014  case X86::MOV16rm:
1015  case X86::MOV32rm:
1016  case X86::MOV64rm:
1017  case X86::MOVSSrm:
1018  case X86::MOVSSrm_alt:
1019  case X86::MOVSDrm:
1020  case X86::MOVSDrm_alt:
1021  case X86::MOVAPSrm:
1022  case X86::MOVUPSrm:
1023  case X86::MOVAPDrm:
1024  case X86::MOVUPDrm:
1025  case X86::MOVDQArm:
1026  case X86::MOVDQUrm:
1027  case X86::VMOVSSrm:
1028  case X86::VMOVSSrm_alt:
1029  case X86::VMOVSDrm:
1030  case X86::VMOVSDrm_alt:
1031  case X86::VMOVAPSrm:
1032  case X86::VMOVUPSrm:
1033  case X86::VMOVAPDrm:
1034  case X86::VMOVUPDrm:
1035  case X86::VMOVDQArm:
1036  case X86::VMOVDQUrm:
1037  case X86::VMOVAPSYrm:
1038  case X86::VMOVUPSYrm:
1039  case X86::VMOVAPDYrm:
1040  case X86::VMOVUPDYrm:
1041  case X86::VMOVDQAYrm:
1042  case X86::VMOVDQUYrm:
1043  case X86::MMX_MOVD64rm:
1044  case X86::MMX_MOVQ64rm:
1045  // AVX-512
1046  case X86::VMOVSSZrm:
1047  case X86::VMOVSSZrm_alt:
1048  case X86::VMOVSDZrm:
1049  case X86::VMOVSDZrm_alt:
1050  case X86::VMOVAPDZ128rm:
1051  case X86::VMOVAPDZ256rm:
1052  case X86::VMOVAPDZrm:
1053  case X86::VMOVAPSZ128rm:
1054  case X86::VMOVAPSZ256rm:
1055  case X86::VMOVAPSZ128rm_NOVLX:
1056  case X86::VMOVAPSZ256rm_NOVLX:
1057  case X86::VMOVAPSZrm:
1058  case X86::VMOVDQA32Z128rm:
1059  case X86::VMOVDQA32Z256rm:
1060  case X86::VMOVDQA32Zrm:
1061  case X86::VMOVDQA64Z128rm:
1062  case X86::VMOVDQA64Z256rm:
1063  case X86::VMOVDQA64Zrm:
1064  case X86::VMOVDQU16Z128rm:
1065  case X86::VMOVDQU16Z256rm:
1066  case X86::VMOVDQU16Zrm:
1067  case X86::VMOVDQU32Z128rm:
1068  case X86::VMOVDQU32Z256rm:
1069  case X86::VMOVDQU32Zrm:
1070  case X86::VMOVDQU64Z128rm:
1071  case X86::VMOVDQU64Z256rm:
1072  case X86::VMOVDQU64Zrm:
1073  case X86::VMOVDQU8Z128rm:
1074  case X86::VMOVDQU8Z256rm:
1075  case X86::VMOVDQU8Zrm:
1076  case X86::VMOVUPDZ128rm:
1077  case X86::VMOVUPDZ256rm:
1078  case X86::VMOVUPDZrm:
1079  case X86::VMOVUPSZ128rm:
1080  case X86::VMOVUPSZ256rm:
1081  case X86::VMOVUPSZ128rm_NOVLX:
1082  case X86::VMOVUPSZ256rm_NOVLX:
1083  case X86::VMOVUPSZrm: {
1084  // Loads from constant pools are trivially rematerializable.
1085  if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
1086  MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
1087  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
1088  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
1089  MI.isDereferenceableInvariantLoad(AA)) {
1090  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
1091  if (BaseReg == 0 || BaseReg == X86::RIP)
1092  return true;
1093  // Allow re-materialization of PIC load.
1094  if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
1095  return false;
1096  const MachineFunction &MF = *MI.getParent()->getParent();
1097  const MachineRegisterInfo &MRI = MF.getRegInfo();
1098  return regIsPICBase(BaseReg, MRI);
1099  }
1100  return false;
1101  }
1102 
1103  case X86::LEA32r:
1104  case X86::LEA64r: {
1105  if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
1106  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
1107  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
1108  !MI.getOperand(1 + X86::AddrDisp).isReg()) {
1109  // lea fi#, lea GV, etc. are all rematerializable.
1110  if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
1111  return true;
1112  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
1113  if (BaseReg == 0)
1114  return true;
1115  // Allow re-materialization of lea PICBase + x.
1116  const MachineFunction &MF = *MI.getParent()->getParent();
1117  const MachineRegisterInfo &MRI = MF.getRegInfo();
1118  return regIsPICBase(BaseReg, MRI);
1119  }
1120  return false;
1121  }
1122  }
1123 }
1124 
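/// Re-materialize Orig at the given insertion point. MOV32r0/MOV32r1/MOV32r_1
/// are rewritten to a plain MOV32ri when EFLAGS is live there, because their
/// normal expansions clobber EFLAGS.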
1125 void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
1126  MachineBasicBlock::iterator I,
1127  Register DestReg, unsigned SubIdx,
1128  const MachineInstr &Orig,
1129  const TargetRegisterInfo &TRI) const {
1130  bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
1131  if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
1132  MachineBasicBlock::LQR_Dead) {
1133  // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
1134  // effects.
1135  int Value;
1136  switch (Orig.getOpcode()) {
1137  case X86::MOV32r0: Value = 0; break;
1138  case X86::MOV32r1: Value = 1; break;
1139  case X86::MOV32r_1: Value = -1; break;
1140  default:
1141  llvm_unreachable("Unexpected instruction!");
1142  }
1143 
1144  const DebugLoc &DL = Orig.getDebugLoc();
1145  BuildMI(MBB, I, DL, get(X86::MOV32ri))
1146  .add(Orig.getOperand(0))
1147  .addImm(Value);
1148  } else {
1149  MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
1150  MBB.insert(I, MI);
1151  }
1152 
1153  MachineInstr &NewMI = *std::prev(I);
1154  NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
1155 }
1156 
1157 /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
1158 static bool hasLiveCondCodeDef(MachineInstr &MI) {
1159  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1160  MachineOperand &MO = MI.getOperand(i);
1161  if (MO.isReg() && MO.isDef() &&
1162  MO.getReg() == X86::EFLAGS && !MO.isDead()) {
1163  return true;
1164  }
1165  }
1166  return false;
1167 }
1168 
1169 /// Return the shift count of a machine operand, truncated to the bits the hardware actually uses.
1170 inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1171  unsigned ShiftAmtOperandIdx) {
1172  // The shift count is six bits with the REX.W prefix and five bits without.
1173  unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1174  unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1175  return Imm & ShiftCountMask;
1176 }
1177 
1178 /// Check whether the given shift count can be represented by the scale field
1179 /// of a LEA instruction.
1180 inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
1181  // Left shift instructions can be transformed into load-effective-address
1182  // instructions if we can encode them appropriately.
1183  // A LEA instruction utilizes a SIB byte to encode its scale factor.
1184  // The SIB.scale field is two bits wide which means that we can encode any
1185  // shift amount less than 4.
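 // For example, a left shift by 3 maps to scale 8 and can become
 // "lea (,%reg,8), %dst", while a shift by 4 would need scale 16, which the
 // SIB byte cannot encode.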
1186  return ShAmt < 4 && ShAmt > 0;
1187 }
1188 
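/// Classify the source operand of a prospective LEA: pick a register of the
/// class the LEA requires (optionally widening a 32-bit source to a 64-bit
/// vreg for LEA64_32r), report whether it is killed, and return false if the
/// operand cannot be used (e.g. it would force SP into the address).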
1189 bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
1190  unsigned Opc, bool AllowSP, Register &NewSrc,
1191  bool &isKill, MachineOperand &ImplicitOp,
1192  LiveVariables *LV) const {
1193  MachineFunction &MF = *MI.getParent()->getParent();
1194  const TargetRegisterClass *RC;
1195  if (AllowSP) {
1196  RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1197  } else {
1198  RC = Opc != X86::LEA32r ?
1199  &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1200  }
1201  Register SrcReg = Src.getReg();
1202 
1203  // For both LEA64 and LEA32 the register already has essentially the right
1204  // type (32-bit or 64-bit); we may just need to forbid SP.
1205  if (Opc != X86::LEA64_32r) {
1206  NewSrc = SrcReg;
1207  isKill = Src.isKill();
1208  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1209 
1210  if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1211  return false;
1212 
1213  return true;
1214  }
1215 
1216  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1217  // another we need to add 64-bit registers to the final MI.
1218  if (SrcReg.isPhysical()) {
1219  ImplicitOp = Src;
1220  ImplicitOp.setImplicit();
1221 
1222  NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
1223  isKill = Src.isKill();
1224  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1225  } else {
1226  // Virtual register of the wrong class, we have to create a temporary 64-bit
1227  // vreg to feed into the LEA.
1228  NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1229  MachineInstr *Copy =
1230  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1231  .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1232  .add(Src);
1233 
1234  // Which is obviously going to be dead after we're done with it.
1235  isKill = true;
1236 
1237  if (LV)
1238  LV->replaceKillInstruction(SrcReg, MI, *Copy);
1239  }
1240 
1241  // We've set all the parameters without issue.
1242  return true;
1243 }
1244 
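/// Helper for convertToThreeAddress: widen an 8/16-bit two-address op by
/// performing it as a 32-bit LEA on temporary registers and copying the low
/// bits back out. Only implemented for 64-bit mode (see the TODO below for
/// what a 32-bit target would need).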
1245 MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
1246  unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
1247  LiveVariables *LV, bool Is8BitOp) const {
1248  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1249  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
1250  assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1251  *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1252  "Unexpected type for LEA transform");
1253 
1254  // TODO: For a 32-bit target, we need to adjust the LEA variables with
1255  // something like this:
1256  // Opcode = X86::LEA32r;
1257  // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1258  // OutRegLEA =
1259  // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1260  // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1261  if (!Subtarget.is64Bit())
1262  return nullptr;
1263 
1264  unsigned Opcode = X86::LEA64_32r;
1265  Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1266  Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1267 
1268  // Build and insert into an implicit UNDEF value. This is OK because
1269  // we will be shifting and then extracting the lower 8/16-bits.
1270  // This has the potential to cause partial register stall. e.g.
1271  // movw (%rbp,%rcx,2), %dx
1272  // leal -65(%rdx), %esi
1273  // But testing has shown this *does* help performance in 64-bit mode (at
1274  // least on modern x86 machines).
1275  MachineBasicBlock::iterator MBBI = MI.getIterator();
1276  Register Dest = MI.getOperand(0).getReg();
1277  Register Src = MI.getOperand(1).getReg();
1278  bool IsDead = MI.getOperand(0).isDead();
1279  bool IsKill = MI.getOperand(1).isKill();
1280  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1281  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1282  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1283  MachineInstr *InsMI =
1284  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1285  .addReg(InRegLEA, RegState::Define, SubReg)
1286  .addReg(Src, getKillRegState(IsKill));
1287 
1288  MachineInstrBuilder MIB =
1289  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1290  switch (MIOpc) {
1291  default: llvm_unreachable("Unreachable!");
1292  case X86::SHL8ri:
1293  case X86::SHL16ri: {
1294  unsigned ShAmt = MI.getOperand(2).getImm();
1295  MIB.addReg(0).addImm(1ULL << ShAmt)
1296  .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
1297  break;
1298  }
1299  case X86::INC8r:
1300  case X86::INC16r:
1301  addRegOffset(MIB, InRegLEA, true, 1);
1302  break;
1303  case X86::DEC8r:
1304  case X86::DEC16r:
1305  addRegOffset(MIB, InRegLEA, true, -1);
1306  break;
1307  case X86::ADD8ri:
1308  case X86::ADD8ri_DB:
1309  case X86::ADD16ri:
1310  case X86::ADD16ri8:
1311  case X86::ADD16ri_DB:
1312  case X86::ADD16ri8_DB:
1313  addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1314  break;
1315  case X86::ADD8rr:
1316  case X86::ADD8rr_DB:
1317  case X86::ADD16rr:
1318  case X86::ADD16rr_DB: {
1319  Register Src2 = MI.getOperand(2).getReg();
1320  bool IsKill2 = MI.getOperand(2).isKill();
1321  assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1322  unsigned InRegLEA2 = 0;
1323  MachineInstr *InsMI2 = nullptr;
1324  if (Src == Src2) {
1325  // ADD8rr/ADD16rr killed %reg1028, %reg1028
1326  // just a single insert_subreg.
1327  addRegReg(MIB, InRegLEA, true, InRegLEA, false);
1328  } else {
1329  if (Subtarget.is64Bit())
1330  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1331  else
1332  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1333  // Build and insert into an implicit UNDEF value. This is OK because
1334  // we will be shifting and then extracting the lower 8/16-bits.
1335  BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2);
1336  InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1337  .addReg(InRegLEA2, RegState::Define, SubReg)
1338  .addReg(Src2, getKillRegState(IsKill2));
1339  addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
1340  }
1341  if (LV && IsKill2 && InsMI2)
1342  LV->replaceKillInstruction(Src2, MI, *InsMI2);
1343  break;
1344  }
1345  }
1346 
1347  MachineInstr *NewMI = MIB;
1348  MachineInstr *ExtMI =
1349  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1350  .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
1351  .addReg(OutRegLEA, RegState::Kill, SubReg);
1352 
1353  if (LV) {
1354  // Update live variables.
1355  LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1356  LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1357  if (IsKill)
1358  LV->replaceKillInstruction(Src, MI, *InsMI);
1359  if (IsDead)
1360  LV->replaceKillInstruction(Dest, MI, *ExtMI);
1361  }
1362 
1363  return ExtMI;
1364 }
1365 
1366 /// This method must be implemented by targets that
1367 /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1368 /// may be able to convert a two-address instruction into a true
1369 /// three-address instruction on demand. This allows the X86 target (for
1370 /// example) to convert ADD and SHL instructions into LEA instructions if they
1371 /// would require register copies due to two-addressness.
1372 ///
1373 /// This method returns a null pointer if the transformation cannot be
1374 /// performed, otherwise it returns the new instruction.
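/// For example, a two-address "addl %esi, %edi" that would otherwise force a
/// register copy can become the three-address "leal (%rdi,%rsi), %eax".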
1375 ///
1376 MachineInstr *
1377 X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
1378  MachineInstr &MI, LiveVariables *LV) const {
1379  // The following opcodes also set the condition code register(s). Only
1380  // convert them to an equivalent LEA if the condition code register defs
1381  // are dead!
1382  if (hasLiveCondCodeDef(MI))
1383  return nullptr;
1384 
1385  MachineFunction &MF = *MI.getParent()->getParent();
1386  // All instructions input are two-addr instructions. Get the known operands.
1387  const MachineOperand &Dest = MI.getOperand(0);
1388  const MachineOperand &Src = MI.getOperand(1);
1389 
1390  // Ideally, operations with undef should be folded before we get here, but we
1391  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1392  // Without this, we have to forward undef state to new register operands to
1393  // avoid machine verifier errors.
1394  if (Src.isUndef())
1395  return nullptr;
1396  if (MI.getNumOperands() > 2)
1397  if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1398  return nullptr;
1399 
1400  MachineInstr *NewMI = nullptr;
1401  bool Is64Bit = Subtarget.is64Bit();
1402 
1403  bool Is8BitOp = false;
1404  unsigned MIOpc = MI.getOpcode();
1405  switch (MIOpc) {
1406  default: llvm_unreachable("Unreachable!");
1407  case X86::SHL64ri: {
1408  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1409  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1410  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1411 
1412  // LEA can't handle RSP.
1413  if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1414  Src.getReg(), &X86::GR64_NOSPRegClass))
1415  return nullptr;
1416 
1417  NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1418  .add(Dest)
1419  .addReg(0)
1420  .addImm(1ULL << ShAmt)
1421  .add(Src)
1422  .addImm(0)
1423  .addReg(0);
1424  break;
1425  }
1426  case X86::SHL32ri: {
1427  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1428  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1429  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1430 
1431  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1432 
1433  // LEA can't handle ESP.
1434  bool isKill;
1435  Register SrcReg;
1436  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1437  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
1438  SrcReg, isKill, ImplicitOp, LV))
1439  return nullptr;
1440 
1441  MachineInstrBuilder MIB =
1442  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1443  .add(Dest)
1444  .addReg(0)
1445  .addImm(1ULL << ShAmt)
1446  .addReg(SrcReg, getKillRegState(isKill))
1447  .addImm(0)
1448  .addReg(0);
1449  if (ImplicitOp.getReg() != 0)
1450  MIB.add(ImplicitOp);
1451  NewMI = MIB;
1452 
1453  break;
1454  }
1455  case X86::SHL8ri:
1456  Is8BitOp = true;
1457  LLVM_FALLTHROUGH;
1458  case X86::SHL16ri: {
1459  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1460  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1461  if (!isTruncatedShiftCountForLEA(ShAmt))
1462  return nullptr;
1463  return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1464  }
1465  case X86::INC64r:
1466  case X86::INC32r: {
1467  assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1468  unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
1469  (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1470  bool isKill;
1471  Register SrcReg;
1472  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1473  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
1474  ImplicitOp, LV))
1475  return nullptr;
1476 
1477  MachineInstrBuilder MIB =
1478  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1479  .add(Dest)
1480  .addReg(SrcReg, getKillRegState(isKill));
1481  if (ImplicitOp.getReg() != 0)
1482  MIB.add(ImplicitOp);
1483 
1484  NewMI = addOffset(MIB, 1);
1485  break;
1486  }
1487  case X86::DEC64r:
1488  case X86::DEC32r: {
1489  assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1490  unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
1491  : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1492 
1493  bool isKill;
1494  Register SrcReg;
1495  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1496  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
1497  ImplicitOp, LV))
1498  return nullptr;
1499 
1500  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1501  .add(Dest)
1502  .addReg(SrcReg, getKillRegState(isKill));
1503  if (ImplicitOp.getReg() != 0)
1504  MIB.add(ImplicitOp);
1505 
1506  NewMI = addOffset(MIB, -1);
1507 
1508  break;
1509  }
1510  case X86::DEC8r:
1511  case X86::INC8r:
1512  Is8BitOp = true;
1513  LLVM_FALLTHROUGH;
1514  case X86::DEC16r:
1515  case X86::INC16r:
1516  return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1517  case X86::ADD64rr:
1518  case X86::ADD64rr_DB:
1519  case X86::ADD32rr:
1520  case X86::ADD32rr_DB: {
1521  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1522  unsigned Opc;
1523  if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1524  Opc = X86::LEA64r;
1525  else
1526  Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1527 
1528  bool isKill;
1529  Register SrcReg;
1530  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1531  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1532  SrcReg, isKill, ImplicitOp, LV))
1533  return nullptr;
1534 
1535  const MachineOperand &Src2 = MI.getOperand(2);
1536  bool isKill2;
1537  Register SrcReg2;
1538  MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1539  if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
1540  SrcReg2, isKill2, ImplicitOp2, LV))
1541  return nullptr;
1542 
1543  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1544  if (ImplicitOp.getReg() != 0)
1545  MIB.add(ImplicitOp);
1546  if (ImplicitOp2.getReg() != 0)
1547  MIB.add(ImplicitOp2);
1548 
1549  NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1550  if (LV && Src2.isKill())
1551  LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
1552  break;
1553  }
1554  case X86::ADD8rr:
1555  case X86::ADD8rr_DB:
1556  Is8BitOp = true;
1557  LLVM_FALLTHROUGH;
1558  case X86::ADD16rr:
1559  case X86::ADD16rr_DB:
1560  return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1561  case X86::ADD64ri32:
1562  case X86::ADD64ri8:
1563  case X86::ADD64ri32_DB:
1564  case X86::ADD64ri8_DB:
1565  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1566  NewMI = addOffset(
1567  BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1568  MI.getOperand(2));
1569  break;
1570  case X86::ADD32ri:
1571  case X86::ADD32ri8:
1572  case X86::ADD32ri_DB:
1573  case X86::ADD32ri8_DB: {
1574  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1575  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1576 
1577  bool isKill;
1578  Register SrcReg;
1579  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1580  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1581  SrcReg, isKill, ImplicitOp, LV))
1582  return nullptr;
1583 
1584  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1585  .add(Dest)
1586  .addReg(SrcReg, getKillRegState(isKill));
1587  if (ImplicitOp.getReg() != 0)
1588  MIB.add(ImplicitOp);
1589 
1590  NewMI = addOffset(MIB, MI.getOperand(2));
1591  break;
1592  }
1593  case X86::ADD8ri:
1594  case X86::ADD8ri_DB:
1595  Is8BitOp = true;
1596  LLVM_FALLTHROUGH;
1597  case X86::ADD16ri:
1598  case X86::ADD16ri8:
1599  case X86::ADD16ri_DB:
1600  case X86::ADD16ri8_DB:
1601  return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1602  case X86::SUB8ri:
1603  case X86::SUB16ri8:
1604  case X86::SUB16ri:
1605  /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1606  return nullptr;
1607  case X86::SUB32ri8:
1608  case X86::SUB32ri: {
1609  if (!MI.getOperand(2).isImm())
1610  return nullptr;
1611  int64_t Imm = MI.getOperand(2).getImm();
1612  if (!isInt<32>(-Imm))
1613  return nullptr;
1614 
1615  assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1616  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1617 
1618  bool isKill;
1619  Register SrcReg;
1620  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1621  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1622  SrcReg, isKill, ImplicitOp, LV))
1623  return nullptr;
1624 
1625  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1626  .add(Dest)
1627  .addReg(SrcReg, getKillRegState(isKill));
1628  if (ImplicitOp.getReg() != 0)
1629  MIB.add(ImplicitOp);
1630 
1631  NewMI = addOffset(MIB, -Imm);
1632  break;
1633  }
1634 
1635  case X86::SUB64ri8:
1636  case X86::SUB64ri32: {
1637  if (!MI.getOperand(2).isImm())
1638  return nullptr;
1639  int64_t Imm = MI.getOperand(2).getImm();
1640  if (!isInt<32>(-Imm))
1641  return nullptr;
1642 
1643  assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1644 
1645  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
1646  get(X86::LEA64r)).add(Dest).add(Src);
1647  NewMI = addOffset(MIB, -Imm);
1648  break;
1649  }
1650 
1651  case X86::VMOVDQU8Z128rmk:
1652  case X86::VMOVDQU8Z256rmk:
1653  case X86::VMOVDQU8Zrmk:
1654  case X86::VMOVDQU16Z128rmk:
1655  case X86::VMOVDQU16Z256rmk:
1656  case X86::VMOVDQU16Zrmk:
1657  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
1658  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
1659  case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
1660  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
1661  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
1662  case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
1663  case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
1664  case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
1665  case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
1666  case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
1667  case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
1668  case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
1669  case X86::VBROADCASTSDZ256rmk:
1670  case X86::VBROADCASTSDZrmk:
1671  case X86::VBROADCASTSSZ128rmk:
1672  case X86::VBROADCASTSSZ256rmk:
1673  case X86::VBROADCASTSSZrmk:
1674  case X86::VPBROADCASTDZ128rmk:
1675  case X86::VPBROADCASTDZ256rmk:
1676  case X86::VPBROADCASTDZrmk:
1677  case X86::VPBROADCASTQZ128rmk:
1678  case X86::VPBROADCASTQZ256rmk:
1679  case X86::VPBROADCASTQZrmk: {
1680  unsigned Opc;
1681  switch (MIOpc) {
1682  default: llvm_unreachable("Unreachable!");
1683  case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
1684  case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
1685  case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
1686  case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
1687  case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
1688  case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
1689  case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1690  case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1691  case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1692  case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1693  case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1694  case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1695  case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1696  case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1697  case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1698  case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1699  case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1700  case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1701  case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1702  case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1703  case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1704  case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1705  case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1706  case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1707  case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1708  case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1709  case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1710  case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1711  case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1712  case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1713  case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
1714  case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break;
1715  case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
1716  case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
1717  case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break;
1718  case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
1719  case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
1720  case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break;
1721  case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
1722  case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
1723  case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break;
1724  }
1725 
1726  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1727  .add(Dest)
1728  .add(MI.getOperand(2))
1729  .add(Src)
1730  .add(MI.getOperand(3))
1731  .add(MI.getOperand(4))
1732  .add(MI.getOperand(5))
1733  .add(MI.getOperand(6))
1734  .add(MI.getOperand(7));
1735  break;
1736  }
1737 
1738  case X86::VMOVDQU8Z128rrk:
1739  case X86::VMOVDQU8Z256rrk:
1740  case X86::VMOVDQU8Zrrk:
1741  case X86::VMOVDQU16Z128rrk:
1742  case X86::VMOVDQU16Z256rrk:
1743  case X86::VMOVDQU16Zrrk:
1744  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
1745  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
1746  case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
1747  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
1748  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
1749  case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
1750  case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
1751  case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
1752  case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
1753  case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
1754  case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
1755  case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
1756  unsigned Opc;
1757  switch (MIOpc) {
1758  default: llvm_unreachable("Unreachable!");
1759  case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
1760  case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
1761  case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
1762  case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
1763  case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
1764  case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
1765  case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1766  case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1767  case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1768  case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1769  case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1770  case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1771  case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1772  case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1773  case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1774  case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1775  case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1776  case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1777  case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1778  case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1779  case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1780  case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1781  case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1782  case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1783  case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1784  case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1785  case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1786  case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1787  case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1788  case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1789  }
1790 
1791  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1792  .add(Dest)
1793  .add(MI.getOperand(2))
1794  .add(Src)
1795  .add(MI.getOperand(3));
1796  break;
1797  }
1798  }
1799 
1800  if (!NewMI) return nullptr;
1801 
1802  if (LV) { // Update live variables
1803  if (Src.isKill())
1804  LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
1805  if (Dest.isDead())
1806  LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
1807  }
1808 
1809  MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
1810  return NewMI;
1811 }
1812 
1813 /// This determines which of three possible cases of a three source commute
1814 /// the source indexes correspond to, taking into account any mask operands.
1815 /// Commuting with the passthru operand is never allowed; the function asserts
1816 /// if the requested commute is not one of the three cases below.
1817 /// Case 0 - Possible to commute the first and second operands.
1818 /// Case 1 - Possible to commute the first and third operands.
1819 /// Case 2 - Possible to commute the second and third operands.
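///
/// Illustrative example, assuming the usual AVX-512 layout where a masked
/// instruction carries its k-mask at operand index 2: for an unmasked
/// three-source instruction the sources sit at operand indices 1, 2 and 3,
/// so swapping indices 2 and 3 is Case 2; for a masked form the last two
/// sources shift to indices 3 and 4, and swapping those is still Case 2.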
1820 static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
1821  unsigned SrcOpIdx2) {
1822  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
1823  if (SrcOpIdx1 > SrcOpIdx2)
1824  std::swap(SrcOpIdx1, SrcOpIdx2);
1825 
1826  unsigned Op1 = 1, Op2 = 2, Op3 = 3;
1827  if (X86II::isKMasked(TSFlags)) {
1828  Op2++;
1829  Op3++;
1830  }
1831 
1832  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
1833  return 0;
1834  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
1835  return 1;
1836  if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
1837  return 2;
1838  llvm_unreachable("Unknown three src commute case.");
1839 }
1840 
unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
1842  const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
1843  const X86InstrFMA3Group &FMA3Group) const {
1844 
1845  unsigned Opc = MI.getOpcode();
1846 
1847  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
1848  // analysis. The commute optimization is legal only if all users of FMA*_Int
1849  // use only the lowest element of the FMA*_Int instruction. Such analysis is
1850  // not implemented yet, so just return 0 in that case.
1851  // When such analysis becomes available, this will be the right place to
1852  // call it.
1853  assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
1854  "Intrinsic instructions can't commute operand 1");
1855 
1856  // Determine which case this commute is or if it can't be done.
1857  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1858  SrcOpIdx2);
1859  assert(Case < 3 && "Unexpected case number!");
1860 
1861  // Define the FMA forms mapping array that helps to map input FMA form
1862  // to output FMA form to preserve the operation semantics after
1863  // commuting the operands.
1864  const unsigned Form132Index = 0;
1865  const unsigned Form213Index = 1;
1866  const unsigned Form231Index = 2;
1867  static const unsigned FormMapping[][3] = {
1868  // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
1869  // FMA132 A, C, b; ==> FMA231 C, A, b;
1870  // FMA213 B, A, c; ==> FMA213 A, B, c;
1871  // FMA231 C, A, b; ==> FMA132 A, C, b;
1872  { Form231Index, Form213Index, Form132Index },
1873  // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
1874  // FMA132 A, c, B; ==> FMA132 B, c, A;
1875  // FMA213 B, a, C; ==> FMA231 C, a, B;
1876  // FMA231 C, a, B; ==> FMA213 B, a, C;
1877  { Form132Index, Form231Index, Form213Index },
1878  // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
1879  // FMA132 a, C, B; ==> FMA213 a, B, C;
1880  // FMA213 b, A, C; ==> FMA132 b, C, A;
1881  // FMA231 c, A, B; ==> FMA231 c, B, A;
1882  { Form213Index, Form132Index, Form231Index }
1883  };
1884 
1885  unsigned FMAForms[3];
1886  FMAForms[0] = FMA3Group.get132Opcode();
1887  FMAForms[1] = FMA3Group.get213Opcode();
1888  FMAForms[2] = FMA3Group.get231Opcode();
1889  unsigned FormIndex;
1890  for (FormIndex = 0; FormIndex < 3; FormIndex++)
1891  if (Opc == FMAForms[FormIndex])
1892  break;
1893 
1894  // Everything is ready, just adjust the FMA opcode and return it.
1895  FormIndex = FormMapping[Case][FormIndex];
1896  return FMAForms[FormIndex];
1897 }
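
// Illustrative check of the mapping above: commuting operands 2 and 3 (Case 2)
// of a 213-form FMA selects the 132 form, since FMA213(a, b, c) = b*a + c and
// FMA132(a, c, b) = a*b + c compute the same value once the last two register
// operands have been swapped.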
1898 
1899 static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
1900  unsigned SrcOpIdx2) {
1901  // Determine which case this commute is or if it can't be done.
1902  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1903  SrcOpIdx2);
1904  assert(Case < 3 && "Unexpected case value!");
1905 
1906  // For each case we need to swap two pairs of bits in the final immediate.
1907  static const uint8_t SwapMasks[3][4] = {
1908  { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
1909  { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
1910  { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
1911  };
1912 
1913  uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
1914  // Clear out the bits we are swapping.
1915  uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
1916  SwapMasks[Case][2] | SwapMasks[Case][3]);
1917  // If the immediate had a bit of the pair set, then set the opposite bit.
1918  if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
1919  if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
1920  if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
1921  if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
1922  MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
1923 }
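
// Worked example for the swap masks above: with Case 0 (operands 1 and 2
// swapped) an immediate of 0xCA (dst = A ? B : C) becomes 0xE2 (dst = B ? A : C);
// bits 2 and 4 are both clear, and the set bit 3 moves to bit 5.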
1924 
1925 // Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
1926 // commuted.
1927 static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
1928 #define VPERM_CASES(Suffix) \
1929  case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
1930  case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
1931  case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
1932  case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
1933  case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
1934  case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
1935  case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
1936  case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
1937  case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
1938  case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
1939  case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
1940  case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
1941 
1942 #define VPERM_CASES_BROADCAST(Suffix) \
1943  VPERM_CASES(Suffix) \
1944  case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
1945  case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
1946  case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
1947  case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
1948  case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
1949  case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
1950 
1951  switch (Opcode) {
1952  default: return false;
1953  VPERM_CASES(B)
  VPERM_CASES_BROADCAST(D)
  VPERM_CASES_BROADCAST(PD)
  VPERM_CASES_BROADCAST(PS)
  VPERM_CASES_BROADCAST(Q)
1958  VPERM_CASES(W)
1959  return true;
1960  }
1961 #undef VPERM_CASES_BROADCAST
1962 #undef VPERM_CASES
1963 }
1964 
1965 // Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
1966 // from the I opcode to the T opcode and vice versa.
1967 static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
1968 #define VPERM_CASES(Orig, New) \
1969  case X86::Orig##128rr: return X86::New##128rr; \
1970  case X86::Orig##128rrkz: return X86::New##128rrkz; \
1971  case X86::Orig##128rm: return X86::New##128rm; \
1972  case X86::Orig##128rmkz: return X86::New##128rmkz; \
1973  case X86::Orig##256rr: return X86::New##256rr; \
1974  case X86::Orig##256rrkz: return X86::New##256rrkz; \
1975  case X86::Orig##256rm: return X86::New##256rm; \
1976  case X86::Orig##256rmkz: return X86::New##256rmkz; \
1977  case X86::Orig##rr: return X86::New##rr; \
1978  case X86::Orig##rrkz: return X86::New##rrkz; \
1979  case X86::Orig##rm: return X86::New##rm; \
1980  case X86::Orig##rmkz: return X86::New##rmkz;
1981 
1982 #define VPERM_CASES_BROADCAST(Orig, New) \
1983  VPERM_CASES(Orig, New) \
1984  case X86::Orig##128rmb: return X86::New##128rmb; \
1985  case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
1986  case X86::Orig##256rmb: return X86::New##256rmb; \
1987  case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
1988  case X86::Orig##rmb: return X86::New##rmb; \
1989  case X86::Orig##rmbkz: return X86::New##rmbkz;
1990 
1991  switch (Opcode) {
1992  VPERM_CASES(VPERMI2B, VPERMT2B)
1993  VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
1994  VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
1995  VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
1996  VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
1997  VPERM_CASES(VPERMI2W, VPERMT2W)
1998  VPERM_CASES(VPERMT2B, VPERMI2B)
1999  VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2000  VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2001  VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2002  VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2003  VPERM_CASES(VPERMT2W, VPERMI2W)
2004  }
2005 
2006  llvm_unreachable("Unreachable!");
2007 #undef VPERM_CASES_BROADCAST
2008 #undef VPERM_CASES
2009 }
2010 
MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2012  unsigned OpIdx1,
2013  unsigned OpIdx2) const {
2014  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
2015  if (NewMI)
2016  return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
2017  return MI;
2018  };
2019 
2020  switch (MI.getOpcode()) {
2021  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
2022  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
2023  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
2024  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
2025  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
2026  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
2027  unsigned Opc;
2028  unsigned Size;
2029  switch (MI.getOpcode()) {
2030  default: llvm_unreachable("Unreachable!");
2031  case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
2032  case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
2033  case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
2034  case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
2035  case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
2036  case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
2037  }
2038  unsigned Amt = MI.getOperand(3).getImm();
2039  auto &WorkingMI = cloneIfNew(MI);
2040  WorkingMI.setDesc(get(Opc));
2041  WorkingMI.getOperand(3).setImm(Size - Amt);
2042  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2043  OpIdx1, OpIdx2);
2044  }
2045  case X86::PFSUBrr:
2046  case X86::PFSUBRrr: {
2047  // PFSUB x, y: x = x - y
2048  // PFSUBR x, y: x = y - x
2049  unsigned Opc =
2050  (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
2051  auto &WorkingMI = cloneIfNew(MI);
2052  WorkingMI.setDesc(get(Opc));
2053  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2054  OpIdx1, OpIdx2);
2055  }
2056  case X86::BLENDPDrri:
2057  case X86::BLENDPSrri:
2058  case X86::VBLENDPDrri:
2059  case X86::VBLENDPSrri:
2060  // If we're optimizing for size, try to use MOVSD/MOVSS.
2061  if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
2062  unsigned Mask, Opc;
2063  switch (MI.getOpcode()) {
2064  default: llvm_unreachable("Unreachable!");
2065  case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
2066  case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
2067  case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
2068  case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
2069  }
2070  if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
2071  auto &WorkingMI = cloneIfNew(MI);
2072  WorkingMI.setDesc(get(Opc));
2073  WorkingMI.RemoveOperand(3);
2074  return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
2075  /*NewMI=*/false,
2076  OpIdx1, OpIdx2);
2077  }
2078  }
  LLVM_FALLTHROUGH;
2080  case X86::PBLENDWrri:
2081  case X86::VBLENDPDYrri:
2082  case X86::VBLENDPSYrri:
2083  case X86::VPBLENDDrri:
2084  case X86::VPBLENDWrri:
2085  case X86::VPBLENDDYrri:
2086  case X86::VPBLENDWYrri:{
2087  int8_t Mask;
2088  switch (MI.getOpcode()) {
2089  default: llvm_unreachable("Unreachable!");
2090  case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
2091  case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
2092  case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
2093  case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
2094  case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
2095  case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
2096  case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
2097  case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
2098  case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
2099  case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
2100  case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
2101  }
2102  // Only the least significant bits of Imm are used.
2103  // Using int8_t to ensure it will be sign extended to the int64_t that
2104  // setImm takes in order to match isel behavior.
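  // For example, commuting BLENDPSrri with Imm = 0x5 (lanes 0 and 2 taken
  // from the second source) produces 0x0F ^ 0x5 = 0xA, so each lane still
  // comes from the same physical register after the sources are swapped.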
2105  int8_t Imm = MI.getOperand(3).getImm() & Mask;
2106  auto &WorkingMI = cloneIfNew(MI);
2107  WorkingMI.getOperand(3).setImm(Mask ^ Imm);
2108  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2109  OpIdx1, OpIdx2);
2110  }
2111  case X86::INSERTPSrr:
2112  case X86::VINSERTPSrr:
2113  case X86::VINSERTPSZrr: {
2114  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2115  unsigned ZMask = Imm & 15;
2116  unsigned DstIdx = (Imm >> 4) & 3;
2117  unsigned SrcIdx = (Imm >> 6) & 3;
2118 
2119  // We can commute insertps if we zero 2 of the elements, the insertion is
2120  // "inline" and we don't override the insertion with a zero.
2121  if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2122  countPopulation(ZMask) == 2) {
2123  unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
2124  assert(AltIdx < 4 && "Illegal insertion index");
2125  unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2126  auto &WorkingMI = cloneIfNew(MI);
2127  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2128  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2129  OpIdx1, OpIdx2);
2130  }
2131  return nullptr;
2132  }
2133  case X86::MOVSDrr:
2134  case X86::MOVSSrr:
2135  case X86::VMOVSDrr:
2136  case X86::VMOVSSrr:{
2137  // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
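    // MOVSD takes lane 0 from the second source and lane 1 from the first,
    // i.e. a BLENDPD with immediate 0x01; after the sources are swapped the
    // equivalent blend selects lane 1 from the new second source, hence
    // Mask = 0x02 below (and 0x0E for the four-lane MOVSS/BLENDPS case).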
2138  if (Subtarget.hasSSE41()) {
2139  unsigned Mask, Opc;
2140  switch (MI.getOpcode()) {
2141  default: llvm_unreachable("Unreachable!");
2142  case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
2143  case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
2144  case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
2145  case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
2146  }
2147 
2148  auto &WorkingMI = cloneIfNew(MI);
2149  WorkingMI.setDesc(get(Opc));
2150  WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
2151  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2152  OpIdx1, OpIdx2);
2153  }
2154 
2155  // Convert to SHUFPD.
2156  assert(MI.getOpcode() == X86::MOVSDrr &&
2157  "Can only commute MOVSDrr without SSE4.1");
2158 
2159  auto &WorkingMI = cloneIfNew(MI);
2160  WorkingMI.setDesc(get(X86::SHUFPDrri));
2161  WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
2162  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2163  OpIdx1, OpIdx2);
2164  }
2165  case X86::SHUFPDrri: {
2166  // Commute to MOVSD.
2167  assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2168  auto &WorkingMI = cloneIfNew(MI);
2169  WorkingMI.setDesc(get(X86::MOVSDrr));
2170  WorkingMI.RemoveOperand(3);
2171  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2172  OpIdx1, OpIdx2);
2173  }
2174  case X86::PCLMULQDQrr:
2175  case X86::VPCLMULQDQrr:
2176  case X86::VPCLMULQDQYrr:
2177  case X86::VPCLMULQDQZrr:
2178  case X86::VPCLMULQDQZ128rr:
2179  case X86::VPCLMULQDQZ256rr: {
2180  // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2181  // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
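  // For example, Imm = 0x01 (SRC1 high half, SRC2 low half) becomes 0x10 once
  // the sources are swapped, so the same two halves are still multiplied.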
2182  unsigned Imm = MI.getOperand(3).getImm();
2183  unsigned Src1Hi = Imm & 0x01;
2184  unsigned Src2Hi = Imm & 0x10;
2185  auto &WorkingMI = cloneIfNew(MI);
2186  WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2187  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2188  OpIdx1, OpIdx2);
2189  }
2190  case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
2191  case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
2192  case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
2193  case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
2194  case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
2195  case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
2196  case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
2197  case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
2198  case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
2199  case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
2200  case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
2201  case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
2202  case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
2203  case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
2204  case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
2205  case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
2206  case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
2207  case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
2208  case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
2209  case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
2210  case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
2211  case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
2212  case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
2213  case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
2214  // Flip comparison mode immediate (if necessary).
2215  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
2216  Imm = X86::getSwappedVPCMPImm(Imm);
2217  auto &WorkingMI = cloneIfNew(MI);
2218  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
2219  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2220  OpIdx1, OpIdx2);
2221  }
2222  case X86::VPCOMBri: case X86::VPCOMUBri:
2223  case X86::VPCOMDri: case X86::VPCOMUDri:
2224  case X86::VPCOMQri: case X86::VPCOMUQri:
2225  case X86::VPCOMWri: case X86::VPCOMUWri: {
2226  // Flip comparison mode immediate (if necessary).
2227  unsigned Imm = MI.getOperand(3).getImm() & 0x7;
2228  Imm = X86::getSwappedVPCOMImm(Imm);
2229  auto &WorkingMI = cloneIfNew(MI);
2230  WorkingMI.getOperand(3).setImm(Imm);
2231  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2232  OpIdx1, OpIdx2);
2233  }
2234  case X86::VCMPSDZrr:
2235  case X86::VCMPSSZrr:
2236  case X86::VCMPPDZrri:
2237  case X86::VCMPPSZrri:
2238  case X86::VCMPPDZ128rri:
2239  case X86::VCMPPSZ128rri:
2240  case X86::VCMPPDZ256rri:
2241  case X86::VCMPPSZ256rri:
2242  case X86::VCMPPDZrrik:
2243  case X86::VCMPPSZrrik:
2244  case X86::VCMPPDZ128rrik:
2245  case X86::VCMPPSZ128rrik:
2246  case X86::VCMPPDZ256rrik:
2247  case X86::VCMPPSZ256rrik: {
2248  unsigned Imm =
2249  MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
2250  Imm = X86::getSwappedVCMPImm(Imm);
2251  auto &WorkingMI = cloneIfNew(MI);
2252  WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
2253  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2254  OpIdx1, OpIdx2);
2255  }
2256  case X86::VPERM2F128rr:
2257  case X86::VPERM2I128rr: {
2258  // Flip permute source immediate.
2259  // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2260  // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
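  // XORing with 0x22 flips both source-select bits; e.g. Imm = 0x20
  // (lo from Op0.lo, hi from Op1.lo) becomes 0x02, so each half keeps reading
  // the same physical register after the operands are exchanged.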
2261  int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
2262  auto &WorkingMI = cloneIfNew(MI);
2263  WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
2264  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2265  OpIdx1, OpIdx2);
2266  }
2267  case X86::MOVHLPSrr:
2268  case X86::UNPCKHPDrr:
2269  case X86::VMOVHLPSrr:
2270  case X86::VUNPCKHPDrr:
2271  case X86::VMOVHLPSZrr:
2272  case X86::VUNPCKHPDZ128rr: {
2273  assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2274 
2275  unsigned Opc = MI.getOpcode();
2276  switch (Opc) {
2277  default: llvm_unreachable("Unreachable!");
2278  case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
2279  case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
2280  case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
2281  case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
2282  case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
2283  case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
2284  }
2285  auto &WorkingMI = cloneIfNew(MI);
2286  WorkingMI.setDesc(get(Opc));
2287  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2288  OpIdx1, OpIdx2);
2289  }
2290  case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
2291  auto &WorkingMI = cloneIfNew(MI);
2292  unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2293  X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2294  WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2295  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2296  OpIdx1, OpIdx2);
2297  }
2298  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2299  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2300  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2301  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2302  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2303  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2304  case X86::VPTERNLOGDZrrik:
2305  case X86::VPTERNLOGDZ128rrik:
2306  case X86::VPTERNLOGDZ256rrik:
2307  case X86::VPTERNLOGQZrrik:
2308  case X86::VPTERNLOGQZ128rrik:
2309  case X86::VPTERNLOGQZ256rrik:
2310  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2311  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2312  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2313  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2314  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2315  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2316  case X86::VPTERNLOGDZ128rmbi:
2317  case X86::VPTERNLOGDZ256rmbi:
2318  case X86::VPTERNLOGDZrmbi:
2319  case X86::VPTERNLOGQZ128rmbi:
2320  case X86::VPTERNLOGQZ256rmbi:
2321  case X86::VPTERNLOGQZrmbi:
2322  case X86::VPTERNLOGDZ128rmbikz:
2323  case X86::VPTERNLOGDZ256rmbikz:
2324  case X86::VPTERNLOGDZrmbikz:
2325  case X86::VPTERNLOGQZ128rmbikz:
2326  case X86::VPTERNLOGQZ256rmbikz:
2327  case X86::VPTERNLOGQZrmbikz: {
2328  auto &WorkingMI = cloneIfNew(MI);
2329  commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
2330  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2331  OpIdx1, OpIdx2);
2332  }
2333  default: {
2334  if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
2335  unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
2336  auto &WorkingMI = cloneIfNew(MI);
2337  WorkingMI.setDesc(get(Opc));
2338  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2339  OpIdx1, OpIdx2);
2340  }
2341 
2342  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2343  MI.getDesc().TSFlags);
2344  if (FMA3Group) {
2345  unsigned Opc =
2346  getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
2347  auto &WorkingMI = cloneIfNew(MI);
2348  WorkingMI.setDesc(get(Opc));
2349  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2350  OpIdx1, OpIdx2);
2351  }
2352 
2353  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2354  }
2355  }
2356 }
2357 
2358 bool
2359 X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2360  unsigned &SrcOpIdx1,
2361  unsigned &SrcOpIdx2,
2362  bool IsIntrinsic) const {
2363  uint64_t TSFlags = MI.getDesc().TSFlags;
2364 
2365  unsigned FirstCommutableVecOp = 1;
2366  unsigned LastCommutableVecOp = 3;
2367  unsigned KMaskOp = -1U;
2368  if (X86II::isKMasked(TSFlags)) {
2369  // For k-zero-masked operations it is OK to commute the first vector
2370  // operand, unless this is an intrinsic instruction.
2371  // For regular k-masked operations a conservative choice is made, because
2372  // the elements of the first vector operand for which the corresponding bit
2373  // in the k-mask operand is set to 0 are copied unchanged to the result of
2374  // the instruction.
2375  // TODO/FIXME: The commute still may be legal if it is known that the
2376  // k-mask operand is set to either all ones or all zeroes.
2377  // It is also Ok to commute the 1st operand if all users of MI use only
2378  // the elements enabled by the k-mask operand. For example,
2379  // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2380  // : v1[i];
2381  // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2382  // // Ok, to commute v1 in FMADD213PSZrk.
2383 
2384  // The k-mask operand has index = 2 for masked and zero-masked operations.
2385  KMaskOp = 2;
2386 
2387  // The operand with index = 1 is used as a source for those elements for
2388  // which the corresponding bit in the k-mask is set to 0.
2389  if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2390  FirstCommutableVecOp = 3;
2391 
2392  LastCommutableVecOp++;
2393  } else if (IsIntrinsic) {
2394  // Commuting the first operand of an intrinsic instruction isn't possible
2395  // unless we can prove that only the lowest element of the result is used.
2396  FirstCommutableVecOp = 2;
2397  }
2398 
2399  if (isMem(MI, LastCommutableVecOp))
2400  LastCommutableVecOp--;
2401 
2402  // Only the operands from FirstCommutableVecOp to LastCommutableVecOp are commutable.
2403  // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2404  // that the operand is not specified/fixed.
2405  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2406  (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2407  SrcOpIdx1 == KMaskOp))
2408  return false;
2409  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2410  (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2411  SrcOpIdx2 == KMaskOp))
2412  return false;
2413 
2414  // Look for two different register operands assumed to be commutable
2415  // regardless of the FMA opcode. The FMA opcode is adjusted later.
2416  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2417  SrcOpIdx2 == CommuteAnyOperandIndex) {
2418  unsigned CommutableOpIdx2 = SrcOpIdx2;
2419 
2420  // At least one of the operands to be commuted is not specified and
2421  // this method is free to choose appropriate commutable operands.
2422  if (SrcOpIdx1 == SrcOpIdx2)
2423  // Neither operand is fixed. By default set one of the commutable
2424  // operands to the last register operand of the instruction.
2425  CommutableOpIdx2 = LastCommutableVecOp;
2426  else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2427  // Only one of the operands is not fixed.
2428  CommutableOpIdx2 = SrcOpIdx1;
2429 
2430  // CommutableOpIdx2 is well defined now. Let's choose another commutable
2431  // operand and assign its index to CommutableOpIdx1.
2432  Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2433 
2434  unsigned CommutableOpIdx1;
2435  for (CommutableOpIdx1 = LastCommutableVecOp;
2436  CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2437  // Just ignore and skip the k-mask operand.
2438  if (CommutableOpIdx1 == KMaskOp)
2439  continue;
2440 
2441  // The commuted operands must have different registers.
2442  // Otherwise, the commute transformation does not change anything and
2443  // is useless.
2444  if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2445  break;
2446  }
2447 
2448  // No appropriate commutable operands were found.
2449  if (CommutableOpIdx1 < FirstCommutableVecOp)
2450  return false;
2451 
2452  // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
2453  // to return those values.
2454  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2455  CommutableOpIdx1, CommutableOpIdx2))
2456  return false;
2457  }
2458 
2459  return true;
2460 }
2461 
bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2463  unsigned &SrcOpIdx1,
2464  unsigned &SrcOpIdx2) const {
2465  const MCInstrDesc &Desc = MI.getDesc();
2466  if (!Desc.isCommutable())
2467  return false;
2468 
2469  switch (MI.getOpcode()) {
2470  case X86::CMPSDrr:
2471  case X86::CMPSSrr:
2472  case X86::CMPPDrri:
2473  case X86::CMPPSrri:
2474  case X86::VCMPSDrr:
2475  case X86::VCMPSSrr:
2476  case X86::VCMPPDrri:
2477  case X86::VCMPPSrri:
2478  case X86::VCMPPDYrri:
2479  case X86::VCMPPSYrri:
2480  case X86::VCMPSDZrr:
2481  case X86::VCMPSSZrr:
2482  case X86::VCMPPDZrri:
2483  case X86::VCMPPSZrri:
2484  case X86::VCMPPDZ128rri:
2485  case X86::VCMPPSZ128rri:
2486  case X86::VCMPPDZ256rri:
2487  case X86::VCMPPSZ256rri:
2488  case X86::VCMPPDZrrik:
2489  case X86::VCMPPSZrrik:
2490  case X86::VCMPPDZ128rrik:
2491  case X86::VCMPPSZ128rrik:
2492  case X86::VCMPPDZ256rrik:
2493  case X86::VCMPPSZ256rrik: {
2494  unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2495 
2496  // Float comparison can be safely commuted for
2497  // Ordered/Unordered/Equal/NotEqual tests
2498  unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2499  switch (Imm) {
2500  default:
2501  // EVEX versions can be commuted.
2502  if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2503  break;
2504  return false;
2505  case 0x00: // EQUAL
2506  case 0x03: // UNORDERED
2507  case 0x04: // NOT EQUAL
2508  case 0x07: // ORDERED
2509  break;
2510  }
2511 
2512  // The indices of the commutable operands are 1 and 2 (or 2 and 3
2513  // when masked).
2514  // Assign them to the returned operand indices here.
2515  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2516  2 + OpOffset);
2517  }
2518  case X86::MOVSSrr:
2519  // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2520  // form an SSE4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2521  // AVX implies SSE4.1.
2522  if (Subtarget.hasSSE41())
2523  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2524  return false;
2525  case X86::SHUFPDrri:
2526  // We can commute this to MOVSD.
2527  if (MI.getOperand(3).getImm() == 0x02)
2528  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2529  return false;
2530  case X86::MOVHLPSrr:
2531  case X86::UNPCKHPDrr:
2532  case X86::VMOVHLPSrr:
2533  case X86::VUNPCKHPDrr:
2534  case X86::VMOVHLPSZrr:
2535  case X86::VUNPCKHPDZ128rr:
2536  if (Subtarget.hasSSE2())
2537  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2538  return false;
2539  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2540  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2541  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2542  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2543  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2544  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2545  case X86::VPTERNLOGDZrrik:
2546  case X86::VPTERNLOGDZ128rrik:
2547  case X86::VPTERNLOGDZ256rrik:
2548  case X86::VPTERNLOGQZrrik:
2549  case X86::VPTERNLOGQZ128rrik:
2550  case X86::VPTERNLOGQZ256rrik:
2551  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2552  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2553  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2554  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2555  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2556  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2557  case X86::VPTERNLOGDZ128rmbi:
2558  case X86::VPTERNLOGDZ256rmbi:
2559  case X86::VPTERNLOGDZrmbi:
2560  case X86::VPTERNLOGQZ128rmbi:
2561  case X86::VPTERNLOGQZ256rmbi:
2562  case X86::VPTERNLOGQZrmbi:
2563  case X86::VPTERNLOGDZ128rmbikz:
2564  case X86::VPTERNLOGDZ256rmbikz:
2565  case X86::VPTERNLOGDZrmbikz:
2566  case X86::VPTERNLOGQZ128rmbikz:
2567  case X86::VPTERNLOGQZ256rmbikz:
2568  case X86::VPTERNLOGQZrmbikz:
2569  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2570  case X86::VPDPWSSDYrr:
2571  case X86::VPDPWSSDrr:
2572  case X86::VPDPWSSDSYrr:
2573  case X86::VPDPWSSDSrr:
2574  case X86::VPDPWSSDZ128r:
2575  case X86::VPDPWSSDZ128rk:
2576  case X86::VPDPWSSDZ128rkz:
2577  case X86::VPDPWSSDZ256r:
2578  case X86::VPDPWSSDZ256rk:
2579  case X86::VPDPWSSDZ256rkz:
2580  case X86::VPDPWSSDZr:
2581  case X86::VPDPWSSDZrk:
2582  case X86::VPDPWSSDZrkz:
2583  case X86::VPDPWSSDSZ128r:
2584  case X86::VPDPWSSDSZ128rk:
2585  case X86::VPDPWSSDSZ128rkz:
2586  case X86::VPDPWSSDSZ256r:
2587  case X86::VPDPWSSDSZ256rk:
2588  case X86::VPDPWSSDSZ256rkz:
2589  case X86::VPDPWSSDSZr:
2590  case X86::VPDPWSSDSZrk:
2591  case X86::VPDPWSSDSZrkz:
2592  case X86::VPMADD52HUQZ128r:
2593  case X86::VPMADD52HUQZ128rk:
2594  case X86::VPMADD52HUQZ128rkz:
2595  case X86::VPMADD52HUQZ256r:
2596  case X86::VPMADD52HUQZ256rk:
2597  case X86::VPMADD52HUQZ256rkz:
2598  case X86::VPMADD52HUQZr:
2599  case X86::VPMADD52HUQZrk:
2600  case X86::VPMADD52HUQZrkz:
2601  case X86::VPMADD52LUQZ128r:
2602  case X86::VPMADD52LUQZ128rk:
2603  case X86::VPMADD52LUQZ128rkz:
2604  case X86::VPMADD52LUQZ256r:
2605  case X86::VPMADD52LUQZ256rk:
2606  case X86::VPMADD52LUQZ256rkz:
2607  case X86::VPMADD52LUQZr:
2608  case X86::VPMADD52LUQZrk:
2609  case X86::VPMADD52LUQZrkz: {
2610  unsigned CommutableOpIdx1 = 2;
2611  unsigned CommutableOpIdx2 = 3;
2612  if (X86II::isKMasked(Desc.TSFlags)) {
2613  // Skip the mask register.
2614  ++CommutableOpIdx1;
2615  ++CommutableOpIdx2;
2616  }
2617  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2618  CommutableOpIdx1, CommutableOpIdx2))
2619  return false;
2620  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2621  !MI.getOperand(SrcOpIdx2).isReg())
2622  // No idea.
2623  return false;
2624  return true;
2625  }
2626 
2627  default:
2628  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2629  MI.getDesc().TSFlags);
2630  if (FMA3Group)
2631  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
2632  FMA3Group->isIntrinsic());
2633 
2634  // Handle masked instructions, since we need to skip over the mask input
2635  // and the preserved input.
2636  if (X86II::isKMasked(Desc.TSFlags)) {
2637  // First assume that the first input is the mask operand and skip past it.
2638  unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
2639  unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
2640  // Check if the first input is tied. If it isn't, then we only need to
2641  // skip the mask operand, which we did above.
2642  if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
2643  MCOI::TIED_TO) != -1)) {
2644  // If this is a zero-masking instruction with a tied operand, we need to
2645  // move the first index back to the first input since this must
2646  // be a 3 input instruction and we want the first two non-mask inputs.
2647  // Otherwise this is a 2 input instruction with a preserved input and
2648  // mask, so we need to move the indices to skip one more input.
2649  if (X86II::isKMergeMasked(Desc.TSFlags)) {
2650  ++CommutableOpIdx1;
2651  ++CommutableOpIdx2;
2652  } else {
2653  --CommutableOpIdx1;
2654  }
2655  }
2656 
2657  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2658  CommutableOpIdx1, CommutableOpIdx2))
2659  return false;
2660 
2661  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2662  !MI.getOperand(SrcOpIdx2).isReg())
2663  // No idea.
2664  return false;
2665  return true;
2666  }
2667 
2668  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2669  }
2670  return false;
2671 }
2672 
X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
2674  switch (MI.getOpcode()) {
2675  default: return X86::COND_INVALID;
2676  case X86::JCC_1:
2677  return static_cast<X86::CondCode>(
2678  MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
2679  }
2680 }
2681 
2682 /// Return condition code of a SETCC opcode.
X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
2684  switch (MI.getOpcode()) {
2685  default: return X86::COND_INVALID;
2686  case X86::SETCCr: case X86::SETCCm:
2687  return static_cast<X86::CondCode>(
2688  MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
2689  }
2690 }
2691 
2692 /// Return condition code of a CMov opcode.
X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
2694  switch (MI.getOpcode()) {
2695  default: return X86::COND_INVALID;
2696  case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr:
2697  case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm:
2698  return static_cast<X86::CondCode>(
2699  MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
2700  }
2701 }
2702 
2703 /// Return the inverse of the specified condition,
2704 /// e.g. turning COND_E to COND_NE.
X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
2706  switch (CC) {
2707  default: llvm_unreachable("Illegal condition code!");
2708  case X86::COND_E: return X86::COND_NE;
2709  case X86::COND_NE: return X86::COND_E;
2710  case X86::COND_L: return X86::COND_GE;
2711  case X86::COND_LE: return X86::COND_G;
2712  case X86::COND_G: return X86::COND_LE;
2713  case X86::COND_GE: return X86::COND_L;
2714  case X86::COND_B: return X86::COND_AE;
2715  case X86::COND_BE: return X86::COND_A;
2716  case X86::COND_A: return X86::COND_BE;
2717  case X86::COND_AE: return X86::COND_B;
2718  case X86::COND_S: return X86::COND_NS;
2719  case X86::COND_NS: return X86::COND_S;
2720  case X86::COND_P: return X86::COND_NP;
2721  case X86::COND_NP: return X86::COND_P;
2722  case X86::COND_O: return X86::COND_NO;
2723  case X86::COND_NO: return X86::COND_O;
2726  }
2727 }
2728 
2729 /// Assuming the flags are set by MI(a,b), return the condition code if we
2730 /// modify the instructions such that flags are set by MI(b,a).
static X86::CondCode getSwappedCondition(X86::CondCode CC) {
2732  switch (CC) {
2733  default: return X86::COND_INVALID;
2734  case X86::COND_E: return X86::COND_E;
2735  case X86::COND_NE: return X86::COND_NE;
2736  case X86::COND_L: return X86::COND_G;
2737  case X86::COND_LE: return X86::COND_GE;
2738  case X86::COND_G: return X86::COND_L;
2739  case X86::COND_GE: return X86::COND_LE;
2740  case X86::COND_B: return X86::COND_A;
2741  case X86::COND_BE: return X86::COND_AE;
2742  case X86::COND_A: return X86::COND_B;
2743  case X86::COND_AE: return X86::COND_BE;
2744  }
2745 }
2746 
2747 std::pair<X86::CondCode, bool>
X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
  X86::CondCode CC = X86::COND_INVALID;
2750  bool NeedSwap = false;
2751  switch (Predicate) {
2752  default: break;
2753  // Floating-point Predicates
2754  case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
2755  case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
2756  case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
2757  case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
2758  case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
2759  case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
2760  case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
2761  case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
2762  case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
2763  case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
2764  case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
2765  case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
  case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
2767  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
2768 
2769  // Integer Predicates
2770  case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
2771  case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
2772  case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
2773  case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
2774  case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
2775  case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
2776  case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
2777  case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
2778  case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
2779  case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
2780  }
2781 
2782  return std::make_pair(CC, NeedSwap);
2783 }
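
// For example, getX86ConditionCode(CmpInst::FCMP_OLT) returns
// {X86::COND_A, /*NeedSwap=*/true}: an ordered less-than is handled by
// swapping the operands and testing the unsigned "above" condition, exactly
// like the FCMP_OGT case above.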
2784 
2785 /// Return a setcc opcode based on whether it has a memory operand.
2786 unsigned X86::getSETOpc(bool HasMemoryOperand) {
2787  return HasMemoryOperand ? X86::SETCCr : X86::SETCCm;
2788 }
2789 
2790 /// Return a cmov opcode for the given register size in bytes, and whether it has a memory operand.
2791 unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
2792  switch(RegBytes) {
2793  default: llvm_unreachable("Illegal register size!");
2794  case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
2795  case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
2796  case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
2797  }
2798 }
2799 
2800 /// Get the VPCMP immediate for the given condition.
unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
2802  switch (CC) {
2803  default: llvm_unreachable("Unexpected SETCC condition");
2804  case ISD::SETNE: return 4;
2805  case ISD::SETEQ: return 0;
2806  case ISD::SETULT:
2807  case ISD::SETLT: return 1;
2808  case ISD::SETUGT:
2809  case ISD::SETGT: return 6;
2810  case ISD::SETUGE:
2811  case ISD::SETGE: return 5;
2812  case ISD::SETULE:
2813  case ISD::SETLE: return 2;
2814  }
2815 }
2816 
2817 /// Get the VPCMP immediate if the operands are swapped.
2818 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
2819  switch (Imm) {
2820  default: llvm_unreachable("Unreachable!");
2821  case 0x01: Imm = 0x06; break; // LT -> NLE
2822  case 0x02: Imm = 0x05; break; // LE -> NLT
2823  case 0x05: Imm = 0x02; break; // NLT -> LE
2824  case 0x06: Imm = 0x01; break; // NLE -> LT
2825  case 0x00: // EQ
2826  case 0x03: // FALSE
2827  case 0x04: // NE
2828  case 0x07: // TRUE
2829  break;
2830  }
2831 
2832  return Imm;
2833 }
2834 
2835 /// Get the VPCOM immediate if the operands are swapped.
2836 unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
2837  switch (Imm) {
2838  default: llvm_unreachable("Unreachable!");
2839  case 0x00: Imm = 0x02; break; // LT -> GT
2840  case 0x01: Imm = 0x03; break; // LE -> GE
2841  case 0x02: Imm = 0x00; break; // GT -> LT
2842  case 0x03: Imm = 0x01; break; // GE -> LE
2843  case 0x04: // EQ
2844  case 0x05: // NE
2845  case 0x06: // FALSE
2846  case 0x07: // TRUE
2847  break;
2848  }
2849 
2850  return Imm;
2851 }
2852 
2853 /// Get the VCMP immediate if the operands are swapped.
2854 unsigned X86::getSwappedVCMPImm(unsigned Imm) {
2855  // Only need the lower 2 bits to distinguish.
2856  switch (Imm & 0x3) {
2857  default: llvm_unreachable("Unreachable!");
2858  case 0x00: case 0x03:
2859  // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
2860  break;
2861  case 0x01: case 0x02:
2862  // Need to toggle bits 3:0. Bit 4 stays the same.
2863  Imm ^= 0xf;
2864  break;
2865  }
2866 
2867  return Imm;
2868 }
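
// For example, the AVX predicate 0x01 (LT_OS) becomes 0x0E (GT_OS) and 0x02
// (LE_OS) becomes 0x0D (GE_OS); predicates whose low two bits are 0x0 or 0x3
// (the EQ/NEQ/ORD/UNORD/TRUE/FALSE variants) are symmetric and are returned
// unchanged.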
2869 
bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
2871  switch (MI.getOpcode()) {
2872  case X86::TCRETURNdi:
2873  case X86::TCRETURNri:
2874  case X86::TCRETURNmi:
2875  case X86::TCRETURNdi64:
2876  case X86::TCRETURNri64:
2877  case X86::TCRETURNmi64:
2878  return true;
2879  default:
2880  return false;
2881  }
2882 }
2883 
bool X86InstrInfo::canMakeTailCallConditional(
2885  SmallVectorImpl<MachineOperand> &BranchCond,
2886  const MachineInstr &TailCall) const {
2887  if (TailCall.getOpcode() != X86::TCRETURNdi &&
2888  TailCall.getOpcode() != X86::TCRETURNdi64) {
2889  // Only direct calls can be done with a conditional branch.
2890  return false;
2891  }
2892 
2893  const MachineFunction *MF = TailCall.getParent()->getParent();
2894  if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
2895  // Conditional tail calls confuse the Win64 unwinder.
2896  return false;
2897  }
2898 
2899  assert(BranchCond.size() == 1);
2900  if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
2901  // Can't make a conditional tail call with this condition.
2902  return false;
2903  }
2904 
  const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
2906  if (X86FI->getTCReturnAddrDelta() != 0 ||
2907  TailCall.getOperand(1).getImm() != 0) {
2908  // A conditional tail call cannot do any stack adjustment.
2909  return false;
2910  }
2911 
2912  return true;
2913 }
2914 
void X86InstrInfo::replaceBranchWithTailCall(
    MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
2917  const MachineInstr &TailCall) const {
2919 
  MachineBasicBlock::iterator I = MBB.end();
2921  while (I != MBB.begin()) {
2922  --I;
2923  if (I->isDebugInstr())
2924  continue;
2925  if (!I->isBranch())
2926  assert(0 && "Can't find the branch to replace!");
2927 
  X86::CondCode CC = X86::getCondFromBranch(*I);
2929  assert(BranchCond.size() == 1);
2930  if (CC != BranchCond[0].getImm())
2931  continue;
2932 
2933  break;
2934  }
2935 
2936  unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
2937  : X86::TCRETURNdi64cc;
2938 
2939  auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
2940  MIB->addOperand(TailCall.getOperand(0)); // Destination.
2941  MIB.addImm(0); // Stack offset (not used).
2942  MIB->addOperand(BranchCond[0]); // Condition.
2943  MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
2944 
2945  // Add implicit uses and defs of all live regs potentially clobbered by the
2946  // call. This way they still appear live across the call.
2947  LivePhysRegs LiveRegs(getRegisterInfo());
2948  LiveRegs.addLiveOuts(MBB);
  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
2950  LiveRegs.stepForward(*MIB, Clobbers);
2951  for (const auto &C : Clobbers) {
2952  MIB.addReg(C.first, RegState::Implicit);
  MIB.addReg(C.first, RegState::Implicit | RegState::Define);
2954  }
2955 
2956  I->eraseFromParent();
2957 }
2958 
2959 // Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
2960 // not be a fallthrough MBB now due to layout changes). Return nullptr if the
2961 // fallthrough MBB cannot be identified.
2963  MachineBasicBlock *TBB) {
2964  // Look for non-EHPad successors other than TBB. If we find exactly one, it
2965  // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
2966  // and fallthrough MBB. If we find more than one, we cannot identify the
2967  // fallthrough MBB and should return nullptr.
2968  MachineBasicBlock *FallthroughBB = nullptr;
2969  for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
2970  if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
2971  continue;
2972  // Return a nullptr if we found more than one fallthrough successor.
2973  if (FallthroughBB && FallthroughBB != TBB)
2974  return nullptr;
2975  FallthroughBB = *SI;
2976  }
2977  return FallthroughBB;
2978 }
2979 
2980 bool X86InstrInfo::AnalyzeBranchImpl(
    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
    SmallVectorImpl<MachineOperand> &Cond,
2983  SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
2984 
2985  // Start from the bottom of the block and work up, examining the
2986  // terminator instructions.
  MachineBasicBlock::iterator I = MBB.end();
2988  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
2989  while (I != MBB.begin()) {
2990  --I;
2991  if (I->isDebugInstr())
2992  continue;
2993 
2994  // Working from the bottom, when we see a non-terminator instruction, we're
2995  // done.
2996  if (!isUnpredicatedTerminator(*I))
2997  break;
2998 
2999  // A terminator that isn't a branch can't easily be handled by this
3000  // analysis.
3001  if (!I->isBranch())
3002  return true;
3003 
3004  // Handle unconditional branches.
3005  if (I->getOpcode() == X86::JMP_1) {
3006  UnCondBrIter = I;
3007 
3008  if (!AllowModify) {
3009  TBB = I->getOperand(0).getMBB();
3010  continue;
3011  }
3012 
3013  // If the block has any instructions after a JMP, delete them.
3014  while (std::next(I) != MBB.end())
3015  std::next(I)->eraseFromParent();
3016 
3017  Cond.clear();
3018  FBB = nullptr;
3019 
3020  // Delete the JMP if it's equivalent to a fall-through.
3021  if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3022  TBB = nullptr;
3023  I->eraseFromParent();
3024  I = MBB.end();
3025  UnCondBrIter = MBB.end();
3026  continue;
3027  }
3028 
3029  // TBB is used to indicate the unconditional destination.
3030  TBB = I->getOperand(0).getMBB();
3031  continue;
3032  }
3033 
3034  // Handle conditional branches.
3035  X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3036  if (BranchCode == X86::COND_INVALID)
3037  return true; // Can't handle indirect branch.
3038 
3039  // In practice we should never have an undef EFLAGS operand; if we do,
3040  // abort here, as we are not prepared to preserve the flag.
3041  if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
3042  return true;
3043 
3044  // Working from the bottom, handle the first conditional branch.
3045  if (Cond.empty()) {
3046  MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
3047  if (AllowModify && UnCondBrIter != MBB.end() &&
3048  MBB.isLayoutSuccessor(TargetBB)) {
3049  // If we can modify the code and it ends in something like:
3050  //
3051  // jCC L1
3052  // jmp L2
3053  // L1:
3054  // ...
3055  // L2:
3056  //
3057  // Then we can change this to:
3058  //
3059  // jnCC L2
3060  // L1:
3061  // ...
3062  // L2:
3063  //
3064  // Which is a bit more efficient.
3065  // We conditionally jump to the fall-through block.
3066  BranchCode = GetOppositeBranchCondition(BranchCode);
3067  MachineBasicBlock::iterator OldInst = I;
3068 
3069  BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1))
3070  .addMBB(UnCondBrIter->getOperand(0).getMBB())
3071  .addImm(BranchCode);
3072  BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
3073  .addMBB(TargetBB);
3074 
3075  OldInst->eraseFromParent();
3076  UnCondBrIter->eraseFromParent();
3077 
3078  // Restart the analysis.
3079  UnCondBrIter = MBB.end();
3080  I = MBB.end();
3081  continue;
3082  }
3083 
3084  FBB = TBB;
3085  TBB = I->getOperand(0).getMBB();
3086  Cond.push_back(MachineOperand::CreateImm(BranchCode));
3087  CondBranches.push_back(&*I);
3088  continue;
3089  }
3090 
3091  // Handle subsequent conditional branches. Only handle the case where all
3092  // conditional branches branch to the same destination and their condition
3093  // opcodes fit one of the special multi-branch idioms.
3094  assert(Cond.size() == 1);
3095  assert(TBB);
3096 
3097  // If the conditions are the same, we can leave them alone.
3098  X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3099  auto NewTBB = I->getOperand(0).getMBB();
3100  if (OldBranchCode == BranchCode && TBB == NewTBB)
3101  continue;
3102 
3103  // If they differ, see if they fit one of the known patterns. Theoretically,
3104  // we could handle more patterns here, but we shouldn't expect to see them
3105  // if instruction selection has done a reasonable job.
3106  if (TBB == NewTBB &&
3107  ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3108  (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3109  BranchCode = X86::COND_NE_OR_P;
3110  } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3111  (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3112  if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3113  return true;
3114 
3115  // X86::COND_E_AND_NP usually has two different branch destinations.
3116  //
3117  // JP B1
3118  // JE B2
3119  // JMP B1
3120  // B1:
3121  // B2:
3122  //
3123  // Here this condition branches to B2 only if NP && E. It has another
3124  // equivalent form:
3125  //
3126  // JNE B1
3127  // JNP B2
3128  // JMP B1
3129  // B1:
3130  // B2:
3131  //
3132  // Similarly it branches to B2 only if E && NP. That is why this condition
3133  // is named with COND_E_AND_NP.
3134  BranchCode = X86::COND_E_AND_NP;
3135  } else
3136  return true;
3137 
3138  // Update the MachineOperand.
3139  Cond[0].setImm(BranchCode);
3140  CondBranches.push_back(&*I);
3141  }
3142 
3143  return false;
3144 }
3145 
bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
3147  MachineBasicBlock *&TBB,
3148  MachineBasicBlock *&FBB,
  SmallVectorImpl<MachineOperand> &Cond,
3150  bool AllowModify) const {
3151  SmallVector<MachineInstr *, 4> CondBranches;
3152  return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3153 }
3154 
bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
3156  MachineBranchPredicate &MBP,
3157  bool AllowModify) const {
3158  using namespace std::placeholders;
3159 
  SmallVector<MachineOperand, 4> Cond;
3161  SmallVector<MachineInstr *, 4> CondBranches;
3162  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
3163  AllowModify))
3164  return true;
3165 
3166  if (Cond.size() != 1)
3167  return true;
3168 
3169  assert(MBP.TrueDest && "expected!");
3170 
3171  if (!MBP.FalseDest)
3172  MBP.FalseDest = MBB.getNextNode();
3173 
  const TargetRegisterInfo *TRI = &getRegisterInfo();
3175 
3176  MachineInstr *ConditionDef = nullptr;
3177  bool SingleUseCondition = true;
3178 
3179  for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
3180  if (I->modifiesRegister(X86::EFLAGS, TRI)) {
3181  ConditionDef = &*I;
3182  break;
3183  }
3184 
3185  if (I->readsRegister(X86::EFLAGS, TRI))
3186  SingleUseCondition = false;
3187  }
3188 
3189  if (!ConditionDef)
3190  return true;
3191 
3192  if (SingleUseCondition) {
3193  for (auto *Succ : MBB.successors())
3194  if (Succ->isLiveIn(X86::EFLAGS))
3195  SingleUseCondition = false;
3196  }
3197 
3198  MBP.ConditionDef = ConditionDef;
3199  MBP.SingleUseCondition = SingleUseCondition;
3200 
3201  // Currently we only recognize the simple pattern:
3202  //
3203  // test %reg, %reg
3204  // je %label
3205  //
3206  const unsigned TestOpcode =
3207  Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
3208 
3209  if (ConditionDef->getOpcode() == TestOpcode &&
3210  ConditionDef->getNumOperands() == 3 &&
3211  ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
3212  (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
3213  MBP.LHS = ConditionDef->getOperand(0);
3214  MBP.RHS = MachineOperand::CreateImm(0);
3215  MBP.Predicate = Cond[0].getImm() == X86::COND_NE
                        ? MachineBranchPredicate::PRED_NE
                        : MachineBranchPredicate::PRED_EQ;
3218  return false;
3219  }
3220 
3221  return true;
3222 }
3223 
unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
3225  int *BytesRemoved) const {
3226  assert(!BytesRemoved && "code size not handled");
3227 
  MachineBasicBlock::iterator I = MBB.end();
3229  unsigned Count = 0;
3230 
3231  while (I != MBB.begin()) {
3232  --I;
3233  if (I->isDebugInstr())
3234  continue;
3235  if (I->getOpcode() != X86::JMP_1 &&
      X86::getCondFromBranch(*I) == X86::COND_INVALID)
3237  break;
3238  // Remove the branch.
3239  I->eraseFromParent();
3240  I = MBB.end();
3241  ++Count;
3242  }
3243 
3244  return Count;
3245 }
3246 
unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
3248  MachineBasicBlock *TBB,
3249  MachineBasicBlock *FBB,
  ArrayRef<MachineOperand> Cond,
3251  const DebugLoc &DL,
3252  int *BytesAdded) const {
3253  // Shouldn't be a fall through.
3254  assert(TBB && "insertBranch must not be told to insert a fallthrough");
3255  assert((Cond.size() == 1 || Cond.size() == 0) &&
3256  "X86 branch conditions have one component!");
3257  assert(!BytesAdded && "code size not handled");
3258 
3259  if (Cond.empty()) {
3260  // Unconditional branch?
3261  assert(!FBB && "Unconditional branch with multiple successors!");
3262  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
3263  return 1;
3264  }
3265 
3266  // If FBB is null, it is implied to be a fall-through block.
3267  bool FallThru = FBB == nullptr;
3268 
3269  // Conditional branch.
3270  unsigned Count = 0;
3271  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
3272  switch (CC) {
3273  case X86::COND_NE_OR_P:
3274  // Synthesize NE_OR_P with two branches.
3275  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
3276  ++Count;
3277  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
3278  ++Count;
3279  break;
3280  case X86::COND_E_AND_NP:
3281  // Use the next block of MBB as FBB if it is null.
3282  if (FBB == nullptr) {
3283  FBB = getFallThroughMBB(&MBB, TBB);
3284  assert(FBB && "MBB cannot be the last block in function when the false "
3285  "body is a fall-through.");
3286  }
3287  // Synthesize COND_E_AND_NP with two branches.
3288  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
3289  ++Count;
3290  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
3291  ++Count;
3292  break;
3293  default: {
3294  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
3295  ++Count;
3296  }
3297  }
3298  if (!FallThru) {
3299  // Two-way Conditional branch. Insert the second branch.
3300  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
3301  ++Count;
3302  }
3303  return Count;
3304 }
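// Editor's note (illustrative sketch): with Cond = {COND_NE_OR_P}, TBB = %bb.t
// and FBB = %bb.f, the code above emits
//   JCC_1 %bb.t, COND_NE
//   JCC_1 %bb.t, COND_P
//   JMP_1 %bb.f
// and returns 3; a plain single condition produces one JCC_1 plus an optional
// trailing JMP_1 when FBB is not a fall-through.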
3305 
3306 bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3307  ArrayRef<MachineOperand> Cond,
3308  Register DstReg, Register TrueReg,
3309  Register FalseReg, int &CondCycles,
3310  int &TrueCycles, int &FalseCycles) const {
3311  // Not all subtargets have cmov instructions.
3312  if (!Subtarget.hasCMov())
3313  return false;
3314  if (Cond.size() != 1)
3315  return false;
3316  // We cannot do the composite conditions, at least not in SSA form.
3317  if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
3318  return false;
3319 
3320  // Check register classes.
3321  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3322  const TargetRegisterClass *RC =
3323  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
3324  if (!RC)
3325  return false;
3326 
3327  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
3328  if (X86::GR16RegClass.hasSubClassEq(RC) ||
3329  X86::GR32RegClass.hasSubClassEq(RC) ||
3330  X86::GR64RegClass.hasSubClassEq(RC)) {
3331  // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
3332  // Bridge. Probably Ivy Bridge as well.
3333  CondCycles = 2;
3334  TrueCycles = 2;
3335  FalseCycles = 2;
3336  return true;
3337  }
3338 
3339  // Can't do vectors.
3340  return false;
3341 }
3342 
3343 void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
3344  MachineBasicBlock::iterator I,
3345  const DebugLoc &DL, Register DstReg,
3346  ArrayRef<MachineOperand> Cond, Register TrueReg,
3347  Register FalseReg) const {
3348  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3349  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
3350  const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
3351  assert(Cond.size() == 1 && "Invalid Cond array");
3352  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
3353  false /*HasMemoryOperand*/);
3354  BuildMI(MBB, I, DL, get(Opc), DstReg)
3355  .addReg(FalseReg)
3356  .addReg(TrueReg)
3357  .addImm(Cond[0].getImm());
3358 }
3359 
3360 /// Test if the given register is a physical h register.
3361 static bool isHReg(unsigned Reg) {
3362  return X86::GR8_ABCD_HRegClass.contains(Reg);
3363 }
3364 
3365 // Try to copy between VR128/VR64 and GR64 registers.
3366 static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
3367  const X86Subtarget &Subtarget) {
3368  bool HasAVX = Subtarget.hasAVX();
3369  bool HasAVX512 = Subtarget.hasAVX512();
3370 
3371  // SrcReg(MaskReg) -> DestReg(GR64)
3372  // SrcReg(MaskReg) -> DestReg(GR32)
3373 
3374  // All KMASK RegClasses hold the same k registers, so testing against any one of them works.
3375  if (X86::VK16RegClass.contains(SrcReg)) {
3376  if (X86::GR64RegClass.contains(DestReg)) {
3377  assert(Subtarget.hasBWI());
3378  return X86::KMOVQrk;
3379  }
3380  if (X86::GR32RegClass.contains(DestReg))
3381  return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
3382  }
3383 
3384  // SrcReg(GR64) -> DestReg(MaskReg)
3385  // SrcReg(GR32) -> DestReg(MaskReg)
3386 
3387  // All KMASK RegClasses hold the same k registers, so testing against any one of them works.
3388  if (X86::VK16RegClass.contains(DestReg)) {
3389  if (X86::GR64RegClass.contains(SrcReg)) {
3390  assert(Subtarget.hasBWI());
3391  return X86::KMOVQkr;
3392  }
3393  if (X86::GR32RegClass.contains(SrcReg))
3394  return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
3395  }
3396 
3397 
3398  // SrcReg(VR128) -> DestReg(GR64)
3399  // SrcReg(VR64) -> DestReg(GR64)
3400  // SrcReg(GR64) -> DestReg(VR128)
3401  // SrcReg(GR64) -> DestReg(VR64)
3402 
3403  if (X86::GR64RegClass.contains(DestReg)) {
3404  if (X86::VR128XRegClass.contains(SrcReg))
3405  // Copy from a VR128 register to a GR64 register.
3406  return HasAVX512 ? X86::VMOVPQIto64Zrr :
3407  HasAVX ? X86::VMOVPQIto64rr :
3408  X86::MOVPQIto64rr;
3409  if (X86::VR64RegClass.contains(SrcReg))
3410  // Copy from a VR64 register to a GR64 register.
3411  return X86::MMX_MOVD64from64rr;
3412  } else if (X86::GR64RegClass.contains(SrcReg)) {
3413  // Copy from a GR64 register to a VR128 register.
3414  if (X86::VR128XRegClass.contains(DestReg))
3415  return HasAVX512 ? X86::VMOV64toPQIZrr :
3416  HasAVX ? X86::VMOV64toPQIrr :
3417  X86::MOV64toPQIrr;
3418  // Copy from a GR64 register to a VR64 register.
3419  if (X86::VR64RegClass.contains(DestReg))
3420  return X86::MMX_MOVD64to64rr;
3421  }
3422 
3423  // SrcReg(VR128) -> DestReg(GR32)
3424  // SrcReg(GR32) -> DestReg(VR128)
3425 
3426  if (X86::GR32RegClass.contains(DestReg) &&
3427  X86::VR128XRegClass.contains(SrcReg))
3428  // Copy from a VR128 register to a GR32 register.
3429  return HasAVX512 ? X86::VMOVPDI2DIZrr :
3430  HasAVX ? X86::VMOVPDI2DIrr :
3431  X86::MOVPDI2DIrr;
3432 
3433  if (X86::VR128XRegClass.contains(DestReg) &&
3434  X86::GR32RegClass.contains(SrcReg))
3435  // Copy from a GR32 register to a VR128 register.
3436  return HasAVX512 ? X86::VMOVDI2PDIZrr :
3437  HasAVX ? X86::VMOVDI2PDIrr :
3438  X86::MOVDI2PDIrr;
3439  return 0;
3440 }
3441 
3442 void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
3443  MachineBasicBlock::iterator MI,
3444  const DebugLoc &DL, MCRegister DestReg,
3445  MCRegister SrcReg, bool KillSrc) const {
3446  // First deal with the normal symmetric copies.
3447  bool HasAVX = Subtarget.hasAVX();
3448  bool HasVLX = Subtarget.hasVLX();
3449  unsigned Opc = 0;
3450  if (X86::GR64RegClass.contains(DestReg, SrcReg))
3451  Opc = X86::MOV64rr;
3452  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
3453  Opc = X86::MOV32rr;
3454  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
3455  Opc = X86::MOV16rr;
3456  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
3457  // Copying to or from a physical H register on x86-64 requires a NOREX
3458  // move. Otherwise use a normal move.
3459  if ((isHReg(DestReg) || isHReg(SrcReg)) &&
3460  Subtarget.is64Bit()) {
3461  Opc = X86::MOV8rr_NOREX;
3462  // Both operands must be encodable without a REX prefix.
3463  assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
3464  "8-bit H register can not be copied outside GR8_NOREX");
3465  } else
3466  Opc = X86::MOV8rr;
3467  }
3468  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
3469  Opc = X86::MMX_MOVQ64rr;
3470  else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
3471  if (HasVLX)
3472  Opc = X86::VMOVAPSZ128rr;
3473  else if (X86::VR128RegClass.contains(DestReg, SrcReg))
3474  Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
3475  else {
3476  // If this is an extended register and we don't have VLX we need to use a
3477  // 512-bit move.
3478  Opc = X86::VMOVAPSZrr;
3479  const TargetRegisterInfo *TRI = &getRegisterInfo();
3480  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
3481  &X86::VR512RegClass);
3482  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
3483  &X86::VR512RegClass);
3484  }
3485  } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
3486  if (HasVLX)
3487  Opc = X86::VMOVAPSZ256rr;
3488  else if (X86::VR256RegClass.contains(DestReg, SrcReg))
3489  Opc = X86::VMOVAPSYrr;
3490  else {
3491  // If this is an extended register and we don't have VLX we need to use a
3492  // 512-bit move.
3493  Opc = X86::VMOVAPSZrr;
3494  const TargetRegisterInfo *TRI = &getRegisterInfo();
3495  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
3496  &X86::VR512RegClass);
3497  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
3498  &X86::VR512RegClass);
3499  }
3500  } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
3501  Opc = X86::VMOVAPSZrr;
3502  // All KMASK RegClasses hold the same k registers, so testing against any one of them works.
3503  else if (X86::VK16RegClass.contains(DestReg, SrcReg))
3504  Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
3505  if (!Opc)
3506  Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
3507 
3508  if (Opc) {
3509  BuildMI(MBB, MI, DL, get(Opc), DestReg)
3510  .addReg(SrcReg, getKillRegState(KillSrc));
3511  return;
3512  }
3513 
3514  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
3515  // FIXME: We use a fatal error here because historically LLVM has tried
3516  // to lower some of these physreg copies and we want to ensure we get
3517  // reasonable bug reports if someone encounters a case no other testing
3518  // found. This path should be removed after the LLVM 7 release.
3519  report_fatal_error("Unable to copy EFLAGS physical register!");
3520  }
3521 
3522  LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
3523  << RI.getName(DestReg) << '\n');
3524  report_fatal_error("Cannot emit physreg copy instruction");
3525 }
3526 
3527 Optional<DestSourcePair>
3528 X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
3529  if (MI.isMoveReg())
3530  return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
3531  return None;
3532 }
3533 
3534 static unsigned getLoadStoreRegOpcode(Register Reg,
3535  const TargetRegisterClass *RC,
3536  bool IsStackAligned,
3537  const X86Subtarget &STI, bool load) {
3538  bool HasAVX = STI.hasAVX();
3539  bool HasAVX512 = STI.hasAVX512();
3540  bool HasVLX = STI.hasVLX();
3541 
3542  switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
3543  default:
3544  llvm_unreachable("Unknown spill size");
3545  case 1:
3546  assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
3547  if (STI.is64Bit())
3548  // Copying to or from a physical H register on x86-64 requires a NOREX
3549  // move. Otherwise use a normal move.
3550  if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
3551  return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
3552  return load ? X86::MOV8rm : X86::MOV8mr;
3553  case 2:
3554  if (X86::VK16RegClass.hasSubClassEq(RC))
3555  return load ? X86::KMOVWkm : X86::KMOVWmk;
3556  assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
3557  return load ? X86::MOV16rm : X86::MOV16mr;
3558  case 4:
3559  if (X86::GR32RegClass.hasSubClassEq(RC))
3560  return load ? X86::MOV32rm : X86::MOV32mr;
3561  if (X86::FR32XRegClass.hasSubClassEq(RC))
3562  return load ?
3563  (HasAVX512 ? X86::VMOVSSZrm_alt :
3564  HasAVX ? X86::VMOVSSrm_alt :
3565  X86::MOVSSrm_alt) :
3566  (HasAVX512 ? X86::VMOVSSZmr :
3567  HasAVX ? X86::VMOVSSmr :
3568  X86::MOVSSmr);
3569  if (X86::RFP32RegClass.hasSubClassEq(RC))
3570  return load ? X86::LD_Fp32m : X86::ST_Fp32m;
3571  if (X86::VK32RegClass.hasSubClassEq(RC)) {
3572  assert(STI.hasBWI() && "KMOVD requires BWI");
3573  return load ? X86::KMOVDkm : X86::KMOVDmk;
3574  }
3575  // All of these mask pair classes have the same spill size, so the same kind
3576  // of kmov instructions can be used with all of them.
3577  if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
3578  X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
3579  X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
3580  X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
3581  X86::VK16PAIRRegClass.hasSubClassEq(RC))
3582  return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
3583  llvm_unreachable("Unknown 4-byte regclass");
3584  case 8:
3585  if (X86::GR64RegClass.hasSubClassEq(RC))
3586  return load ? X86::MOV64rm : X86::MOV64mr;
3587  if (X86::FR64XRegClass.hasSubClassEq(RC))
3588  return load ?
3589  (HasAVX512 ? X86::VMOVSDZrm_alt :
3590  HasAVX ? X86::VMOVSDrm_alt :
3591  X86::MOVSDrm_alt) :
3592  (HasAVX512 ? X86::VMOVSDZmr :
3593  HasAVX ? X86::VMOVSDmr :
3594  X86::MOVSDmr);
3595  if (X86::VR64RegClass.hasSubClassEq(RC))
3596  return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
3597  if (X86::RFP64RegClass.hasSubClassEq(RC))
3598  return load ? X86::LD_Fp64m : X86::ST_Fp64m;
3599  if (X86::VK64RegClass.hasSubClassEq(RC)) {
3600  assert(STI.hasBWI() && "KMOVQ requires BWI");
3601  return load ? X86::KMOVQkm : X86::KMOVQmk;
3602  }
3603  llvm_unreachable("Unknown 8-byte regclass");
3604  case 10:
3605  assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
3606  return load ? X86::LD_Fp80m : X86::ST_FpP80m;
3607  case 16: {
3608  if (X86::VR128XRegClass.hasSubClassEq(RC)) {
3609  // If stack is realigned we can use aligned stores.
3610  if (IsStackAligned)
3611  return load ?
3612  (HasVLX ? X86::VMOVAPSZ128rm :
3613  HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
3614  HasAVX ? X86::VMOVAPSrm :
3615  X86::MOVAPSrm):
3616  (HasVLX ? X86::VMOVAPSZ128mr :
3617  HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
3618  HasAVX ? X86::VMOVAPSmr :
3619  X86::MOVAPSmr);
3620  else
3621  return load ?
3622  (HasVLX ? X86::VMOVUPSZ128rm :
3623  HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
3624  HasAVX ? X86::VMOVUPSrm :
3625  X86::MOVUPSrm):
3626  (HasVLX ? X86::VMOVUPSZ128mr :
3627  HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
3628  HasAVX ? X86::VMOVUPSmr :
3629  X86::MOVUPSmr);
3630  }
3631  if (X86::BNDRRegClass.hasSubClassEq(RC)) {
3632  if (STI.is64Bit())
3633  return load ? X86::BNDMOV64rm : X86::BNDMOV64mr;
3634  else
3635  return load ? X86::BNDMOV32rm : X86::BNDMOV32mr;
3636  }
3637  llvm_unreachable("Unknown 16-byte regclass");
3638  }
3639  case 32:
3640  assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
3641  // If stack is realigned we can use aligned stores.
3642  if (IsStackAligned)
3643  return load ?
3644  (HasVLX ? X86::VMOVAPSZ256rm :
3645  HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
3646  X86::VMOVAPSYrm) :
3647  (HasVLX ? X86::VMOVAPSZ256mr :
3648  HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
3649  X86::VMOVAPSYmr);
3650  else
3651  return load ?
3652  (HasVLX ? X86::VMOVUPSZ256rm :
3653  HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
3654  X86::VMOVUPSYrm) :
3655  (HasVLX ? X86::VMOVUPSZ256mr :
3656  HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
3657  X86::VMOVUPSYmr);
3658  case 64:
3659  assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
3660  assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
3661  if (IsStackAligned)
3662  return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
3663  else
3664  return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
3665  }
3666 }
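// Editor's note (illustrative): for a 16-byte spill of an XMM register the
// switch above picks MOVAPS/MOVUPS on plain SSE, the VEX VMOVAPS/VMOVUPS forms
// with AVX, and the Z128 variants with AVX512VL so that XMM16-XMM31 stay
// encodable; the aligned forms are only used when IsStackAligned is true.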
3667 
3668 Optional<ExtAddrMode>
3669 X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3670  const TargetRegisterInfo *TRI) const {
3671  const MCInstrDesc &Desc = MemI.getDesc();
3672  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3673  if (MemRefBegin < 0)
3674  return None;
3675 
3676  MemRefBegin += X86II::getOperandBias(Desc);
3677 
3678  auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
3679  if (!BaseOp.isReg()) // Can be an MO_FrameIndex
3680  return None;
3681 
3682  const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
3683  // Displacement can be symbolic
3684  if (!DispMO.isImm())
3685  return None;
3686 
3687  ExtAddrMode AM;
3688  AM.BaseReg = BaseOp.getReg();
3689  AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
3690  AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
3691  AM.Displacement = DispMO.getImm();
3692  return AM;
3693 }
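// Editor's note (worked example, not in the original source): for a load such
// as movl 8(%rdi,%rcx,4), %eax the code above returns an ExtAddrMode with
// BaseReg = %rdi, ScaledReg = %rcx, Scale = 4 and Displacement = 8.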
3694 
3695 bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
3696  const Register Reg,
3697  int64_t &ImmVal) const {
3698  if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
3699  return false;
3700  // Mov Src can be a global address.
3701  if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
3702  return false;
3703  ImmVal = MI.getOperand(1).getImm();
3704  return true;
3705 }
3706 
3707 bool X86InstrInfo::preservesZeroValueInReg(
3708  const MachineInstr *MI, const Register NullValueReg,
3709  const TargetRegisterInfo *TRI) const {
3710  if (!MI->modifiesRegister(NullValueReg, TRI))
3711  return true;
3712  switch (MI->getOpcode()) {
3713  // Shifting a null value right or left by an immediate leaves it null,
3714  // e.g. rax = shl rax, X.
3715  case X86::SHR64ri:
3716  case X86::SHR32ri:
3717  case X86::SHL64ri:
3718  case X86::SHL32ri:
3719  assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
3720  "expected for shift opcode!");
3721  return MI->getOperand(0).getReg() == NullValueReg &&
3722  MI->getOperand(1).getReg() == NullValueReg;
3723  // Zero extend of a sub-reg of NullValueReg into itself does not change the
3724  // null value.
3725  case X86::MOV32rr:
3726  return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
3727  return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
3728  });
3729  default:
3730  return false;
3731  }
3732  llvm_unreachable("Should be handled above!");
3733 }
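// Editor's note (illustrative): with NullValueReg = %rax known to hold zero,
//   shl $3, %rax   -> still zero, handled by the shift cases above
//   mov %eax, %eax -> zero-extends a zero sub-register, handled by MOV32rr
//   add $1, %rax   -> modifies the register and is not handled, so false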
3734 
3735 bool X86InstrInfo::getMemOperandsWithOffsetWidth(
3736  const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
3737  int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
3738  const TargetRegisterInfo *TRI) const {
3739  const MCInstrDesc &Desc = MemOp.getDesc();
3740  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3741  if (MemRefBegin < 0)
3742  return false;
3743 
3744  MemRefBegin += X86II::getOperandBias(Desc);
3745 
3746  const MachineOperand *BaseOp =
3747  &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
3748  if (!BaseOp->isReg()) // Can be an MO_FrameIndex
3749  return false;
3750 
3751  if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
3752  return false;
3753 
3754  if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
3755  X86::NoRegister)
3756  return false;
3757 
3758  const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
3759 
3760  // Displacement can be symbolic
3761  if (!DispMO.isImm())
3762  return false;
3763 
3764  Offset = DispMO.getImm();
3765 
3766  if (!BaseOp->isReg())
3767  return false;
3768 
3769  OffsetIsScalable = false;
3770  // FIXME: Relying on memoperands() may not be right thing to do here. Check
3771  // with X86 maintainers, and fix it accordingly. For now, it is ok, since
3772  // there is no use of `Width` for X86 back-end at the moment.
3773  Width =
3774  !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
3775  BaseOps.push_back(BaseOp);
3776  return true;
3777 }
3778 
3779 static unsigned getStoreRegOpcode(Register SrcReg,
3780  const TargetRegisterClass *RC,
3781  bool IsStackAligned,
3782  const X86Subtarget &STI) {
3783  return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
3784 }
3785 
3786 static unsigned getLoadRegOpcode(Register DestReg,
3787  const TargetRegisterClass *RC,
3788  bool IsStackAligned, const X86Subtarget &STI) {
3789  return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
3790 }
3791 
3792 void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
3793  MachineBasicBlock::iterator MI,
3794  Register SrcReg, bool isKill, int FrameIdx,
3795  const TargetRegisterClass *RC,
3796  const TargetRegisterInfo *TRI) const {
3797  const MachineFunction &MF = *MBB.getParent();
3798  const MachineFrameInfo &MFI = MF.getFrameInfo();
3799  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
3800  "Stack slot too small for store");
3801  if (RC->getID() == X86::TILERegClassID) {
3802  unsigned Opc = X86::TILESTORED;
3803  // tilestored %tmm, (%sp, %idx)
3804  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3805  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3806  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3807  MachineInstr *NewMI =
3808  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3809  .addReg(SrcReg, getKillRegState(isKill));
3810  MachineOperand &MO = NewMI->getOperand(2);
3811  MO.setReg(VirtReg);
3812  MO.setIsKill(true);
3813  } else {
3814  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3815  bool isAligned =
3816  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3817  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3818  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
3819  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3820  .addReg(SrcReg, getKillRegState(isKill));
3821  }
3822 }
3823 
3824 void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
3825  MachineBasicBlock::iterator MI,
3826  Register DestReg, int FrameIdx,
3827  const TargetRegisterClass *RC,
3828  const TargetRegisterInfo *TRI) const {
3829  if (RC->getID() == X86::TILERegClassID) {
3830  unsigned Opc = X86::TILELOADD;
3831  // tileloadd (%sp, %idx), %tmm
3832  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3833  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3834  MachineInstr *NewMI =
3835  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3836  NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3837  FrameIdx);
3838  MachineOperand &MO = NewMI->getOperand(3);
3839  MO.setReg(VirtReg);
3840  MO.setIsKill(true);
3841  } else {
3842  const MachineFunction &MF = *MBB.getParent();
3843  const MachineFrameInfo &MFI = MF.getFrameInfo();
3844  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3845  bool isAligned =
3846  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3847  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3848  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
3849  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3850  FrameIdx);
3851  }
3852 }
3853 
3854 bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
3855  Register &SrcReg2, int &CmpMask,
3856  int &CmpValue) const {
3857  switch (MI.getOpcode()) {
3858  default: break;
3859  case X86::CMP64ri32:
3860  case X86::CMP64ri8:
3861  case X86::CMP32ri:
3862  case X86::CMP32ri8:
3863  case X86::CMP16ri:
3864  case X86::CMP16ri8:
3865  case X86::CMP8ri:
3866  SrcReg = MI.getOperand(0).getReg();
3867  SrcReg2 = 0;
3868  if (MI.getOperand(1).isImm()) {
3869  CmpMask = ~0;
3870  CmpValue = MI.getOperand(1).getImm();
3871  } else {
3872  CmpMask = CmpValue = 0;
3873  }
3874  return true;
3875  // A SUB can be used to perform a comparison.
3876  case X86::SUB64rm:
3877  case X86::SUB32rm:
3878  case X86::SUB16rm:
3879  case X86::SUB8rm:
3880  SrcReg = MI.getOperand(1).getReg();
3881  SrcReg2 = 0;
3882  CmpMask = 0;
3883  CmpValue = 0;
3884  return true;
3885  case X86::SUB64rr:
3886  case X86::SUB32rr:
3887  case X86::SUB16rr:
3888  case X86::SUB8rr:
3889  SrcReg = MI.getOperand(1).getReg();
3890  SrcReg2 = MI.getOperand(2).getReg();
3891  CmpMask = 0;
3892  CmpValue = 0;
3893  return true;
3894  case X86::SUB64ri32:
3895  case X86::SUB64ri8:
3896  case X86::SUB32ri:
3897  case X86::SUB32ri8:
3898  case X86::SUB16ri:
3899  case X86::SUB16ri8:
3900  case X86::SUB8ri:
3901  SrcReg = MI.getOperand(1).getReg();
3902  SrcReg2 = 0;
3903  if (MI.getOperand(2).isImm()) {
3904  CmpMask = ~0;
3905  CmpValue = MI.getOperand(2).getImm();
3906  } else {
3907  CmpMask = CmpValue = 0;
3908  }
3909  return true;
3910  case X86::CMP64rr:
3911  case X86::CMP32rr:
3912  case X86::CMP16rr:
3913  case X86::CMP8rr:
3914  SrcReg = MI.getOperand(0).getReg();
3915  SrcReg2 = MI.getOperand(1).getReg();
3916  CmpMask = 0;
3917  CmpValue = 0;
3918  return true;
3919  case X86::TEST8rr:
3920  case X86::TEST16rr:
3921  case X86::TEST32rr:
3922  case X86::TEST64rr:
3923  SrcReg = MI.getOperand(0).getReg();
3924  if (MI.getOperand(1).getReg() != SrcReg)
3925  return false;
3926  // Compare against zero.
3927  SrcReg2 = 0;
3928  CmpMask = ~0;
3929  CmpValue = 0;
3930  return true;
3931  }
3932  return false;
3933 }
3934 
3935 /// Check whether the first instruction, whose only
3936 /// purpose is to update flags, can be made redundant.
3937 /// CMPrr can be made redundant by SUBrr if the operands are the same.
3938 /// This function can be extended later on.
3939 /// SrcReg, SrcReg2: register operands for FlagI.
3940 /// ImmValue: immediate for FlagI if it takes an immediate.
3941 inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
3942  Register SrcReg, Register SrcReg2,
3943  int ImmMask, int ImmValue,
3944  const MachineInstr &OI) {
3945  if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
3946  (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
3947  (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
3948  (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) &&
3949  ((OI.getOperand(1).getReg() == SrcReg &&
3950  OI.getOperand(2).getReg() == SrcReg2) ||
3951  (OI.getOperand(1).getReg() == SrcReg2 &&
3952  OI.getOperand(2).getReg() == SrcReg)))
3953  return true;
3954 
3955  if (ImmMask != 0 &&
3956  ((FlagI.getOpcode() == X86::CMP64ri32 &&
3957  OI.getOpcode() == X86::SUB64ri32) ||
3958  (FlagI.getOpcode() == X86::CMP64ri8 &&
3959  OI.getOpcode() == X86::SUB64ri8) ||
3960  (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) ||
3961  (FlagI.getOpcode() == X86::CMP32ri8 &&
3962  OI.getOpcode() == X86::SUB32ri8) ||
3963  (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) ||
3964  (FlagI.getOpcode() == X86::CMP16ri8 &&
3965  OI.getOpcode() == X86::SUB16ri8) ||
3966  (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) &&
3967  OI.getOperand(1).getReg() == SrcReg &&
3968  OI.getOperand(2).getImm() == ImmValue)
3969  return true;
3970  return false;
3971 }
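// Editor's note (illustrative): in a sequence such as
//   subl %esi, %edi        ; sets EFLAGS exactly as the compare would
//   ...
//   cmpl %esi, %edi
// the helper above reports the CMP as redundant; the immediate forms behave
// the same way when the subtrahend matches.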
3972 
3973 /// Check whether the definition can be converted
3974 /// to remove a comparison against zero.
3975 inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
3976  bool &ClearsOverflowFlag) {
3977  NoSignFlag = false;
3978  ClearsOverflowFlag = false;
3979 
3980  switch (MI.getOpcode()) {
3981  default: return false;
3982 
3983  // The shift instructions only modify ZF if their shift count is non-zero.
3984  // N.B.: The processor truncates the shift count depending on the encoding.
3985  case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
3986  case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
3987  return getTruncatedShiftCount(MI, 2) != 0;
3988 
3989  // Some left shift instructions can be turned into LEA instructions but only
3990  // if their flags aren't used. Avoid transforming such instructions.
3991  case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
3992  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
3993  if (isTruncatedShiftCountForLEA(ShAmt)) return false;
3994  return ShAmt != 0;
3995  }
3996 
3997  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
3998  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
3999  return getTruncatedShiftCount(MI, 3) != 0;
4000 
4001  case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
4002  case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
4003  case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
4004  case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
4005  case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
4006  case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
4007  case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
4008  case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
4009  case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
4010  case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
4011  case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
4012  case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
4013  case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
4014  case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8:
4015  case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr:
4016  case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm:
4017  case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm:
4018  case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri:
4019  case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8:
4020  case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr:
4021  case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm:
4022  case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm:
4023  case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
4024  case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
4025  case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
4026  case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
4027  case X86::LZCNT16rr: case X86::LZCNT16rm:
4028  case X86::LZCNT32rr: case X86::LZCNT32rm:
4029  case X86::LZCNT64rr: case X86::LZCNT64rm:
4030  case X86::POPCNT16rr:case X86::POPCNT16rm:
4031  case X86::POPCNT32rr:case X86::POPCNT32rm:
4032  case X86::POPCNT64rr:case X86::POPCNT64rm:
4033  case X86::TZCNT16rr: case X86::TZCNT16rm:
4034  case X86::TZCNT32rr: case X86::TZCNT32rm:
4035  case X86::TZCNT64rr: case X86::TZCNT64rm:
4036  return true;
4037  case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
4038  case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
4039  case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
4040  case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
4041  case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
4042  case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
4043  case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
4044  case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
4045  case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
4046  case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
4047  case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
4048  case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
4049  case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
4050  case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
4051  case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
4052  case X86::ANDN32rr: case X86::ANDN32rm:
4053  case X86::ANDN64rr: case X86::ANDN64rm:
4054  case X86::BLSI32rr: case X86::BLSI32rm:
4055  case X86::BLSI64rr: case X86::BLSI64rm:
4056  case X86::BLSMSK32rr: case X86::BLSMSK32rm:
4057  case X86::BLSMSK64rr: case X86::BLSMSK64rm:
4058  case X86::BLSR32rr: case X86::BLSR32rm:
4059  case X86::BLSR64rr: case X86::BLSR64rm:
4060  case X86::BLCFILL32rr: case X86::BLCFILL32rm:
4061  case X86::BLCFILL64rr: case X86::BLCFILL64rm:
4062  case X86::BLCI32rr: case X86::BLCI32rm:
4063  case X86::BLCI64rr: case X86::BLCI64rm:
4064  case X86::BLCIC32rr: case X86::BLCIC32rm:
4065  case X86::BLCIC64rr: case X86::BLCIC64rm:
4066  case X86::BLCMSK32rr: case X86::BLCMSK32rm:
4067  case X86::BLCMSK64rr: case X86::BLCMSK64rm:
4068  case X86::BLCS32rr: case X86::BLCS32rm:
4069  case X86::BLCS64rr: case X86::BLCS64rm:
4070  case X86::BLSFILL32rr: case X86::BLSFILL32rm:
4071  case X86::BLSFILL64rr: case X86::BLSFILL64rm:
4072  case X86::BLSIC32rr: case X86::BLSIC32rm:
4073  case X86::BLSIC64rr: case X86::BLSIC64rm:
4074  case X86::BZHI32rr: case X86::BZHI32rm:
4075  case X86::BZHI64rr: case X86::BZHI64rm:
4076  case X86::T1MSKC32rr: case X86::T1MSKC32rm:
4077  case X86::T1MSKC64rr: case X86::T1MSKC64rm:
4078  case X86::TZMSK32rr: case X86::TZMSK32rm:
4079  case X86::TZMSK64rr: case X86::TZMSK64rm:
4080  // These instructions clear the overflow flag just like TEST.
4081  // FIXME: These are not the only instructions in this switch that clear the
4082  // overflow flag.
4083  ClearsOverflowFlag = true;
4084  return true;
4085  case X86::BEXTR32rr: case X86::BEXTR64rr:
4086  case X86::BEXTR32rm: case X86::BEXTR64rm:
4087  case X86::BEXTRI32ri: case X86::BEXTRI32mi:
4088  case X86::BEXTRI64ri: case X86::BEXTRI64mi:
4089  // BEXTR doesn't update the sign flag so we can't use it. It does clear
4090  // the overflow flag, but that's not useful without the sign flag.
4091  NoSignFlag = true;
4092  return true;
4093  }
4094 }
4095 
4096 /// Check whether the use can be converted to remove a comparison against zero.
4097 static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
4098  switch (MI.getOpcode()) {
4099  default: return X86::COND_INVALID;
4100  case X86::NEG8r:
4101  case X86::NEG16r:
4102  case X86::NEG32r:
4103  case X86::NEG64r:
4104  return X86::COND_AE;
4105  case X86::LZCNT16rr:
4106  case X86::LZCNT32rr:
4107  case X86::LZCNT64rr:
4108  return X86::COND_B;
4109  case X86::POPCNT16rr:
4110  case X86::POPCNT32rr:
4111  case X86::POPCNT64rr:
4112  return X86::COND_E;
4113  case X86::TZCNT16rr:
4114  case X86::TZCNT32rr:
4115  case X86::TZCNT64rr:
4116  return X86::COND_B;
4117  case X86::BSF16rr:
4118  case X86::BSF32rr:
4119  case X86::BSF64rr:
4120  case X86::BSR16rr:
4121  case X86::BSR32rr:
4122  case X86::BSR64rr:
4123  return X86::COND_E;
4124  case X86::BLSI32rr:
4125  case X86::BLSI64rr:
4126  return X86::COND_AE;
4127  case X86::BLSR32rr:
4128  case X86::BLSR64rr:
4129  case X86::BLSMSK32rr:
4130  case X86::BLSMSK64rr:
4131  return X86::COND_B;
4132  // TODO: TBM instructions.
4133  }
4134 }
4135 
4136 /// Check if there exists an earlier instruction that
4137 /// operates on the same source operands and sets flags in the same way as
4138 /// Compare; remove Compare if possible.
4139 bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
4140  Register SrcReg2, int CmpMask,
4141  int CmpValue,
4142  const MachineRegisterInfo *MRI) const {
4143  // Check whether we can replace SUB with CMP.
4144  switch (CmpInstr.getOpcode()) {
4145  default: break;
4146  case X86::SUB64ri32:
4147  case X86::SUB64ri8:
4148  case X86::SUB32ri:
4149  case X86::SUB32ri8:
4150  case X86::SUB16ri:
4151  case X86::SUB16ri8:
4152  case X86::SUB8ri:
4153  case X86::SUB64rm:
4154  case X86::SUB32rm:
4155  case X86::SUB16rm:
4156  case X86::SUB8rm:
4157  case X86::SUB64rr:
4158  case X86::SUB32rr:
4159  case X86::SUB16rr:
4160  case X86::SUB8rr: {
4161  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
4162  return false;
4163  // There is no use of the destination register, so we can replace SUB with CMP.
4164  unsigned NewOpcode = 0;
4165  switch (CmpInstr.getOpcode()) {
4166  default: llvm_unreachable("Unreachable!");
4167  case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
4168  case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
4169  case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
4170  case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
4171  case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
4172  case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
4173  case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
4174  case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
4175  case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
4176  case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
4177  case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
4178  case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
4179  case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
4180  case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
4181  case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
4182  }
4183  CmpInstr.setDesc(get(NewOpcode));
4184  CmpInstr.RemoveOperand(0);
4185  // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
4186  if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
4187  NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
4188  return false;
4189  }
4190  }
4191 
4192  // Get the unique definition of SrcReg.
4193  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
4194  if (!MI) return false;
4195 
4196  // CmpInstr is the first instruction of the BB.
4197  MachineBasicBlock::iterator I = CmpInstr, Def = MI;
4198 
4199  // If we are comparing against zero, check whether we can use MI to update
4200  // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
4201  bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
4202  if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
4203  return false;
4204 
4205  // If we have a use of the source register between the def and our compare
4206  // instruction we can eliminate the compare iff the use sets EFLAGS in the
4207  // right way.
4208  bool ShouldUpdateCC = false;
4209  bool NoSignFlag = false;
4210  bool ClearsOverflowFlag = false;
4211  X86::CondCode NewCC = X86::COND_INVALID;
4212  if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag, ClearsOverflowFlag)) {
4213  // Scan forward from the use until we hit the use we're looking for or the
4214  // compare instruction.
4215  for (MachineBasicBlock::iterator J = MI;; ++J) {
4216  // Do we have a convertible instruction?
4217  NewCC = isUseDefConvertible(*J);
4218  if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
4219  J->getOperand(1).getReg() == SrcReg) {
4220  assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
4221  ShouldUpdateCC = true; // Update CC later on.
4222  // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
4223  // with the new def.
4224  Def = J;
4225  MI = &*Def;
4226  break;
4227  }
4228 
4229  if (J == I)
4230  return false;
4231  }
4232  }
4233 
4234  // We are searching for an earlier instruction that can make CmpInstr
4235  // redundant and that instruction will be saved in Sub.
4236  MachineInstr *Sub = nullptr;
4237  const TargetRegisterInfo *TRI = &getRegisterInfo();
4238 
4239  // We iterate backward, starting from the instruction before CmpInstr and
4240  // stop when reaching the definition of a source register or done with the BB.
4241  // RI points to the instruction before CmpInstr.
4242  // If the definition is in this basic block, RE points to the definition;
4243  // otherwise, RE is the rend of the basic block.
4244  MachineBasicBlock::reverse_iterator
4245  RI = ++I.getReverse(),
4246  RE = CmpInstr.getParent() == MI->getParent()
4247  ? Def.getReverse() /* points to MI */
4248  : CmpInstr.getParent()->rend();
4249  MachineInstr *Movr0Inst = nullptr;
4250  for (; RI != RE; ++RI) {
4251  MachineInstr &Instr = *RI;
4252  // Check whether CmpInstr can be made redundant by the current instruction.
4253  if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
4254  CmpValue, Instr)) {
4255  Sub = &Instr;
4256  break;
4257  }
4258 
4259  if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
4260  Instr.readsRegister(X86::EFLAGS, TRI)) {
4261  // This instruction modifies or uses EFLAGS.
4262 
4263  // MOV32r0 etc. are implemented with xor which clobbers condition code.
4264  // They are safe to move up, if the definition to EFLAGS is dead and
4265  // earlier instructions do not read or write EFLAGS.
4266  if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
4267  Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
4268  Movr0Inst = &Instr;
4269  continue;
4270  }
4271 
4272  // We can't remove CmpInstr.
4273  return false;
4274  }
4275  }
4276 
4277  // Return false if no candidates exist.
4278  if (!IsCmpZero && !Sub)
4279  return false;
4280 
4281  bool IsSwapped =
4282  (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 &&
4283  Sub->getOperand(2).getReg() == SrcReg);
4284 
4285  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
4286  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
4287  // If we are done with the basic block, we need to check whether EFLAGS is
4288  // live-out.
4289  bool IsSafe = false;
4290  SmallVector<std::pair<MachineInstr *, X86::CondCode>, 4> OpsToUpdate;
4291  MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
4292  for (++I; I != E; ++I) {
4293  const MachineInstr &Instr = *I;
4294  bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
4295  bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
4296  // We should check the usage if this instruction uses and updates EFLAGS.
4297  if (!UseEFLAGS && ModifyEFLAGS) {
4298  // It is safe to remove CmpInstr if EFLAGS is updated again.
4299  IsSafe = true;
4300  break;
4301  }
4302  if (!UseEFLAGS && !ModifyEFLAGS)
4303  continue;
4304 
4305  // EFLAGS is used by this instruction.
4306  X86::CondCode OldCC = X86::COND_INVALID;
4307  if (IsCmpZero || IsSwapped) {
4308  // We decode the condition code from opcode.
4309  if (Instr.isBranch())
4310  OldCC = X86::getCondFromBranch(Instr);
4311  else {
4312  OldCC = X86::getCondFromSETCC(Instr);
4313  if (OldCC == X86::COND_INVALID)
4314  OldCC = X86::getCondFromCMov(Instr);
4315  }
4316  if (OldCC == X86::COND_INVALID) return false;
4317  }
4318  X86::CondCode ReplacementCC = X86::COND_INVALID;
4319  if (IsCmpZero) {
4320  switch (OldCC) {
4321  default: break;
4322  case X86::COND_A: case X86::COND_AE:
4323  case X86::COND_B: case X86::COND_BE:
4324  // CF is used, we can't perform this optimization.
4325  return false;
4326  case X86::COND_G: case X86::COND_GE:
4327  case X86::COND_L: case X86::COND_LE:
4328  case X86::COND_O: case X86::COND_NO:
4329  // If OF is used, the instruction needs to clear it like CmpZero does.
4330  if (!ClearsOverflowFlag)
4331  return false;
4332  break;
4333  case X86::COND_S: case X86::COND_NS:
4334  // If SF is used, but the instruction doesn't update the SF, then we
4335  // can't do the optimization.
4336  if (NoSignFlag)
4337  return false;
4338  break;
4339  }
4340 
4341  // If we're updating the condition code check if we have to reverse the
4342  // condition.
4343  if (ShouldUpdateCC)
4344  switch (OldCC) {
4345  default:
4346  return false;
4347  case X86::COND_E:
4348  ReplacementCC = NewCC;
4349  break;
4350  case X86::COND_NE:
4351  ReplacementCC = GetOppositeBranchCondition(NewCC);
4352  break;
4353  }
4354  } else if (IsSwapped) {
4355  // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
4356  // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
4357  // We swap the condition code and synthesize the new opcode.
4358  ReplacementCC = getSwappedCondition(OldCC);
4359  if (ReplacementCC == X86::COND_INVALID) return false;
4360  }
4361 
4362  if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
4363  // Push the MachineInstr to OpsToUpdate.
4364  // If it is safe to remove CmpInstr, the condition code of these
4365  // instructions will be modified.
4366  OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC));
4367  }
4368  if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
4369  // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
4370  IsSafe = true;
4371  break;
4372  }
4373  }
4374 
4375  // If EFLAGS is not killed nor re-defined, we should check whether it is
4376  // live-out. If it is live-out, do not optimize.
4377  if ((IsCmpZero || IsSwapped) && !IsSafe) {
4378  MachineBasicBlock *MBB = CmpInstr.getParent();
4379  for (MachineBasicBlock *Successor : MBB->successors())
4380  if (Successor->isLiveIn(X86::EFLAGS))
4381  return false;
4382  }
4383 
4384  // The instruction to be updated is either Sub or MI.
4385  Sub = IsCmpZero ? MI : Sub;
4386  // Move Movr0Inst to the appropriate place before Sub.
4387  if (Movr0Inst) {
4388  // Look backwards until we find a def that doesn't use the current EFLAGS.
4389  Def = Sub;
4390  MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
4391  InsertE = Sub->getParent()->rend();
4392  for (; InsertI != InsertE; ++InsertI) {
4393  MachineInstr *Instr = &*InsertI;
4394  if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
4395  Instr->modifiesRegister(X86::EFLAGS, TRI)) {
4396  Sub->getParent()->remove(Movr0Inst);
4397  Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
4398  Movr0Inst);
4399  break;
4400  }
4401  }
4402  if (InsertI == InsertE)
4403  return false;
4404  }
4405 
4406  // Make sure Sub instruction defines EFLAGS and mark the def live.
4407  MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
4408  assert(FlagDef && "Unable to locate a def EFLAGS operand");
4409  FlagDef->setIsDead(false);
4410 
4411  CmpInstr.eraseFromParent();
4412 
4413  // Modify the condition code of instructions in OpsToUpdate.
4414  for (auto &Op : OpsToUpdate) {
4415  Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
4416  .setImm(Op.second);
4417  }
4418  return true;
4419 }
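// Editor's note (illustrative sketch of the transformation above): given
//   subl %esi, %edi
//   cmpl %esi, %edi
//   jl   <target>
// the CMP is erased because the SUB already produced the needed flags; if the
// operands of the CMP are swapped relative to the SUB, the branch or cmov
// condition is rewritten via getSwappedCondition instead.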
4420 
4421 /// Try to remove the load by folding it to a register
4422 /// operand at the use. We fold the load instructions if the load defines a virtual
4423 /// register, the virtual register is used once in the same BB, and the
4424 /// instructions in-between do not load or store, and have no side effects.
4425 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
4426  const MachineRegisterInfo *MRI,
4427  Register &FoldAsLoadDefReg,
4428  MachineInstr *&DefMI) const {
4429  // Check whether we can move DefMI here.
4430  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
4431  assert(DefMI);
4432  bool SawStore = false;
4433  if (!DefMI->isSafeToMove(nullptr, SawStore))
4434  return nullptr;
4435 
4436  // Collect information about virtual register operands of MI.
4437  SmallVector<unsigned, 1> SrcOperandIds;
4438  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4439  MachineOperand &MO = MI.getOperand(i);
4440  if (!MO.isReg())
4441  continue;
4442  Register Reg = MO.getReg();
4443  if (Reg != FoldAsLoadDefReg)
4444  continue;
4445  // Do not fold if we have a subreg use or a def.
4446  if (MO.getSubReg() || MO.isDef())
4447  return nullptr;
4448  SrcOperandIds.push_back(i);
4449  }
4450  if (SrcOperandIds.empty())
4451  return nullptr;
4452 
4453  // Check whether we can fold the def into SrcOperandId.
4454  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
4455  FoldAsLoadDefReg = 0;
4456  return FoldMI;
4457  }
4458 
4459  return nullptr;
4460 }
4461 
4462 /// Expand a single-def pseudo instruction to a two-addr
4463 /// instruction with two undef reads of the register being defined.
4464 /// This is used for mapping:
4465 /// %xmm4 = V_SET0
4466 /// to:
4467 /// %xmm4 = PXORrr undef %xmm4, undef %xmm4
4468 ///
4469 static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
4470  const MCInstrDesc &Desc) {
4471  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4472  Register Reg = MIB.getReg(0);
4473  MIB->setDesc(Desc);
4474 
4475  // MachineInstr::addOperand() will insert explicit operands before any
4476  // implicit operands.
4477  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4478  // But we don't trust that.
4479  assert(MIB.getReg(1) == Reg &&
4480  MIB.getReg(2) == Reg && "Misplaced operand");
4481  return true;
4482 }
4483 
4484 /// Expand a single-def pseudo instruction to a two-addr
4485 /// instruction with two %k0 reads.
4486 /// This is used for mapping:
4487 /// %k4 = K_SET1
4488 /// to:
4489 /// %k4 = KXNORrr %k0, %k0
4490 static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
4491  Register Reg) {
4492  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4493  MIB->setDesc(Desc);
4494  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4495  return true;
4496 }
4497 
4498 static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
4499  bool MinusOne) {
4500  MachineBasicBlock &MBB = *MIB->getParent();
4501  DebugLoc DL = MIB->getDebugLoc();
4502  Register Reg = MIB.getReg(0);
4503 
4504  // Insert the XOR.
4505  BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
4506  .addReg(Reg, RegState::Undef)
4507  .addReg(Reg, RegState::Undef);
4508 
4509  // Turn the pseudo into an INC or DEC.
4510  MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
4511  MIB.addReg(Reg);
4512 
4513  return true;
4514 }
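// Editor's note (illustrative): MOV32r1 and MOV32r_1 of %eax expand to
//   xorl %eax, %eax
//   incl %eax            (decl %eax for the -1 case)
// which is shorter than materializing the constant with a MOV32ri.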
4515 
4516 static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
4517  const TargetInstrInfo &TII,
4518  const X86Subtarget &Subtarget) {
4519  MachineBasicBlock &MBB = *MIB->getParent();
4520  DebugLoc DL = MIB->getDebugLoc();
4521  int64_t Imm = MIB->getOperand(1).getImm();
4522  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
4523  MachineBasicBlock::iterator I = MIB.getInstr();
4524 
4525  int StackAdjustment;
4526 
4527  if (Subtarget.is64Bit()) {
4528  assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
4529  MIB->getOpcode() == X86::MOV32ImmSExti8);
4530 
4531  // Can't use push/pop lowering if the function might write to the red zone.
4532  X86MachineFunctionInfo *X86FI =
4533  MBB.getParent()->getInfo<X86MachineFunctionInfo>();
4534  if (X86FI->getUsesRedZone()) {
4535  MIB->setDesc(TII.get(MIB->getOpcode() ==
4536  X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
4537  return true;
4538  }
4539 
4540  // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
4541  // widen the register if necessary.
4542  StackAdjustment = 8;
4543  BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
4544  MIB->setDesc(TII.get(X86::POP64r));
4545  MIB->getOperand(0)
4546  .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
4547  } else {
4548  assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
4549  StackAdjustment = 4;
4550  BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
4551  MIB->setDesc(TII.get(X86::POP32r));
4552  }
4553  MIB->RemoveOperand(1);
4554  MIB->addImplicitDefUseOperands(*MBB.getParent());
4555 
4556  // Build CFI if necessary.
4557  MachineFunction &MF = *MBB.getParent();
4558  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
4559  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
4560  bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
4561  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
4562  if (EmitCFI) {
4563  TFL->BuildCFI(MBB, I, DL,
4564  MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
4565  TFL->BuildCFI(MBB, std::next(I), DL,
4566  MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
4567  }
4568 
4569  return true;
4570 }
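// Editor's note (illustrative): on x86-64, when the red zone is not in use,
//   movq $-1, %rax
// is shrunk by the expansion above to
//   pushq $-1
//   popq  %rax
// with CFI adjustments emitted around the temporary stack pointer change.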
4571 
4572 // LoadStackGuard has so far only been implemented for 64-bit MachO. Different
4573 // code sequence is needed for other targets.
4574 static void expandLoadStackGuard(MachineInstrBuilder &MIB,
4575  const TargetInstrInfo &TII) {
4576  MachineBasicBlock &MBB = *MIB->getParent();
4577  DebugLoc DL = MIB->getDebugLoc();
4578  Register Reg = MIB.getReg(0);
4579  const GlobalValue *GV =
4580  cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
4581  auto Flags = MachineMemOperand::MOLoad |
4582  MachineMemOperand::MODereferenceable |
4583  MachineMemOperand::MOInvariant;
4584  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
4585  MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
4586  MachineBasicBlock::iterator I = MIB.getInstr();
4587 
4588  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
4589  .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
4590  .addMemOperand(MMO);
4591  MIB->setDebugLoc(DL);
4592  MIB->setDesc(TII.get(X86::MOV64rm));
4593  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
4594 }
4595 
4596 static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
4597  MachineBasicBlock &MBB = *MIB->getParent();
4598  MachineFunction &MF = *MBB.getParent();
4599  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
4600  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4601  unsigned XorOp =
4602  MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
4603  MIB->setDesc(TII.get(XorOp));
4604  MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
4605  return true;
4606 }
4607 
4608 // This is used to handle spills for 128/256-bit registers when we have AVX512,
4609 // but not VLX. If it uses an extended register we need to use an instruction
4610 // that loads the lower 128/256-bit, but is available with only AVX512F.
4611 static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
4612  const TargetRegisterInfo *TRI,
4613  const MCInstrDesc &LoadDesc,
4614  const MCInstrDesc &BroadcastDesc,
4615  unsigned SubIdx) {
4616  Register DestReg = MIB.getReg(0);
4617  // Check if DestReg is XMM16-31 or YMM16-31.
4618  if (TRI->getEncodingValue(DestReg) < 16) {
4619  // We can use a normal VEX encoded load.
4620  MIB->setDesc(LoadDesc);
4621  } else {
4622  // Use a 128/256-bit VBROADCAST instruction.
4623  MIB->setDesc(BroadcastDesc);
4624  // Change the destination to a 512-bit register.
4625  DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
4626  MIB->getOperand(0).setReg(DestReg);
4627  }
4628  return true;
4629 }
4630 
4631 // This is used to handle spills for 128/256-bit registers when we have AVX512,
4632 // but not VLX. If it uses an extended register we need to use an instruction
4633 // that stores the lower 128/256-bit, but is available with only AVX512F.
4634 static bool expandNOVLXStore(MachineInstrBuilder &MIB,
4635  const TargetRegisterInfo *TRI,
4636  const MCInstrDesc &StoreDesc,
4637  const MCInstrDesc &ExtractDesc,
4638  unsigned SubIdx) {
4639  Register SrcReg = MIB.getReg(X86::AddrNumOperands);
4640  // Check if DestReg is XMM16-31 or YMM16-31.
4641  if (TRI->getEncodingValue(SrcReg) < 16) {
4642  // We can use a normal VEX encoded store.
4643  MIB->setDesc(StoreDesc);
4644  } else {
4645  // Use a VEXTRACTF instruction.
4646  MIB->setDesc(ExtractDesc);
4647  // Change the destination to a 512-bit register.
4648  SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
4649  MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
4650  MIB.addImm(0x0); // Append immediate to extract from the lower bits.
4651  }
4652 
4653  return true;
4654 }
4655 
4656 static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
4657  MIB->setDesc(Desc);
4658  int64_t ShiftAmt = MIB->getOperand(2).getImm();
4659  // Temporarily remove the immediate so we can add another source register.
4660  MIB->RemoveOperand(2);
4661  // Add the register. Don't copy the kill flag if there is one.
4662  MIB.addReg(MIB.getReg(1),
4663  getUndefRegState(MIB->getOperand(1).isUndef()));
4664  // Add back the immediate.
4665  MIB.addImm(ShiftAmt);
4666  return true;
4667 }
4668 
4669 bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
4670  bool HasAVX = Subtarget.hasAVX();
4671  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
4672  switch (MI.getOpcode()) {
4673  case X86::MOV32r0:
4674  return Expand2AddrUndef(MIB, get(X86::XOR32rr));
4675  case X86::MOV32r1:
4676  return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
4677  case X86::MOV32r_1:
4678  return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
4679  case X86::MOV32ImmSExti8:
4680  case X86::MOV64ImmSExti8:
4681  return ExpandMOVImmSExti8(MIB, *this, Subtarget);
4682  case X86::SETB_C32r:
4683  return Expand2AddrUndef(MIB, get(X86::SBB32rr));
4684  case X86::SETB_C64r:
4685  return Expand2AddrUndef(MIB, get(X86::SBB64rr));
4686  case X86::MMX_SET0:
4687  return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr));
4688  case X86::V_SET0:
4689  case X86::FsFLD0SS:
4690  case X86::FsFLD0SD:
4691  case X86::FsFLD0F128:
4692  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
4693  case X86::AVX_SET0: {
4694  assert(HasAVX && "AVX not supported");
4695  const TargetRegisterInfo *TRI = &getRegisterInfo();
4696  Register SrcReg = MIB.getReg(0);
4697  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4698  MIB->getOperand(0).setReg(XReg);
4699  Expand2AddrUndef(MIB, get(X86::VXORPSrr));
4700  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4701  return true;
4702  }
4703  case X86::AVX512_128_SET0:
4704  case X86::AVX512_FsFLD0SS:
4705  case X86::AVX512_FsFLD0SD:
4706  case X86::AVX512_FsFLD0F128: {
4707  bool HasVLX = Subtarget.hasVLX();
4708  Register SrcReg = MIB.getReg(0);
4709  const TargetRegisterInfo *TRI = &getRegisterInfo();
4710  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
4711  return Expand2AddrUndef(MIB,
4712  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4713  // Extended register without VLX. Use a larger XOR.
4714  SrcReg =
4715  TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4716  MIB->getOperand(0).setReg(SrcReg);
4717  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4718  }
4719  case X86::AVX512_256_SET0:
4720  case X86::AVX512_512_SET0: {
4721  bool HasVLX = Subtarget.hasVLX();
4722  Register SrcReg = MIB.getReg(0);
4723  const TargetRegisterInfo *TRI = &getRegisterInfo();
4724  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
4725  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4726  MIB->getOperand(0).setReg(XReg);
4727  Expand2AddrUndef(MIB,
4728  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4729  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4730  return true;
4731  }
4732  if (MI.getOpcode() == X86::AVX512_256_SET0) {
4733  // No VLX so we must reference a zmm.
4734  unsigned ZReg =
4735  TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4736  MIB->getOperand(0).setReg(ZReg);
4737  }
4738  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4739  }
4740  case X86::V_SETALLONES:
4741  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
4742  case X86::AVX2_SETALLONES:
4743  return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
4744  case X86::AVX1_SETALLONES: {
4745  Register Reg = MIB.getReg(0);
4746  // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
4747  MIB->setDesc(get(X86::VCMPPSYrri));
4748  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
4749  return true;
4750  }
4751  case X86::AVX512_512_SETALLONES: {
4752  Register Reg = MIB.getReg(0);
4753  MIB->setDesc(get(X86::VPTERNLOGDZrri));
4754  // VPTERNLOGD needs 3 register inputs and an immediate.
4755  // 0xff will return 1s for any input.
4756  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
4757  .addReg(Reg, RegState::Undef).addImm(0xff);
4758  return true;
4759  }
4760  case X86::AVX512_512_SEXT_MASK_32:
4761  case X86::AVX512_512_SEXT_MASK_64: {
4762  Register Reg = MIB.getReg(0);
4763  Register MaskReg = MIB.getReg(1);
4764  unsigned MaskState = getRegState(MIB->getOperand(1));
4765  unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
4766  X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
4767  MI.RemoveOperand(1);
4768  MIB->setDesc(get(Opc));
4769  // VPTERNLOG needs 3 register inputs and an immediate.
4770  // 0xff will return 1s for any input.
4771  MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
4772  .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
4773  return true;
4774  }
4775  case X86::VMOVAPSZ128rm_NOVLX:
4776  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
4777  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4778  case X86::VMOVUPSZ128rm_NOVLX:
4779  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
4780  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4781  case X86::VMOVAPSZ256rm_NOVLX:
4782  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
4783  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4784  case X86::VMOVUPSZ256rm_NOVLX:
4785  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
4786  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4787  case X86::VMOVAPSZ128mr_NOVLX:
4788  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
4789  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4790  case X86::VMOVUPSZ128mr_NOVLX:
4791  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
4792  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4793  case X86::VMOVAPSZ256mr_NOVLX:
4794  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
4795  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4796  case X86::VMOVUPSZ256mr_NOVLX:
4797  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
4798  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4799  case X86::MOV32ri64: {
4800  Register Reg = MIB.getReg(0);
4801  Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
4802  MI.setDesc(get(X86::MOV32ri));
4803  MIB->getOperand(0).setReg(Reg32);
4804  MIB.addReg(Reg, RegState::ImplicitDefine);
4805  return true;
4806  }
4807 
4808  // KNL does not recognize dependency-breaking idioms for mask registers,
4809  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
4810  // Using %k0 as the undef input register is a performance heuristic based
4811  // on the assumption that %k0 is used less frequently than the other mask
4812  // registers, since it is not usable as a write mask.
4813  // FIXME: A more advanced approach would be to choose the best input mask
4814  // register based on context.
4815  case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
4816  case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
4817  case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
4818  case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
4819  case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
4820  case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
4821  case TargetOpcode::LOAD_STACK_GUARD:
4822  expandLoadStackGuard(MIB, *this);
4823  return true;
4824  case X86::XOR64_FP:
4825  case X86::XOR32_FP:
4826  return expandXorFP(MIB, *this);
4827  case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
4828  case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
4829  case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
4830  case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
4831  case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break;
4832  case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break;
4833  case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break;
4834  case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break;
4835  case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break;
4836  case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break;
4837  case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break;
4838  case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
4839  case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break;
4840  case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break;
4841  case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break;
4842  }
4843  return false;
4844 }
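// Illustrative sketch only (assumed AT&T syntax and arbitrary destination
// registers, not taken from the source): the dependency-breaking pseudos
// handled above lower to roughly
//
//   AVX1_SETALLONES       -> vcmptrueps %ymm0, %ymm0, %ymm0        (imm 0xf)
//   AVX512_512_SETALLONES -> vpternlogd $0xff, %zmm0, %zmm0, %zmm0
//   KSET0W                -> kxorw  %k0, %k0, %k1
//   KSET1W                -> kxnorw %k0, %k0, %k1
//
// with the source operands marked undef so no real dependence is created.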
4845 
4846 /// Return true for all instructions that only update
4847 /// the first 32 or 64 bits of the destination register and leave the rest
4848 /// unmodified. This can be used to avoid folding loads if the instructions
4849 /// only update part of the destination register, and the non-updated part is
4850 /// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
4851 /// instructions breaks the partial register dependency, which can improve
4852 /// performance. e.g.:
4853 ///
4854 /// movss (%rdi), %xmm0
4855 /// cvtss2sd %xmm0, %xmm0
4856 ///
4857 /// Instead of
4858 /// cvtss2sd (%rdi), %xmm0
4859 ///
4860 /// FIXME: This should be turned into a TSFlags.
4861 ///
4862 static bool hasPartialRegUpdate(unsigned Opcode,
4863  const X86Subtarget &Subtarget,
4864  bool ForLoadFold = false) {
4865  switch (Opcode) {
4866  case X86::CVTSI2SSrr:
4867  case X86::CVTSI2SSrm:
4868  case X86::CVTSI642SSrr:
4869  case X86::CVTSI642SSrm:
4870  case X86::CVTSI2SDrr:
4871  case X86::CVTSI2SDrm:
4872  case X86::CVTSI642SDrr:
4873  case X86::CVTSI642SDrm:
4874  // Load folding won't affect the undef register update since the input is
4875  // a GPR.
4876  return !ForLoadFold;
4877  case X86::CVTSD2SSrr:
4878  case X86::CVTSD2SSrm:
4879  case X86::CVTSS2SDrr:
4880  case X86::CVTSS2SDrm:
4881  case X86::MOVHPDrm:
4882  case X86::MOVHPSrm:
4883  case X86::MOVLPDrm:
4884  case X86::MOVLPSrm:
4885  case X86::RCPSSr:
4886  case X86::RCPSSm:
4887  case X86::RCPSSr_Int:
4888  case X86::RCPSSm_Int:
4889  case X86::ROUNDSDr:
4890  case X86::ROUNDSDm:
4891  case X86::ROUNDSSr:
4892  case X86::ROUNDSSm:
4893  case X86::RSQRTSSr:
4894  case X86::RSQRTSSm:
4895  case X86::RSQRTSSr_Int:
4896  case X86::RSQRTSSm_Int:
4897  case X86::SQRTSSr:
4898  case X86::SQRTSSm:
4899  case X86::SQRTSSr_Int:
4900  case X86::SQRTSSm_Int:
4901  case X86::SQRTSDr:
4902  case X86::SQRTSDm:
4903  case X86::SQRTSDr_Int:
4904  case X86::SQRTSDm_Int:
4905  return true;
4906  // GPR
4907  case X86::POPCNT32rm:
4908  case X86::POPCNT32rr:
4909  case X86::POPCNT64rm:
4910  case X86::POPCNT64rr:
4911  return Subtarget.hasPOPCNTFalseDeps();
4912  case X86::LZCNT32rm:
4913  case X86::LZCNT32rr:
4914  case X86::LZCNT64rm:
4915  case X86::LZCNT64rr:
4916  case X86::TZCNT32rm:
4917  case X86::TZCNT32rr:
4918  case X86::TZCNT64rm:
4919  case X86::TZCNT64rr:
4920  return Subtarget.hasLZCNTFalseDeps();
4921  }
4922 
4923  return false;
4924 }
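// Illustrative sketch (assumed AT&T syntax and register numbers) of the
// false dependence this list guards against:
//
//   sqrtss %xmm1, %xmm0   ; only xmm0[31:0] is written, so the old upper bits
//                         ; of xmm0 must be merged in, making this wait on
//                         ; whatever last wrote xmm0 even if those bits are
//                         ; dead
//
// Breaking the dependence with a zeroing idiom (see breakPartialRegDependency
// below) or unfolding the load as in the cvtss2sd example above avoids the
// stall.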
4925 
4926 /// Inform the BreakFalseDeps pass how many idle
4927 /// instructions we would like before a partial register update.
4928 unsigned X86InstrInfo::getPartialRegUpdateClearance(
4929  const MachineInstr &MI, unsigned OpNum,
4930  const TargetRegisterInfo *TRI) const {
4931  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
4932  return 0;
4933 
4934  // If MI is marked as reading Reg, the partial register update is wanted.
4935  const MachineOperand &MO = MI.getOperand(0);
4936  Register Reg = MO.getReg();
4937  if (Reg.isVirtual()) {
4938  if (MO.readsReg() || MI.readsVirtualRegister(Reg))
4939  return 0;
4940  } else {
4941  if (MI.readsRegister(Reg, TRI))
4942  return 0;
4943  }
4944 
4945  // If any instructions in the clearance range are reading Reg, insert a
4946  // dependency breaking instruction, which is inexpensive and is likely to
4947  // be hidden in other instructions' cycles.
4948  return PartialRegUpdateClearance;
4949 }
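// Sketch of how the returned value is interpreted (a reading of the
// BreakFalseDeps contract, not a definition of that pass): a return of 0 means
// "leave this instruction alone"; a return of PartialRegUpdateClearance asks
// the pass to check whether the destination was written within that many
// instructions and, if so, to call breakPartialRegDependency (below) to insert
// a cheap zeroing idiom first.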
4950 
4951 // Return true for any instruction that copies the high bits of the first source
4952 // operand into the unused high bits of the destination operand.
4953 // Also returns true for instructions that have two inputs where one may
4954 // be undef and we want it to use the same register as the other input.
4955 static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
4956  bool ForLoadFold = false) {
4957  // Set the OpNum parameter to the first source operand.
4958  switch (Opcode) {
4959  case X86::MMX_PUNPCKHBWirr:
4960  case X86::MMX_PUNPCKHWDirr:
4961  case X86::MMX_PUNPCKHDQirr:
4962  case X86::MMX_PUNPCKLBWirr:
4963  case X86::MMX_PUNPCKLWDirr:
4964  case X86::MMX_PUNPCKLDQirr:
4965  case X86::MOVHLPSrr:
4966  case X86::PACKSSWBrr:
4967  case X86::PACKUSWBrr:
4968  case X86::PACKSSDWrr:
4969  case X86::PACKUSDWrr:
4970  case X86::PUNPCKHBWrr:
4971  case X86::PUNPCKLBWrr:
4972  case X86::PUNPCKHWDrr:
4973  case X86::PUNPCKLWDrr:
4974  case X86::PUNPCKHDQrr:
4975  case X86::PUNPCKLDQrr:
4976  case X86::PUNPCKHQDQrr:
4977  case X86::PUNPCKLQDQrr:
4978  case X86::SHUFPDrri:
4979  case X86::SHUFPSrri:
4980  // These instructions are sometimes used with an undef first or second
4981  // source. Return true here so BreakFalseDeps will assign this source to the
4982  // same register as the first source to avoid a false dependency.
4983  // Operand 1 of these instructions is tied so they're separate from their
4984  // VEX counterparts.
4985  return OpNum == 2 && !ForLoadFold;
4986 
4987  case X86::VMOVLHPSrr:
4988  case X86::VMOVLHPSZrr:
4989  case X86::VPACKSSWBrr:
4990  case X86::VPACKUSWBrr:
4991  case X86::VPACKSSDWrr:
4992  case X86::VPACKUSDWrr:
4993  case X86::VPACKSSWBZ128rr:
4994  case X86::VPACKUSWBZ128rr:
4995  case X86::VPACKSSDWZ128rr:
4996  case X86::VPACKUSDWZ128rr:
4997  case X86::VPERM2F128rr:
4998  case X86::VPERM2I128rr:
4999  case X86::VSHUFF32X4Z256rri:
5000  case X86::VSHUFF32X4Zrri:
5001  case X86::VSHUFF64X2Z256rri:
5002  case X86::VSHUFF64X2Zrri:
5003  case X86::VSHUFI32X4Z256rri:
5004  case X86::VSHUFI32X4Zrri:
5005  case X86::VSHUFI64X2Z256rri:
5006  case X86::VSHUFI64X2Zrri:
5007  case X86::VPUNPCKHBWrr:
5008  case X86::VPUNPCKLBWrr:
5009  case X86::VPUNPCKHBWYrr:
5010  case X86::VPUNPCKLBWYrr:
5011  case X86::VPUNPCKHBWZ128rr:
5012  case X86::VPUNPCKLBWZ128rr:
5013  case X86::VPUNPCKHBWZ256rr:
5014  case X86::VPUNPCKLBWZ256rr:
5015  case X86::VPUNPCKHBWZrr:
5016  case X86::VPUNPCKLBWZrr:
5017  case X86::VPUNPCKHWDrr:
5018  case X86::VPUNPCKLWDrr:
5019  case X86::VPUNPCKHWDYrr:
5020  case X86::VPUNPCKLWDYrr:
5021  case X86::VPUNPCKHWDZ128rr:
5022  case X86::VPUNPCKLWDZ128rr:
5023  case X86::VPUNPCKHWDZ256rr:
5024  case X86::VPUNPCKLWDZ256rr:
5025  case X86::VPUNPCKHWDZrr:
5026  case X86::VPUNPCKLWDZrr:
5027  case X86::VPUNPCKHDQrr:
5028  case X86::VPUNPCKLDQrr:
5029  case X86::VPUNPCKHDQYrr:
5030  case X86::VPUNPCKLDQYrr:
5031  case X86::VPUNPCKHDQZ128rr:
5032  case X86::VPUNPCKLDQZ128rr:
5033  case X86::VPUNPCKHDQZ256rr:
5034  case X86::VPUNPCKLDQZ256rr:
5035  case X86::VPUNPCKHDQZrr:
5036  case X86::VPUNPCKLDQZrr:
5037  case X86::VPUNPCKHQDQrr:
5038  case X86::VPUNPCKLQDQrr:
5039  case X86::VPUNPCKHQDQYrr:
5040  case X86::VPUNPCKLQDQYrr:
5041  case X86::VPUNPCKHQDQZ128rr:
5042  case X86::VPUNPCKLQDQZ128rr:
5043  case X86::VPUNPCKHQDQZ256rr:
5044  case X86::VPUNPCKLQDQZ256rr:
5045  case X86::VPUNPCKHQDQZrr:
5046  case X86::VPUNPCKLQDQZrr:
5047  // These instructions are sometimes used with an undef first or second
5048  // source. Return true here so BreakFalseDeps will assign this source to the
5049  // same register as the first source to avoid a false dependency.
5050  return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
5051 
5052  case X86::VCVTSI2SSrr:
5053  case X86::VCVTSI2SSrm:
5054  case X86::VCVTSI2SSrr_Int:
5055  case X86::VCVTSI2SSrm_Int:
5056  case X86::VCVTSI642SSrr:
5057  case X86::VCVTSI642SSrm:
5058  case X86::VCVTSI642SSrr_Int:
5059  case X86::VCVTSI642SSrm_Int:
5060  case X86::VCVTSI2SDrr:
5061  case X86::VCVTSI2SDrm:
5062  case X86::VCVTSI2SDrr_Int:
5063  case X86::VCVTSI2SDrm_Int:
5064  case X86::VCVTSI642SDrr:
5065  case X86::VCVTSI642SDrm:
5066  case X86::VCVTSI642SDrr_Int:
5067  case X86::VCVTSI642SDrm_Int:
5068  // AVX-512
5069  case X86::VCVTSI2SSZrr:
5070  case X86::VCVTSI2SSZrm:
5071  case X86::VCVTSI2SSZrr_Int:
5072  case X86::VCVTSI2SSZrrb_Int:
5073  case X86::VCVTSI2SSZrm_Int:
5074  case X86::VCVTSI642SSZrr:
5075  case X86::VCVTSI642SSZrm:
5076  case X86::VCVTSI642SSZrr_Int:
5077  case X86::VCVTSI642SSZrrb_Int:
5078  case X86::VCVTSI642SSZrm_Int:
5079  case X86::VCVTSI2SDZrr:
5080  case X86::VCVTSI2SDZrm:
5081  case X86::VCVTSI2SDZrr_Int:
5082  case X86::VCVTSI2SDZrm_Int:
5083  case X86::VCVTSI642SDZrr:
5084  case X86::VCVTSI642SDZrm:
5085  case X86::VCVTSI642SDZrr_Int:
5086  case X86::VCVTSI642SDZrrb_Int:
5087  case X86::VCVTSI642SDZrm_Int:
5088  case X86::VCVTUSI2SSZrr:
5089  case X86::VCVTUSI2SSZrm:
5090  case X86::VCVTUSI2SSZrr_Int:
5091  case X86::VCVTUSI2SSZrrb_Int:
5092  case X86::VCVTUSI2SSZrm_Int:
5093  case X86::VCVTUSI642SSZrr:
5094  case X86::VCVTUSI642SSZrm:
5095  case X86::VCVTUSI642SSZrr_Int:
5096  case X86::VCVTUSI642SSZrrb_Int:
5097  case X86::VCVTUSI642SSZrm_Int:
5098  case X86::VCVTUSI2SDZrr:
5099  case X86::VCVTUSI2SDZrm:
5100  case X86::VCVTUSI2SDZrr_Int:
5101  case X86::VCVTUSI2SDZrm_Int:
5102  case X86::VCVTUSI642SDZrr:
5103  case X86::VCVTUSI642SDZrm:
5104  case X86::VCVTUSI642SDZrr_Int:
5105  case X86::VCVTUSI642SDZrrb_Int:
5106  case X86::VCVTUSI642SDZrm_Int:
5107  // Load folding won't affect the undef register update since the input is
5108  // a GPR.
5109  return OpNum == 1 && !ForLoadFold;
5110  case X86::VCVTSD2SSrr:
5111  case X86::VCVTSD2SSrm:
5112  case X86::VCVTSD2SSrr_Int:
5113  case X86::VCVTSD2SSrm_Int:
5114  case X86::VCVTSS2SDrr:
5115  case X86::VCVTSS2SDrm:
5116  case X86::VCVTSS2SDrr_Int:
5117  case X86::VCVTSS2SDrm_Int:
5118  case X86::VRCPSSr:
5119  case X86::VRCPSSr_Int:
5120  case X86::VRCPSSm:
5121  case X86::VRCPSSm_Int:
5122  case X86::VROUNDSDr:
5123  case X86::VROUNDSDm:
5124  case X86::VROUNDSDr_Int:
5125  case X86::VROUNDSDm_Int:
5126  case X86::VROUNDSSr:
5127  case X86::VROUNDSSm:
5128  case X86::VROUNDSSr_Int:
5129  case X86::VROUNDSSm_Int:
5130  case X86::VRSQRTSSr:
5131  case X86::VRSQRTSSr_Int:
5132  case X86::VRSQRTSSm:
5133  case X86::VRSQRTSSm_Int:
5134  case X86::VSQRTSSr:
5135  case X86::VSQRTSSr_Int:
5136  case X86::VSQRTSSm:
5137  case X86::VSQRTSSm_Int:
5138  case X86::VSQRTSDr:
5139  case X86::VSQRTSDr_Int:
5140  case X86::VSQRTSDm:
5141  case X86::VSQRTSDm_Int:
5142  // AVX-512
5143  case X86::VCVTSD2SSZrr:
5144  case X86::VCVTSD2SSZrr_Int:
5145  case X86::VCVTSD2SSZrrb_Int:
5146  case X86::VCVTSD2SSZrm:
5147  case X86::VCVTSD2SSZrm_Int:
5148  case X86::VCVTSS2SDZrr:
5149  case X86::VCVTSS2SDZrr_Int:
5150  case X86::VCVTSS2SDZrrb_Int:
5151  case X86::VCVTSS2SDZrm:
5152  case X86::VCVTSS2SDZrm_Int:
5153  case X86::VGETEXPSDZr:
5154  case X86::VGETEXPSDZrb:
5155  case X86::VGETEXPSDZm:
5156  case X86::VGETEXPSSZr:
5157  case X86::VGETEXPSSZrb:
5158  case X86::VGETEXPSSZm:
5159  case X86::VGETMANTSDZrri:
5160  case X86::VGETMANTSDZrrib:
5161  case X86::VGETMANTSDZrmi:
5162  case X86::VGETMANTSSZrri:
5163  case X86::VGETMANTSSZrrib:
5164  case X86::VGETMANTSSZrmi:
5165  case X86::VRNDSCALESDZr:
5166  case X86::VRNDSCALESDZr_Int:
5167  case X86::VRNDSCALESDZrb_Int:
5168  case X86::VRNDSCALESDZm:
5169  case X86::VRNDSCALESDZm_Int:
5170  case X86::VRNDSCALESSZr:
5171  case X86::VRNDSCALESSZr_Int:
5172  case X86::VRNDSCALESSZrb_Int:
5173  case X86::VRNDSCALESSZm:
5174  case X86::VRNDSCALESSZm_Int:
5175  case X86::VRCP14SDZrr:
5176  case X86::VRCP14SDZrm:
5177  case X86::VRCP14SSZrr:
5178  case X86::VRCP14SSZrm:
5179  case X86::VRCP28SDZr:
5180  case X86::VRCP28SDZrb:
5181  case X86::VRCP28SDZm:
5182  case X86::VRCP28SSZr:
5183  case X86::VRCP28SSZrb:
5184  case X86::VRCP28SSZm:
5185  case X86::VREDUCESSZrmi:
5186  case X86::VREDUCESSZrri:
5187  case X86::VREDUCESSZrrib:
5188  case X86::VRSQRT14SDZrr:
5189  case X86::VRSQRT14SDZrm:
5190  case X86::VRSQRT14SSZrr:
5191  case X86::VRSQRT14SSZrm:
5192  case X86::VRSQRT28SDZr:
5193  case X86::VRSQRT28SDZrb:
5194  case X86::VRSQRT28SDZm:
5195  case X86::VRSQRT28SSZr:
5196  case X86::VRSQRT28SSZrb:
5197  case X86::VRSQRT28SSZm:
5198  case X86::VSQRTSSZr:
5199  case X86::VSQRTSSZr_Int:
5200  case X86::VSQRTSSZrb_Int:
5201  case X86::VSQRTSSZm:
5202  case X86::VSQRTSSZm_Int:
5203  case X86::VSQRTSDZr:
5204  case X86::VSQRTSDZr_Int:
5205  case X86::VSQRTSDZrb_Int:
5206  case X86::VSQRTSDZm:
5207  case X86::VSQRTSDZm_Int:
5208  return OpNum == 1;
5209  case X86::VMOVSSZrrk:
5210  case X86::VMOVSDZrrk:
5211  return OpNum == 3 && !ForLoadFold;
5212  case X86::VMOVSSZrrkz:
5213  case X86::VMOVSDZrrkz:
5214  return OpNum == 2 && !ForLoadFold;
5215  }
5216 
5217  return false;
5218 }
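// Illustrative sketch (hypothetical registers, MIR-like notation) of the
// undef-input case handled above:
//
//   %xmm0 = VPUNPCKLBWrr %xmm1, undef %xmm3
//
// still makes the CPU wait on the last writer of %xmm3. Because OpNum 2
// reports an undef update here, BreakFalseDeps can reuse the other input
// instead:
//
//   %xmm0 = VPUNPCKLBWrr %xmm1, undef %xmm1
//
// which carries no false dependence.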
5219 
5220 /// Inform the BreakFalseDeps pass how many idle instructions we would like
5221 /// before certain undef register reads.
5222 ///
5223 /// This catches the VCVTSI2SD family of instructions:
5224 ///
5225 /// vcvtsi2sdq %rax, undef %xmm0, %xmm14
5226 ///
5227 /// We should be careful *not* to catch VXOR idioms which are presumably
5228 /// handled specially in the pipeline:
5229 ///
5230 /// vxorps undef %xmm1, undef %xmm1, %xmm1
5231 ///
5232 /// Like getPartialRegUpdateClearance, this makes a strong assumption that the
5233 /// high bits that are passed-through are not live.
5234 unsigned
5235 X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
5236  const TargetRegisterInfo *TRI) const {
5237  const MachineOperand &MO = MI.getOperand(OpNum);
5238  if (Register::isPhysicalRegister(MO.getReg()) &&
5239  hasUndefRegUpdate(MI.getOpcode(), OpNum))
5240  return UndefRegClearance;
5241 
5242  return 0;
5243 }
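// Continuing the vcvtsi2sdq example above (an illustrative reading of how
// BreakFalseDeps reacts, not a definition of that pass): if %xmm0 was written
// within the last UndefRegClearance instructions, the pass may either pick a
// different, long-idle xmm register for the undef operand or fall back to
// breakPartialRegDependency (below), which would emit e.g.
//
//   vxorps %xmm0, %xmm0, %xmm0
//
// so the pass-through read of %xmm0 is no longer a real dependence.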
5244 
5245 void X86InstrInfo::breakPartialRegDependency(
5246  MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
5247  Register Reg = MI.getOperand(OpNum).getReg();
5248  // If MI kills this register, the false dependence is already broken.
5249  if (MI.killsRegister(Reg, TRI))
5250  return;
5251 
5252  if (X86::VR128RegClass.contains(Reg)) {
5253  // These instructions are all floating point domain, so xorps is the best
5254  // choice.
5255  unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
5256  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
5257  .addReg(Reg, RegState::Undef)
5258  .addReg(Reg, RegState::Undef);
5259  MI.addRegisterKilled(Reg, TRI, true);
5260  } else if (X86::VR256RegClass.contains(Reg)) {
5261  // Use vxorps to clear the full ymm register.
5262  // It wants to read and write the xmm sub-register.
5263  Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
5264  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
5265  .addReg(XReg, RegState::Undef)
5266  .addReg(XReg, RegState::Undef)
5267  .addReg(Reg, RegState::ImplicitDefine);
5268  MI.addRegisterKilled(Reg, TRI, true);
5269  } else if (X86::GR64RegClass.contains(Reg)) {
5270  // Use XOR32rr because it has a shorter encoding and also zeros the upper 32
5271  // bits.
5272  Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
5273  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
5274  .addReg(XReg, RegState::Undef)
5275  .addReg(XReg, RegState::Undef)
5276  .addReg(Reg, RegState::ImplicitDefine);
5277  MI.addRegisterKilled(Reg, TRI, true);
5278  } else if (X86::GR32RegClass.contains(Reg)) {
5279