1 //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the X86 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "X86InstrInfo.h"
14 #include "X86.h"
15 #include "X86InstrBuilder.h"
16 #include "X86InstrFoldTables.h"
17 #include "X86MachineFunctionInfo.h"
18 #include "X86Subtarget.h"
19 #include "X86TargetMachine.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/Sequence.h"
30 #include "llvm/CodeGen/StackMaps.h"
32 #include "llvm/IR/DerivedTypes.h"
33 #include "llvm/IR/Function.h"
34 #include "llvm/MC/MCAsmInfo.h"
35 #include "llvm/MC/MCExpr.h"
36 #include "llvm/MC/MCInst.h"
38 #include "llvm/Support/Debug.h"
42 
43 using namespace llvm;
44 
45 #define DEBUG_TYPE "x86-instr-info"
46 
47 #define GET_INSTRINFO_CTOR_DTOR
48 #include "X86GenInstrInfo.inc"
49 
50 static cl::opt<bool>
51  NoFusing("disable-spill-fusing",
52  cl::desc("Disable fusing of spill code into instructions"),
53  cl::Hidden);
54 static cl::opt<bool>
55 PrintFailedFusing("print-failed-fuse-candidates",
56  cl::desc("Print instructions that the allocator wants to"
57  " fuse, but the X86 backend currently can't"),
58  cl::Hidden);
59 static cl::opt<bool>
60 ReMatPICStubLoad("remat-pic-stub-load",
61  cl::desc("Re-materialize load from stub in PIC mode"),
62  cl::init(false), cl::Hidden);
63 static cl::opt<unsigned>
64 PartialRegUpdateClearance("partial-reg-update-clearance",
65  cl::desc("Clearance between two register writes "
66  "for inserting XOR to avoid partial "
67  "register update"),
68  cl::init(64), cl::Hidden);
69 static cl::opt<unsigned>
70 UndefRegClearance("undef-reg-clearance",
71  cl::desc("How many idle instructions we would like before "
72  "certain undef register reads"),
73  cl::init(128), cl::Hidden);
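// These cl::opt knobs are developer options; cl::Hidden only hides them from
// -help, so they can still be set on a built llc command line. An illustrative
// invocation (the input file name is hypothetical):
//   llc -mtriple=x86_64-unknown-linux-gnu -remat-pic-stub-load \
//       -partial-reg-update-clearance=32 -undef-reg-clearance=64 test.ll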
74 
75 
76 // Pin the vtable to this file.
77 void X86InstrInfo::anchor() {}
78 
79 X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
80  : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
81  : X86::ADJCALLSTACKDOWN32),
82  (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
83  : X86::ADJCALLSTACKUP32),
84  X86::CATCHRET,
85  (STI.is64Bit() ? X86::RETQ : X86::RETL)),
86  Subtarget(STI), RI(STI.getTargetTriple()) {
87 }
88 
89 bool
90 X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
91  Register &SrcReg, Register &DstReg,
92  unsigned &SubIdx) const {
93  switch (MI.getOpcode()) {
94  default: break;
95  case X86::MOVSX16rr8:
96  case X86::MOVZX16rr8:
97  case X86::MOVSX32rr8:
98  case X86::MOVZX32rr8:
99  case X86::MOVSX64rr8:
100  if (!Subtarget.is64Bit())
101  // It's not always legal to reference the low 8 bits of the larger
102  // register in 32-bit mode.
103  return false;
105  case X86::MOVSX32rr16:
106  case X86::MOVZX32rr16:
107  case X86::MOVSX64rr16:
108  case X86::MOVSX64rr32: {
109  if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
110  // Be conservative.
111  return false;
112  SrcReg = MI.getOperand(1).getReg();
113  DstReg = MI.getOperand(0).getReg();
114  switch (MI.getOpcode()) {
115  default: llvm_unreachable("Unreachable!");
116  case X86::MOVSX16rr8:
117  case X86::MOVZX16rr8:
118  case X86::MOVSX32rr8:
119  case X86::MOVZX32rr8:
120  case X86::MOVSX64rr8:
121  SubIdx = X86::sub_8bit;
122  break;
123  case X86::MOVSX32rr16:
124  case X86::MOVZX32rr16:
125  case X86::MOVSX64rr16:
126  SubIdx = X86::sub_16bit;
127  break;
128  case X86::MOVSX64rr32:
129  SubIdx = X86::sub_32bit;
130  break;
131  }
132  return true;
133  }
134  }
135  return false;
136 }
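// Illustrative example (virtual register names assumed): for
//   %1:gr32 = MOVZX32rr16 %0:gr16
// this reports SrcReg = %0, DstReg = %1 and SubIdx = sub_16bit, i.e. the source
// value is also available as %1.sub_16bit, which lets the register coalescer
// fold away copies of that sub-register.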
137 
138 bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
139  switch (MI.getOpcode()) {
140  default:
141  // By default, assume that the instruction is not data invariant.
142  return false;
143 
144  // Some target-independent operations that trivially lower to data-invariant
145  // instructions.
146  case TargetOpcode::COPY:
147  case TargetOpcode::INSERT_SUBREG:
148  case TargetOpcode::SUBREG_TO_REG:
149  return true;
150 
151  // On x86 it is believed that imul is constant time w.r.t. its inputs.
152  // However, it sets flags and is perhaps the most surprisingly constant-
153  // time operation, so we call it out here separately.
154  case X86::IMUL16rr:
155  case X86::IMUL16rri8:
156  case X86::IMUL16rri:
157  case X86::IMUL32rr:
158  case X86::IMUL32rri8:
159  case X86::IMUL32rri:
160  case X86::IMUL64rr:
161  case X86::IMUL64rri32:
162  case X86::IMUL64rri8:
163 
164  // Bit scanning and counting instructions are somewhat surprising: they scan
165  // across bits and do other fairly complex operations like popcnt, yet they
166  // are believed to be constant time on x86.
167  // However, these set flags.
168  case X86::BSF16rr:
169  case X86::BSF32rr:
170  case X86::BSF64rr:
171  case X86::BSR16rr:
172  case X86::BSR32rr:
173  case X86::BSR64rr:
174  case X86::LZCNT16rr:
175  case X86::LZCNT32rr:
176  case X86::LZCNT64rr:
177  case X86::POPCNT16rr:
178  case X86::POPCNT32rr:
179  case X86::POPCNT64rr:
180  case X86::TZCNT16rr:
181  case X86::TZCNT32rr:
182  case X86::TZCNT64rr:
183 
184  // Bit manipulation instructions are effectively combinations of basic
185  // arithmetic ops, and should still execute in constant time. These also
186  // set flags.
187  case X86::BLCFILL32rr:
188  case X86::BLCFILL64rr:
189  case X86::BLCI32rr:
190  case X86::BLCI64rr:
191  case X86::BLCIC32rr:
192  case X86::BLCIC64rr:
193  case X86::BLCMSK32rr:
194  case X86::BLCMSK64rr:
195  case X86::BLCS32rr:
196  case X86::BLCS64rr:
197  case X86::BLSFILL32rr:
198  case X86::BLSFILL64rr:
199  case X86::BLSI32rr:
200  case X86::BLSI64rr:
201  case X86::BLSIC32rr:
202  case X86::BLSIC64rr:
203  case X86::BLSMSK32rr:
204  case X86::BLSMSK64rr:
205  case X86::BLSR32rr:
206  case X86::BLSR64rr:
207  case X86::TZMSK32rr:
208  case X86::TZMSK64rr:
209 
210  // Bit extracting and clearing instructions should execute in constant time,
211  // and set flags.
212  case X86::BEXTR32rr:
213  case X86::BEXTR64rr:
214  case X86::BEXTRI32ri:
215  case X86::BEXTRI64ri:
216  case X86::BZHI32rr:
217  case X86::BZHI64rr:
218 
219  // Shift and rotate.
220  case X86::ROL8r1:
221  case X86::ROL16r1:
222  case X86::ROL32r1:
223  case X86::ROL64r1:
224  case X86::ROL8rCL:
225  case X86::ROL16rCL:
226  case X86::ROL32rCL:
227  case X86::ROL64rCL:
228  case X86::ROL8ri:
229  case X86::ROL16ri:
230  case X86::ROL32ri:
231  case X86::ROL64ri:
232  case X86::ROR8r1:
233  case X86::ROR16r1:
234  case X86::ROR32r1:
235  case X86::ROR64r1:
236  case X86::ROR8rCL:
237  case X86::ROR16rCL:
238  case X86::ROR32rCL:
239  case X86::ROR64rCL:
240  case X86::ROR8ri:
241  case X86::ROR16ri:
242  case X86::ROR32ri:
243  case X86::ROR64ri:
244  case X86::SAR8r1:
245  case X86::SAR16r1:
246  case X86::SAR32r1:
247  case X86::SAR64r1:
248  case X86::SAR8rCL:
249  case X86::SAR16rCL:
250  case X86::SAR32rCL:
251  case X86::SAR64rCL:
252  case X86::SAR8ri:
253  case X86::SAR16ri:
254  case X86::SAR32ri:
255  case X86::SAR64ri:
256  case X86::SHL8r1:
257  case X86::SHL16r1:
258  case X86::SHL32r1:
259  case X86::SHL64r1:
260  case X86::SHL8rCL:
261  case X86::SHL16rCL:
262  case X86::SHL32rCL:
263  case X86::SHL64rCL:
264  case X86::SHL8ri:
265  case X86::SHL16ri:
266  case X86::SHL32ri:
267  case X86::SHL64ri:
268  case X86::SHR8r1:
269  case X86::SHR16r1:
270  case X86::SHR32r1:
271  case X86::SHR64r1:
272  case X86::SHR8rCL:
273  case X86::SHR16rCL:
274  case X86::SHR32rCL:
275  case X86::SHR64rCL:
276  case X86::SHR8ri:
277  case X86::SHR16ri:
278  case X86::SHR32ri:
279  case X86::SHR64ri:
280  case X86::SHLD16rrCL:
281  case X86::SHLD32rrCL:
282  case X86::SHLD64rrCL:
283  case X86::SHLD16rri8:
284  case X86::SHLD32rri8:
285  case X86::SHLD64rri8:
286  case X86::SHRD16rrCL:
287  case X86::SHRD32rrCL:
288  case X86::SHRD64rrCL:
289  case X86::SHRD16rri8:
290  case X86::SHRD32rri8:
291  case X86::SHRD64rri8:
292 
293  // Basic arithmetic is constant time on the input but does set flags.
294  case X86::ADC8rr:
295  case X86::ADC8ri:
296  case X86::ADC16rr:
297  case X86::ADC16ri:
298  case X86::ADC16ri8:
299  case X86::ADC32rr:
300  case X86::ADC32ri:
301  case X86::ADC32ri8:
302  case X86::ADC64rr:
303  case X86::ADC64ri8:
304  case X86::ADC64ri32:
305  case X86::ADD8rr:
306  case X86::ADD8ri:
307  case X86::ADD16rr:
308  case X86::ADD16ri:
309  case X86::ADD16ri8:
310  case X86::ADD32rr:
311  case X86::ADD32ri:
312  case X86::ADD32ri8:
313  case X86::ADD64rr:
314  case X86::ADD64ri8:
315  case X86::ADD64ri32:
316  case X86::AND8rr:
317  case X86::AND8ri:
318  case X86::AND16rr:
319  case X86::AND16ri:
320  case X86::AND16ri8:
321  case X86::AND32rr:
322  case X86::AND32ri:
323  case X86::AND32ri8:
324  case X86::AND64rr:
325  case X86::AND64ri8:
326  case X86::AND64ri32:
327  case X86::OR8rr:
328  case X86::OR8ri:
329  case X86::OR16rr:
330  case X86::OR16ri:
331  case X86::OR16ri8:
332  case X86::OR32rr:
333  case X86::OR32ri:
334  case X86::OR32ri8:
335  case X86::OR64rr:
336  case X86::OR64ri8:
337  case X86::OR64ri32:
338  case X86::SBB8rr:
339  case X86::SBB8ri:
340  case X86::SBB16rr:
341  case X86::SBB16ri:
342  case X86::SBB16ri8:
343  case X86::SBB32rr:
344  case X86::SBB32ri:
345  case X86::SBB32ri8:
346  case X86::SBB64rr:
347  case X86::SBB64ri8:
348  case X86::SBB64ri32:
349  case X86::SUB8rr:
350  case X86::SUB8ri:
351  case X86::SUB16rr:
352  case X86::SUB16ri:
353  case X86::SUB16ri8:
354  case X86::SUB32rr:
355  case X86::SUB32ri:
356  case X86::SUB32ri8:
357  case X86::SUB64rr:
358  case X86::SUB64ri8:
359  case X86::SUB64ri32:
360  case X86::XOR8rr:
361  case X86::XOR8ri:
362  case X86::XOR16rr:
363  case X86::XOR16ri:
364  case X86::XOR16ri8:
365  case X86::XOR32rr:
366  case X86::XOR32ri:
367  case X86::XOR32ri8:
368  case X86::XOR64rr:
369  case X86::XOR64ri8:
370  case X86::XOR64ri32:
371  // Arithmetic with just 32-bit and 64-bit variants and no immediates.
372  case X86::ADCX32rr:
373  case X86::ADCX64rr:
374  case X86::ADOX32rr:
375  case X86::ADOX64rr:
376  case X86::ANDN32rr:
377  case X86::ANDN64rr:
378  // Unary arithmetic operations.
379  case X86::DEC8r:
380  case X86::DEC16r:
381  case X86::DEC32r:
382  case X86::DEC64r:
383  case X86::INC8r:
384  case X86::INC16r:
385  case X86::INC32r:
386  case X86::INC64r:
387  case X86::NEG8r:
388  case X86::NEG16r:
389  case X86::NEG32r:
390  case X86::NEG64r:
391 
392  // Unlike other arithmetic, NOT doesn't set EFLAGS.
393  case X86::NOT8r:
394  case X86::NOT16r:
395  case X86::NOT32r:
396  case X86::NOT64r:
397 
398  // Various move instructions used to zero or sign extend things. Note that we
399  // intentionally don't support the _NOREX variants as we can't handle that
400  // register constraint anyway.
401  case X86::MOVSX16rr8:
402  case X86::MOVSX32rr8:
403  case X86::MOVSX32rr16:
404  case X86::MOVSX64rr8:
405  case X86::MOVSX64rr16:
406  case X86::MOVSX64rr32:
407  case X86::MOVZX16rr8:
408  case X86::MOVZX32rr8:
409  case X86::MOVZX32rr16:
410  case X86::MOVZX64rr8:
411  case X86::MOVZX64rr16:
412  case X86::MOV32rr:
413 
414  // Arithmetic instructions that are both constant time and don't set flags.
415  case X86::RORX32ri:
416  case X86::RORX64ri:
417  case X86::SARX32rr:
418  case X86::SARX64rr:
419  case X86::SHLX32rr:
420  case X86::SHLX64rr:
421  case X86::SHRX32rr:
422  case X86::SHRX64rr:
423 
424  // LEA doesn't actually access memory, and its arithmetic is constant time.
425  case X86::LEA16r:
426  case X86::LEA32r:
427  case X86::LEA64_32r:
428  case X86::LEA64r:
429  return true;
430  }
431 }
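// For example, IMUL64rr and LEA64r are reported data invariant above, whereas an
// opcode such as DIV64r hits the default case: division latency depends on the
// operand values, so a pass like speculative load hardening must not treat it as
// safe to run on secret data.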
432 
433 bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
434  switch (MI.getOpcode()) {
435  default:
436  // By default, assume that the load will immediately leak.
437  return false;
438 
439  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
440  // However, it sets flags and is perhaps the most surprisingly constant-
441  // time operation, so we call it out here separately.
442  case X86::IMUL16rm:
443  case X86::IMUL16rmi8:
444  case X86::IMUL16rmi:
445  case X86::IMUL32rm:
446  case X86::IMUL32rmi8:
447  case X86::IMUL32rmi:
448  case X86::IMUL64rm:
449  case X86::IMUL64rmi32:
450  case X86::IMUL64rmi8:
451 
452  // Bit scanning and counting instructions are somewhat surprising: they scan
453  // across bits and do other fairly complex operations like popcnt, yet they
454  // are believed to be constant time on x86.
455  // However, these set flags.
456  case X86::BSF16rm:
457  case X86::BSF32rm:
458  case X86::BSF64rm:
459  case X86::BSR16rm:
460  case X86::BSR32rm:
461  case X86::BSR64rm:
462  case X86::LZCNT16rm:
463  case X86::LZCNT32rm:
464  case X86::LZCNT64rm:
465  case X86::POPCNT16rm:
466  case X86::POPCNT32rm:
467  case X86::POPCNT64rm:
468  case X86::TZCNT16rm:
469  case X86::TZCNT32rm:
470  case X86::TZCNT64rm:
471 
472  // Bit manipulation instructions are effectively combinations of basic
473  // arithmetic ops, and should still execute in constant time. These also
474  // set flags.
475  case X86::BLCFILL32rm:
476  case X86::BLCFILL64rm:
477  case X86::BLCI32rm:
478  case X86::BLCI64rm:
479  case X86::BLCIC32rm:
480  case X86::BLCIC64rm:
481  case X86::BLCMSK32rm:
482  case X86::BLCMSK64rm:
483  case X86::BLCS32rm:
484  case X86::BLCS64rm:
485  case X86::BLSFILL32rm:
486  case X86::BLSFILL64rm:
487  case X86::BLSI32rm:
488  case X86::BLSI64rm:
489  case X86::BLSIC32rm:
490  case X86::BLSIC64rm:
491  case X86::BLSMSK32rm:
492  case X86::BLSMSK64rm:
493  case X86::BLSR32rm:
494  case X86::BLSR64rm:
495  case X86::TZMSK32rm:
496  case X86::TZMSK64rm:
497 
498  // Bit extracting and clearing instructions should execute in constant time,
499  // and set flags.
500  case X86::BEXTR32rm:
501  case X86::BEXTR64rm:
502  case X86::BEXTRI32mi:
503  case X86::BEXTRI64mi:
504  case X86::BZHI32rm:
505  case X86::BZHI64rm:
506 
507  // Basic arithmetic is constant time on the input but does set flags.
508  case X86::ADC8rm:
509  case X86::ADC16rm:
510  case X86::ADC32rm:
511  case X86::ADC64rm:
512  case X86::ADCX32rm:
513  case X86::ADCX64rm:
514  case X86::ADD8rm:
515  case X86::ADD16rm:
516  case X86::ADD32rm:
517  case X86::ADD64rm:
518  case X86::ADOX32rm:
519  case X86::ADOX64rm:
520  case X86::AND8rm:
521  case X86::AND16rm:
522  case X86::AND32rm:
523  case X86::AND64rm:
524  case X86::ANDN32rm:
525  case X86::ANDN64rm:
526  case X86::OR8rm:
527  case X86::OR16rm:
528  case X86::OR32rm:
529  case X86::OR64rm:
530  case X86::SBB8rm:
531  case X86::SBB16rm:
532  case X86::SBB32rm:
533  case X86::SBB64rm:
534  case X86::SUB8rm:
535  case X86::SUB16rm:
536  case X86::SUB32rm:
537  case X86::SUB64rm:
538  case X86::XOR8rm:
539  case X86::XOR16rm:
540  case X86::XOR32rm:
541  case X86::XOR64rm:
542 
543  // Integer multiply w/o affecting flags is still believed to be constant
544  // time on x86. Called out separately as this is among the most surprising
545  // instructions to exhibit that behavior.
546  case X86::MULX32rm:
547  case X86::MULX64rm:
548 
549  // Arithmetic instructions that are both constant time and don't set flags.
550  case X86::RORX32mi:
551  case X86::RORX64mi:
552  case X86::SARX32rm:
553  case X86::SARX64rm:
554  case X86::SHLX32rm:
555  case X86::SHLX64rm:
556  case X86::SHRX32rm:
557  case X86::SHRX64rm:
558 
559  // Conversions are believed to be constant time and don't set flags.
560  case X86::CVTTSD2SI64rm:
561  case X86::VCVTTSD2SI64rm:
562  case X86::VCVTTSD2SI64Zrm:
563  case X86::CVTTSD2SIrm:
564  case X86::VCVTTSD2SIrm:
565  case X86::VCVTTSD2SIZrm:
566  case X86::CVTTSS2SI64rm:
567  case X86::VCVTTSS2SI64rm:
568  case X86::VCVTTSS2SI64Zrm:
569  case X86::CVTTSS2SIrm:
570  case X86::VCVTTSS2SIrm:
571  case X86::VCVTTSS2SIZrm:
572  case X86::CVTSI2SDrm:
573  case X86::VCVTSI2SDrm:
574  case X86::VCVTSI2SDZrm:
575  case X86::CVTSI2SSrm:
576  case X86::VCVTSI2SSrm:
577  case X86::VCVTSI2SSZrm:
578  case X86::CVTSI642SDrm:
579  case X86::VCVTSI642SDrm:
580  case X86::VCVTSI642SDZrm:
581  case X86::CVTSI642SSrm:
582  case X86::VCVTSI642SSrm:
583  case X86::VCVTSI642SSZrm:
584  case X86::CVTSS2SDrm:
585  case X86::VCVTSS2SDrm:
586  case X86::VCVTSS2SDZrm:
587  case X86::CVTSD2SSrm:
588  case X86::VCVTSD2SSrm:
589  case X86::VCVTSD2SSZrm:
590  // AVX512 added unsigned integer conversions.
591  case X86::VCVTTSD2USI64Zrm:
592  case X86::VCVTTSD2USIZrm:
593  case X86::VCVTTSS2USI64Zrm:
594  case X86::VCVTTSS2USIZrm:
595  case X86::VCVTUSI2SDZrm:
596  case X86::VCVTUSI642SDZrm:
597  case X86::VCVTUSI2SSZrm:
598  case X86::VCVTUSI642SSZrm:
599 
600  // Loads to register don't set flags.
601  case X86::MOV8rm:
602  case X86::MOV8rm_NOREX:
603  case X86::MOV16rm:
604  case X86::MOV32rm:
605  case X86::MOV64rm:
606  case X86::MOVSX16rm8:
607  case X86::MOVSX32rm16:
608  case X86::MOVSX32rm8:
609  case X86::MOVSX32rm8_NOREX:
610  case X86::MOVSX64rm16:
611  case X86::MOVSX64rm32:
612  case X86::MOVSX64rm8:
613  case X86::MOVZX16rm8:
614  case X86::MOVZX32rm16:
615  case X86::MOVZX32rm8:
616  case X86::MOVZX32rm8_NOREX:
617  case X86::MOVZX64rm16:
618  case X86::MOVZX64rm8:
619  return true;
620  }
621 }
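// For example, a plain MOV64rm is listed above, so speculative load hardening may
// harden the loaded value after the load rather than hardening the address; any
// opcode not listed falls back to the conservative default and the loaded value
// is assumed to leak immediately.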
622 
623 int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
624  const MachineFunction *MF = MI.getParent()->getParent();
625  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
626 
627  if (isFrameInstr(MI)) {
628  int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
629  SPAdj -= getFrameAdjustment(MI);
630  if (!isFrameSetup(MI))
631  SPAdj = -SPAdj;
632  return SPAdj;
633  }
634 
635  // To know whether a call adjusts the stack, we need information
636  // that is bound to the following ADJCALLSTACKUP pseudo.
637  // Look for the next ADJCALLSTACKUP that follows the call.
638  if (MI.isCall()) {
639  const MachineBasicBlock *MBB = MI.getParent();
640  auto I = ++MachineBasicBlock::const_iterator(MI);
641  for (auto E = MBB->end(); I != E; ++I) {
642  if (I->getOpcode() == getCallFrameDestroyOpcode() ||
643  I->isCall())
644  break;
645  }
646 
647  // If we could not find a frame destroy opcode, then it has already
648  // been simplified, so we don't care.
649  if (I->getOpcode() != getCallFrameDestroyOpcode())
650  return 0;
651 
652  return -(I->getOperand(1).getImm());
653  }
654 
655  // Currently we handle only PUSHes we can reasonably expect to see
656  // in call sequences.
657  switch (MI.getOpcode()) {
658  default:
659  return 0;
660  case X86::PUSH32i8:
661  case X86::PUSH32r:
662  case X86::PUSH32rmm:
663  case X86::PUSH32rmr:
664  case X86::PUSHi32:
665  return 4;
666  case X86::PUSH64i8:
667  case X86::PUSH64r:
668  case X86::PUSH64rmm:
669  case X86::PUSH64rmr:
670  case X86::PUSH64i32:
671  return 8;
672  }
673 }
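// Worked example (assuming a 16-byte stack alignment): the frame-setup pseudo
// "ADJCALLSTACKDOWN64 40, 0, 0" yields alignTo(40, 16) = 48, the matching
// "ADJCALLSTACKUP64 40, 0" yields -48, and a PUSH64r inside the call sequence
// contributes 8.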
674 
675 /// Return true and the FrameIndex if the specified
676 /// operand and following operands form a reference to the stack frame.
677 bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
678  int &FrameIndex) const {
679  if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
680  MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
681  MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
682  MI.getOperand(Op + X86::AddrDisp).isImm() &&
683  MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
684  MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
685  MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
686  FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
687  return true;
688  }
689  return false;
690 }
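// In MIR terms this matches the address operand sequence
//   %stack.N, 1, $noreg, 0, $noreg
// i.e. a frame-index base with scale 1, no index register and zero displacement.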
691 
692 static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
693  switch (Opcode) {
694  default:
695  return false;
696  case X86::MOV8rm:
697  case X86::KMOVBkm:
698  MemBytes = 1;
699  return true;
700  case X86::MOV16rm:
701  case X86::KMOVWkm:
702  MemBytes = 2;
703  return true;
704  case X86::MOV32rm:
705  case X86::MOVSSrm:
706  case X86::MOVSSrm_alt:
707  case X86::VMOVSSrm:
708  case X86::VMOVSSrm_alt:
709  case X86::VMOVSSZrm:
710  case X86::VMOVSSZrm_alt:
711  case X86::KMOVDkm:
712  MemBytes = 4;
713  return true;
714  case X86::MOV64rm:
715  case X86::LD_Fp64m:
716  case X86::MOVSDrm:
717  case X86::MOVSDrm_alt:
718  case X86::VMOVSDrm:
719  case X86::VMOVSDrm_alt:
720  case X86::VMOVSDZrm:
721  case X86::VMOVSDZrm_alt:
722  case X86::MMX_MOVD64rm:
723  case X86::MMX_MOVQ64rm:
724  case X86::KMOVQkm:
725  MemBytes = 8;
726  return true;
727  case X86::MOVAPSrm:
728  case X86::MOVUPSrm:
729  case X86::MOVAPDrm:
730  case X86::MOVUPDrm:
731  case X86::MOVDQArm:
732  case X86::MOVDQUrm:
733  case X86::VMOVAPSrm:
734  case X86::VMOVUPSrm:
735  case X86::VMOVAPDrm:
736  case X86::VMOVUPDrm:
737  case X86::VMOVDQArm:
738  case X86::VMOVDQUrm:
739  case X86::VMOVAPSZ128rm:
740  case X86::VMOVUPSZ128rm:
741  case X86::VMOVAPSZ128rm_NOVLX:
742  case X86::VMOVUPSZ128rm_NOVLX:
743  case X86::VMOVAPDZ128rm:
744  case X86::VMOVUPDZ128rm:
745  case X86::VMOVDQU8Z128rm:
746  case X86::VMOVDQU16Z128rm:
747  case X86::VMOVDQA32Z128rm:
748  case X86::VMOVDQU32Z128rm:
749  case X86::VMOVDQA64Z128rm:
750  case X86::VMOVDQU64Z128rm:
751  MemBytes = 16;
752  return true;
753  case X86::VMOVAPSYrm:
754  case X86::VMOVUPSYrm:
755  case X86::VMOVAPDYrm:
756  case X86::VMOVUPDYrm:
757  case X86::VMOVDQAYrm:
758  case X86::VMOVDQUYrm:
759  case X86::VMOVAPSZ256rm:
760  case X86::VMOVUPSZ256rm:
761  case X86::VMOVAPSZ256rm_NOVLX:
762  case X86::VMOVUPSZ256rm_NOVLX:
763  case X86::VMOVAPDZ256rm:
764  case X86::VMOVUPDZ256rm:
765  case X86::VMOVDQU8Z256rm:
766  case X86::VMOVDQU16Z256rm:
767  case X86::VMOVDQA32Z256rm:
768  case X86::VMOVDQU32Z256rm:
769  case X86::VMOVDQA64Z256rm:
770  case X86::VMOVDQU64Z256rm:
771  MemBytes = 32;
772  return true;
773  case X86::VMOVAPSZrm:
774  case X86::VMOVUPSZrm:
775  case X86::VMOVAPDZrm:
776  case X86::VMOVUPDZrm:
777  case X86::VMOVDQU8Zrm:
778  case X86::VMOVDQU16Zrm:
779  case X86::VMOVDQA32Zrm:
780  case X86::VMOVDQU32Zrm:
781  case X86::VMOVDQA64Zrm:
782  case X86::VMOVDQU64Zrm:
783  MemBytes = 64;
784  return true;
785  }
786 }
787 
788 static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
789  switch (Opcode) {
790  default:
791  return false;
792  case X86::MOV8mr:
793  case X86::KMOVBmk:
794  MemBytes = 1;
795  return true;
796  case X86::MOV16mr:
797  case X86::KMOVWmk:
798  MemBytes = 2;
799  return true;
800  case X86::MOV32mr:
801  case X86::MOVSSmr:
802  case X86::VMOVSSmr:
803  case X86::VMOVSSZmr:
804  case X86::KMOVDmk:
805  MemBytes = 4;
806  return true;
807  case X86::MOV64mr:
808  case X86::ST_FpP64m:
809  case X86::MOVSDmr:
810  case X86::VMOVSDmr:
811  case X86::VMOVSDZmr:
812  case X86::MMX_MOVD64mr:
813  case X86::MMX_MOVQ64mr:
814  case X86::MMX_MOVNTQmr:
815  case X86::KMOVQmk:
816  MemBytes = 8;
817  return true;
818  case X86::MOVAPSmr:
819  case X86::MOVUPSmr:
820  case X86::MOVAPDmr:
821  case X86::MOVUPDmr:
822  case X86::MOVDQAmr:
823  case X86::MOVDQUmr:
824  case X86::VMOVAPSmr:
825  case X86::VMOVUPSmr:
826  case X86::VMOVAPDmr:
827  case X86::VMOVUPDmr:
828  case X86::VMOVDQAmr:
829  case X86::VMOVDQUmr:
830  case X86::VMOVUPSZ128mr:
831  case X86::VMOVAPSZ128mr:
832  case X86::VMOVUPSZ128mr_NOVLX:
833  case X86::VMOVAPSZ128mr_NOVLX:
834  case X86::VMOVUPDZ128mr:
835  case X86::VMOVAPDZ128mr:
836  case X86::VMOVDQA32Z128mr:
837  case X86::VMOVDQU32Z128mr:
838  case X86::VMOVDQA64Z128mr:
839  case X86::VMOVDQU64Z128mr:
840  case X86::VMOVDQU8Z128mr:
841  case X86::VMOVDQU16Z128mr:
842  MemBytes = 16;
843  return true;
844  case X86::VMOVUPSYmr:
845  case X86::VMOVAPSYmr:
846  case X86::VMOVUPDYmr:
847  case X86::VMOVAPDYmr:
848  case X86::VMOVDQUYmr:
849  case X86::VMOVDQAYmr:
850  case X86::VMOVUPSZ256mr:
851  case X86::VMOVAPSZ256mr:
852  case X86::VMOVUPSZ256mr_NOVLX:
853  case X86::VMOVAPSZ256mr_NOVLX:
854  case X86::VMOVUPDZ256mr:
855  case X86::VMOVAPDZ256mr:
856  case X86::VMOVDQU8Z256mr:
857  case X86::VMOVDQU16Z256mr:
858  case X86::VMOVDQA32Z256mr:
859  case X86::VMOVDQU32Z256mr:
860  case X86::VMOVDQA64Z256mr:
861  case X86::VMOVDQU64Z256mr:
862  MemBytes = 32;
863  return true;
864  case X86::VMOVUPSZmr:
865  case X86::VMOVAPSZmr:
866  case X86::VMOVUPDZmr:
867  case X86::VMOVAPDZmr:
868  case X86::VMOVDQU8Zmr:
869  case X86::VMOVDQU16Zmr:
870  case X86::VMOVDQA32Zmr:
871  case X86::VMOVDQU32Zmr:
872  case X86::VMOVDQA64Zmr:
873  case X86::VMOVDQU64Zmr:
874  MemBytes = 64;
875  return true;
876  }
877  return false;
878 }
879 
881  int &FrameIndex) const {
882  unsigned Dummy;
884 }
885 
887  int &FrameIndex,
888  unsigned &MemBytes) const {
889  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
890  if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
891  return MI.getOperand(0).getReg();
892  return 0;
893 }
894 
896  int &FrameIndex) const {
897  unsigned Dummy;
898  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
899  unsigned Reg;
901  return Reg;
902  // Check for post-frame index elimination operations
904  if (hasLoadFromStackSlot(MI, Accesses)) {
905  FrameIndex =
906  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
907  ->getFrameIndex();
908  return MI.getOperand(0).getReg();
909  }
910  }
911  return 0;
912 }
913 
915  int &FrameIndex) const {
916  unsigned Dummy;
918 }
919 
921  int &FrameIndex,
922  unsigned &MemBytes) const {
923  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
924  if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
925  isFrameOperand(MI, 0, FrameIndex))
926  return MI.getOperand(X86::AddrNumOperands).getReg();
927  return 0;
928 }
929 
931  int &FrameIndex) const {
932  unsigned Dummy;
933  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
934  unsigned Reg;
936  return Reg;
937  // Check for post-frame index elimination operations
939  if (hasStoreToStackSlot(MI, Accesses)) {
940  FrameIndex =
941  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
942  ->getFrameIndex();
943  return MI.getOperand(X86::AddrNumOperands).getReg();
944  }
945  }
946  return 0;
947 }
948 
949 /// Return true if the register is a PIC base, i.e. defined by X86::MOVPC32r.
950 static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
951  // Don't waste compile time scanning use-def chains of physregs.
952  if (!BaseReg.isVirtual())
953  return false;
954  bool isPICBase = false;
956  E = MRI.def_instr_end(); I != E; ++I) {
957  MachineInstr *DefMI = &*I;
958  if (DefMI->getOpcode() != X86::MOVPC32r)
959  return false;
960  assert(!isPICBase && "More than one PIC base?");
961  isPICBase = true;
962  }
963  return isPICBase;
964 }
965 
967  AAResults *AA) const {
968  switch (MI.getOpcode()) {
969  default:
970  // This function should only be called for opcodes with the ReMaterializable
971  // flag set.
972  llvm_unreachable("Unknown rematerializable operation!");
973  break;
974 
975  case X86::LOAD_STACK_GUARD:
976  case X86::AVX1_SETALLONES:
977  case X86::AVX2_SETALLONES:
978  case X86::AVX512_128_SET0:
979  case X86::AVX512_256_SET0:
980  case X86::AVX512_512_SET0:
981  case X86::AVX512_512_SETALLONES:
982  case X86::AVX512_FsFLD0SD:
983  case X86::AVX512_FsFLD0SS:
984  case X86::AVX512_FsFLD0F128:
985  case X86::AVX_SET0:
986  case X86::FsFLD0SD:
987  case X86::FsFLD0SS:
988  case X86::FsFLD0F128:
989  case X86::KSET0D:
990  case X86::KSET0Q:
991  case X86::KSET0W:
992  case X86::KSET1D:
993  case X86::KSET1Q:
994  case X86::KSET1W:
995  case X86::MMX_SET0:
996  case X86::MOV32ImmSExti8:
997  case X86::MOV32r0:
998  case X86::MOV32r1:
999  case X86::MOV32r_1:
1000  case X86::MOV32ri64:
1001  case X86::MOV64ImmSExti8:
1002  case X86::V_SET0:
1003  case X86::V_SETALLONES:
1004  case X86::MOV16ri:
1005  case X86::MOV32ri:
1006  case X86::MOV64ri:
1007  case X86::MOV64ri32:
1008  case X86::MOV8ri:
1009  case X86::PTILEZEROV:
1010  return true;
1011 
1012  case X86::MOV8rm:
1013  case X86::MOV8rm_NOREX:
1014  case X86::MOV16rm:
1015  case X86::MOV32rm:
1016  case X86::MOV64rm:
1017  case X86::MOVSSrm:
1018  case X86::MOVSSrm_alt:
1019  case X86::MOVSDrm:
1020  case X86::MOVSDrm_alt:
1021  case X86::MOVAPSrm:
1022  case X86::MOVUPSrm:
1023  case X86::MOVAPDrm:
1024  case X86::MOVUPDrm:
1025  case X86::MOVDQArm:
1026  case X86::MOVDQUrm:
1027  case X86::VMOVSSrm:
1028  case X86::VMOVSSrm_alt:
1029  case X86::VMOVSDrm:
1030  case X86::VMOVSDrm_alt:
1031  case X86::VMOVAPSrm:
1032  case X86::VMOVUPSrm:
1033  case X86::VMOVAPDrm:
1034  case X86::VMOVUPDrm:
1035  case X86::VMOVDQArm:
1036  case X86::VMOVDQUrm:
1037  case X86::VMOVAPSYrm:
1038  case X86::VMOVUPSYrm:
1039  case X86::VMOVAPDYrm:
1040  case X86::VMOVUPDYrm:
1041  case X86::VMOVDQAYrm:
1042  case X86::VMOVDQUYrm:
1043  case X86::MMX_MOVD64rm:
1044  case X86::MMX_MOVQ64rm:
1045  // AVX-512
1046  case X86::VMOVSSZrm:
1047  case X86::VMOVSSZrm_alt:
1048  case X86::VMOVSDZrm:
1049  case X86::VMOVSDZrm_alt:
1050  case X86::VMOVAPDZ128rm:
1051  case X86::VMOVAPDZ256rm:
1052  case X86::VMOVAPDZrm:
1053  case X86::VMOVAPSZ128rm:
1054  case X86::VMOVAPSZ256rm:
1055  case X86::VMOVAPSZ128rm_NOVLX:
1056  case X86::VMOVAPSZ256rm_NOVLX:
1057  case X86::VMOVAPSZrm:
1058  case X86::VMOVDQA32Z128rm:
1059  case X86::VMOVDQA32Z256rm:
1060  case X86::VMOVDQA32Zrm:
1061  case X86::VMOVDQA64Z128rm:
1062  case X86::VMOVDQA64Z256rm:
1063  case X86::VMOVDQA64Zrm:
1064  case X86::VMOVDQU16Z128rm:
1065  case X86::VMOVDQU16Z256rm:
1066  case X86::VMOVDQU16Zrm:
1067  case X86::VMOVDQU32Z128rm:
1068  case X86::VMOVDQU32Z256rm:
1069  case X86::VMOVDQU32Zrm:
1070  case X86::VMOVDQU64Z128rm:
1071  case X86::VMOVDQU64Z256rm:
1072  case X86::VMOVDQU64Zrm:
1073  case X86::VMOVDQU8Z128rm:
1074  case X86::VMOVDQU8Z256rm:
1075  case X86::VMOVDQU8Zrm:
1076  case X86::VMOVUPDZ128rm:
1077  case X86::VMOVUPDZ256rm:
1078  case X86::VMOVUPDZrm:
1079  case X86::VMOVUPSZ128rm:
1080  case X86::VMOVUPSZ256rm:
1081  case X86::VMOVUPSZ128rm_NOVLX:
1082  case X86::VMOVUPSZ256rm_NOVLX:
1083  case X86::VMOVUPSZrm: {
1084  // Loads from constant pools are trivially rematerializable.
1085  if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
1086  MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
1087  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
1088  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
1089  MI.isDereferenceableInvariantLoad(AA)) {
1090  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
1091  if (BaseReg == 0 || BaseReg == X86::RIP)
1092  return true;
1093  // Allow re-materialization of PIC load.
1094  if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
1095  return false;
1096  const MachineFunction &MF = *MI.getParent()->getParent();
1097  const MachineRegisterInfo &MRI = MF.getRegInfo();
1098  return regIsPICBase(BaseReg, MRI);
1099  }
1100  return false;
1101  }
1102 
1103  case X86::LEA32r:
1104  case X86::LEA64r: {
1105  if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
1106  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
1107  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
1108  !MI.getOperand(1 + X86::AddrDisp).isReg()) {
1109  // lea fi#, lea GV, etc. are all rematerializable.
1110  if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
1111  return true;
1112  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
1113  if (BaseReg == 0)
1114  return true;
1115  // Allow re-materialization of lea PICBase + x.
1116  const MachineFunction &MF = *MI.getParent()->getParent();
1117  const MachineRegisterInfo &MRI = MF.getRegInfo();
1118  return regIsPICBase(BaseReg, MRI);
1119  }
1120  return false;
1121  }
1122  }
1123 }
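// Example of the payoff: a value defined by V_SET0 or by a RIP-relative
// constant-pool load such as "MOVAPSrm $rip, 1, $noreg, %const.0, $noreg" need
// not be spilled; the register allocator can simply re-emit the defining
// instruction next to the use because this hook reports it as rematerializable.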
1124 
1127  Register DestReg, unsigned SubIdx,
1128  const MachineInstr &Orig,
1129  const TargetRegisterInfo &TRI) const {
1130  bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
1131  if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
1132  MachineBasicBlock::LQR_Dead) {
1133  // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
1134  // effects.
1135  int Value;
1136  switch (Orig.getOpcode()) {
1137  case X86::MOV32r0: Value = 0; break;
1138  case X86::MOV32r1: Value = 1; break;
1139  case X86::MOV32r_1: Value = -1; break;
1140  default:
1141  llvm_unreachable("Unexpected instruction!");
1142  }
1143 
1144  const DebugLoc &DL = Orig.getDebugLoc();
1145  BuildMI(MBB, I, DL, get(X86::MOV32ri))
1146  .add(Orig.getOperand(0))
1147  .addImm(Value);
1148  } else {
1150  MBB.insert(I, MI);
1151  }
1152 
1153  MachineInstr &NewMI = *std::prev(I);
1154  NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
1155 }
1156 
1157 /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
1159  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1160  MachineOperand &MO = MI.getOperand(i);
1161  if (MO.isReg() && MO.isDef() &&
1162  MO.getReg() == X86::EFLAGS && !MO.isDead()) {
1163  return true;
1164  }
1165  }
1166  return false;
1167 }
1168 
1169 /// Return the truncated shift count for a machine operand.
1170 inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1171  unsigned ShiftAmtOperandIdx) {
1172  // The shift count is six bits with the REX.W prefix and five bits without.
1173  unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1174  unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1175  return Imm & ShiftCountMask;
1176 }
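// For example, a (contrived) SHL32ri with an immediate of 35 yields 35 & 31 = 3
// here, matching how the hardware masks 32-bit shift counts; with REX.W the mask
// is 63 instead.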
1177 
1178 /// Check whether the given shift count can be represented by the scale
1179 /// factor of a LEA instruction.
1180 inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
1181  // Left shift instructions can be transformed into load-effective-address
1182  // instructions if we can encode them appropriately.
1183  // A LEA instruction utilizes a SIB byte to encode its scale factor.
1184  // The SIB.scale field is two bits wide which means that we can encode any
1185  // shift amount less than 4.
1186  return ShAmt < 4 && ShAmt > 0;
1187 }
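// Concretely, a left shift by 1, 2 or 3 maps to a LEA scale of 2, 4 or 8, e.g.
// "shll $3, %eax" can become "leal (,%rax,8), %eax" (modulo register classes),
// whereas a shift by 4 would need scale 16, which the SIB byte cannot encode.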
1188 
1190  unsigned Opc, bool AllowSP, Register &NewSrc,
1191  bool &isKill, MachineOperand &ImplicitOp,
1192  LiveVariables *LV) const {
1193  MachineFunction &MF = *MI.getParent()->getParent();
1194  const TargetRegisterClass *RC;
1195  if (AllowSP) {
1196  RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1197  } else {
1198  RC = Opc != X86::LEA32r ?
1199  &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1200  }
1201  Register SrcReg = Src.getReg();
1202 
1203  // For both LEA64 and LEA32 the register already has essentially the right
1204  // type (32-bit or 64-bit); we may just need to forbid SP.
1205  if (Opc != X86::LEA64_32r) {
1206  NewSrc = SrcReg;
1207  isKill = Src.isKill();
1208  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1209 
1210  if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1211  return false;
1212 
1213  return true;
1214  }
1215 
1216  // This is for LEA64_32r, whose incoming registers are 32-bit. One way or
1217  // another we need to add 64-bit registers to the final MI.
1218  if (SrcReg.isPhysical()) {
1219  ImplicitOp = Src;
1220  ImplicitOp.setImplicit();
1221 
1222  NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
1223  isKill = Src.isKill();
1224  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1225  } else {
1226  // Virtual register of the wrong class; we have to create a temporary 64-bit
1227  // vreg to feed into the LEA.
1228  NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1229  MachineInstr *Copy =
1230  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1231  .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1232  .add(Src);
1233 
1234  // Which is obviously going to be dead after we're done with it.
1235  isKill = true;
1236 
1237  if (LV)
1238  LV->replaceKillInstruction(SrcReg, MI, *Copy);
1239  }
1240 
1241  // We've set all the parameters without issue.
1242  return true;
1243 }
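// Rough sketch of the LEA64_32r case above: a 32-bit virtual source %src is
// widened with a sub-register copy, roughly
//   undef %tmp.sub_32bit:gr64_nosp = COPY %src
// and %tmp (marked kill) is what the caller then feeds into the 64-bit LEA.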
1244 
1245 MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
1246  unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
1247  LiveVariables *LV, bool Is8BitOp) const {
1248  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1249  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
1250  assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1251  *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1252  "Unexpected type for LEA transform");
1253 
1254  // TODO: For a 32-bit target, we need to adjust the LEA variables with
1255  // something like this:
1256  // Opcode = X86::LEA32r;
1257  // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1258  // OutRegLEA =
1259  // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1260  // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1261  if (!Subtarget.is64Bit())
1262  return nullptr;
1263 
1264  unsigned Opcode = X86::LEA64_32r;
1265  Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1266  Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1267 
1268  // Build and insert into an implicit UNDEF value. This is OK because
1269  // we will be shifting and then extracting the lower 8/16-bits.
1270  // This has the potential to cause a partial register stall, e.g.
1271  // movw (%rbp,%rcx,2), %dx
1272  // leal -65(%rdx), %esi
1273  // But testing has shown this *does* help performance in 64-bit mode (at
1274  // least on modern x86 machines).
1275  MachineBasicBlock::iterator MBBI = MI.getIterator();
1276  Register Dest = MI.getOperand(0).getReg();
1277  Register Src = MI.getOperand(1).getReg();
1278  bool IsDead = MI.getOperand(0).isDead();
1279  bool IsKill = MI.getOperand(1).isKill();
1280  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1281  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1282  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1283  MachineInstr *InsMI =
1284  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1285  .addReg(InRegLEA, RegState::Define, SubReg)
1286  .addReg(Src, getKillRegState(IsKill));
1287 
1288  MachineInstrBuilder MIB =
1289  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1290  switch (MIOpc) {
1291  default: llvm_unreachable("Unreachable!");
1292  case X86::SHL8ri:
1293  case X86::SHL16ri: {
1294  unsigned ShAmt = MI.getOperand(2).getImm();
1295  MIB.addReg(0).addImm(1ULL << ShAmt)
1296  .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
1297  break;
1298  }
1299  case X86::INC8r:
1300  case X86::INC16r:
1301  addRegOffset(MIB, InRegLEA, true, 1);
1302  break;
1303  case X86::DEC8r:
1304  case X86::DEC16r:
1305  addRegOffset(MIB, InRegLEA, true, -1);
1306  break;
1307  case X86::ADD8ri:
1308  case X86::ADD8ri_DB:
1309  case X86::ADD16ri:
1310  case X86::ADD16ri8:
1311  case X86::ADD16ri_DB:
1312  case X86::ADD16ri8_DB:
1313  addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1314  break;
1315  case X86::ADD8rr:
1316  case X86::ADD8rr_DB:
1317  case X86::ADD16rr:
1318  case X86::ADD16rr_DB: {
1319  Register Src2 = MI.getOperand(2).getReg();
1320  bool IsKill2 = MI.getOperand(2).isKill();
1321  assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1322  unsigned InRegLEA2 = 0;
1323  MachineInstr *InsMI2 = nullptr;
1324  if (Src == Src2) {
1325  // ADD8rr/ADD16rr killed %reg1028, %reg1028
1326  // just a single insert_subreg.
1327  addRegReg(MIB, InRegLEA, true, InRegLEA, false);
1328  } else {
1329  if (Subtarget.is64Bit())
1330  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1331  else
1332  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1333  // Build and insert into an implicit UNDEF value. This is OK because
1334  // we will be shifting and then extracting the lower 8/16-bits.
1335  BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2);
1336  InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1337  .addReg(InRegLEA2, RegState::Define, SubReg)
1338  .addReg(Src2, getKillRegState(IsKill2));
1339  addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
1340  }
1341  if (LV && IsKill2 && InsMI2)
1342  LV->replaceKillInstruction(Src2, MI, *InsMI2);
1343  break;
1344  }
1345  }
1346 
1347  MachineInstr *NewMI = MIB;
1348  MachineInstr *ExtMI =
1349  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1351  .addReg(OutRegLEA, RegState::Kill, SubReg);
1352 
1353  if (LV) {
1354  // Update live variables.
1355  LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1356  LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1357  if (IsKill)
1358  LV->replaceKillInstruction(Src, MI, *InsMI);
1359  if (IsDead)
1360  LV->replaceKillInstruction(Dest, MI, *ExtMI);
1361  }
1362 
1363  return ExtMI;
1364 }
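// Rough sketch of the sequence emitted for "ADD16ri %dst, %src, 5" (64-bit
// targets only; names illustrative):
//   %in:gr64_nosp = IMPLICIT_DEF
//   %in.sub_16bit = COPY %src
//   %out:gr32 = LEA64_32r %in, 1, $noreg, 5, $noreg
//   %dst = COPY %out.sub_16bit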
1365 
1366 /// This method must be implemented by targets that
1367 /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1368 /// may be able to convert a two-address instruction into a true
1369 /// three-address instruction on demand. This allows the X86 target (for
1370 /// example) to convert ADD and SHL instructions into LEA instructions if they
1371 /// would require register copies due to two-addressness.
1372 ///
1373 /// This method returns a null pointer if the transformation cannot be
1374 /// performed, otherwise it returns the new instruction.
1375 ///
1376 MachineInstr *
1378  MachineInstr &MI, LiveVariables *LV) const {
1379  // The following opcodes also set the condition code register(s). Only
1380  // convert them to an equivalent LEA if the condition code register defs
1381  // are dead!
1382  if (hasLiveCondCodeDef(MI))
1383  return nullptr;
1384 
1385  MachineFunction &MF = *MI.getParent()->getParent();
1386  // All input instructions are two-address instructions. Get the known operands.
1387  const MachineOperand &Dest = MI.getOperand(0);
1388  const MachineOperand &Src = MI.getOperand(1);
1389 
1390  // Ideally, operations with undef should be folded before we get here, but we
1391  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1392  // Without this, we have to forward undef state to new register operands to
1393  // avoid machine verifier errors.
1394  if (Src.isUndef())
1395  return nullptr;
1396  if (MI.getNumOperands() > 2)
1397  if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1398  return nullptr;
1399 
1400  MachineInstr *NewMI = nullptr;
1401  bool Is64Bit = Subtarget.is64Bit();
1402 
1403  bool Is8BitOp = false;
1404  unsigned MIOpc = MI.getOpcode();
1405  switch (MIOpc) {
1406  default: llvm_unreachable("Unreachable!");
1407  case X86::SHL64ri: {
1408  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1409  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1410  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1411 
1412  // LEA can't handle RSP.
1413  if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1414  Src.getReg(), &X86::GR64_NOSPRegClass))
1415  return nullptr;
1416 
1417  NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1418  .add(Dest)
1419  .addReg(0)
1420  .addImm(1ULL << ShAmt)
1421  .add(Src)
1422  .addImm(0)
1423  .addReg(0);
1424  break;
1425  }
1426  case X86::SHL32ri: {
1427  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1428  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1429  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1430 
1431  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1432 
1433  // LEA can't handle ESP.
1434  bool isKill;
1435  Register SrcReg;
1436  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1437  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
1438  SrcReg, isKill, ImplicitOp, LV))
1439  return nullptr;
1440 
1441  MachineInstrBuilder MIB =
1442  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1443  .add(Dest)
1444  .addReg(0)
1445  .addImm(1ULL << ShAmt)
1446  .addReg(SrcReg, getKillRegState(isKill))
1447  .addImm(0)
1448  .addReg(0);
1449  if (ImplicitOp.getReg() != 0)
1450  MIB.add(ImplicitOp);
1451  NewMI = MIB;
1452 
1453  break;
1454  }
1455  case X86::SHL8ri:
1456  Is8BitOp = true;
1458  case X86::SHL16ri: {
1459  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1460  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1461  if (!isTruncatedShiftCountForLEA(ShAmt))
1462  return nullptr;
1463  return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1464  }
1465  case X86::INC64r:
1466  case X86::INC32r: {
1467  assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1468  unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
1469  (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1470  bool isKill;
1471  Register SrcReg;
1472  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1473  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
1474  ImplicitOp, LV))
1475  return nullptr;
1476 
1477  MachineInstrBuilder MIB =
1478  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1479  .add(Dest)
1480  .addReg(SrcReg, getKillRegState(isKill));
1481  if (ImplicitOp.getReg() != 0)
1482  MIB.add(ImplicitOp);
1483 
1484  NewMI = addOffset(MIB, 1);
1485  break;
1486  }
1487  case X86::DEC64r:
1488  case X86::DEC32r: {
1489  assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1490  unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
1491  : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1492 
1493  bool isKill;
1494  Register SrcReg;
1495  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1496  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
1497  ImplicitOp, LV))
1498  return nullptr;
1499 
1500  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1501  .add(Dest)
1502  .addReg(SrcReg, getKillRegState(isKill));
1503  if (ImplicitOp.getReg() != 0)
1504  MIB.add(ImplicitOp);
1505 
1506  NewMI = addOffset(MIB, -1);
1507 
1508  break;
1509  }
1510  case X86::DEC8r:
1511  case X86::INC8r:
1512  Is8BitOp = true;
1514  case X86::DEC16r:
1515  case X86::INC16r:
1516  return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1517  case X86::ADD64rr:
1518  case X86::ADD64rr_DB:
1519  case X86::ADD32rr:
1520  case X86::ADD32rr_DB: {
1521  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1522  unsigned Opc;
1523  if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1524  Opc = X86::LEA64r;
1525  else
1526  Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1527 
1528  bool isKill;
1529  Register SrcReg;
1530  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1531  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1532  SrcReg, isKill, ImplicitOp, LV))
1533  return nullptr;
1534 
1535  const MachineOperand &Src2 = MI.getOperand(2);
1536  bool isKill2;
1537  Register SrcReg2;
1538  MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1539  if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
1540  SrcReg2, isKill2, ImplicitOp2, LV))
1541  return nullptr;
1542 
1543  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1544  if (ImplicitOp.getReg() != 0)
1545  MIB.add(ImplicitOp);
1546  if (ImplicitOp2.getReg() != 0)
1547  MIB.add(ImplicitOp2);
1548 
1549  NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1550  if (LV && Src2.isKill())
1551  LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
1552  break;
1553  }
1554  case X86::ADD8rr:
1555  case X86::ADD8rr_DB:
1556  Is8BitOp = true;
1558  case X86::ADD16rr:
1559  case X86::ADD16rr_DB:
1560  return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1561  case X86::ADD64ri32:
1562  case X86::ADD64ri8:
1563  case X86::ADD64ri32_DB:
1564  case X86::ADD64ri8_DB:
1565  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1566  NewMI = addOffset(
1567  BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1568  MI.getOperand(2));
1569  break;
1570  case X86::ADD32ri:
1571  case X86::ADD32ri8:
1572  case X86::ADD32ri_DB:
1573  case X86::ADD32ri8_DB: {
1574  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1575  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1576 
1577  bool isKill;
1578  Register SrcReg;
1579  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1580  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1581  SrcReg, isKill, ImplicitOp, LV))
1582  return nullptr;
1583 
1584  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1585  .add(Dest)
1586  .addReg(SrcReg, getKillRegState(isKill));
1587  if (ImplicitOp.getReg() != 0)
1588  MIB.add(ImplicitOp);
1589 
1590  NewMI = addOffset(MIB, MI.getOperand(2));
1591  break;
1592  }
1593  case X86::ADD8ri:
1594  case X86::ADD8ri_DB:
1595  Is8BitOp = true;
1597  case X86::ADD16ri:
1598  case X86::ADD16ri8:
1599  case X86::ADD16ri_DB:
1600  case X86::ADD16ri8_DB:
1601  return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1602  case X86::SUB8ri:
1603  case X86::SUB16ri8:
1604  case X86::SUB16ri:
1605  /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1606  return nullptr;
1607  case X86::SUB32ri8:
1608  case X86::SUB32ri: {
1609  if (!MI.getOperand(2).isImm())
1610  return nullptr;
1611  int64_t Imm = MI.getOperand(2).getImm();
1612  if (!isInt<32>(-Imm))
1613  return nullptr;
1614 
1615  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1616  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1617 
1618  bool isKill;
1619  Register SrcReg;
1620  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1621  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1622  SrcReg, isKill, ImplicitOp, LV))
1623  return nullptr;
1624 
1625  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1626  .add(Dest)
1627  .addReg(SrcReg, getKillRegState(isKill));
1628  if (ImplicitOp.getReg() != 0)
1629  MIB.add(ImplicitOp);
1630 
1631  NewMI = addOffset(MIB, -Imm);
1632  break;
1633  }
1634 
1635  case X86::SUB64ri8:
1636  case X86::SUB64ri32: {
1637  if (!MI.getOperand(2).isImm())
1638  return nullptr;
1639  int64_t Imm = MI.getOperand(2).getImm();
1640  if (!isInt<32>(-Imm))
1641  return nullptr;
1642 
1643  assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1644 
1645  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
1646  get(X86::LEA64r)).add(Dest).add(Src);
1647  NewMI = addOffset(MIB, -Imm);
1648  break;
1649  }
1650 
1651  case X86::VMOVDQU8Z128rmk:
1652  case X86::VMOVDQU8Z256rmk:
1653  case X86::VMOVDQU8Zrmk:
1654  case X86::VMOVDQU16Z128rmk:
1655  case X86::VMOVDQU16Z256rmk:
1656  case X86::VMOVDQU16Zrmk:
1657  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
1658  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
1659  case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
1660  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
1661  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
1662  case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
1663  case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
1664  case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
1665  case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
1666  case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
1667  case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
1668  case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
1669  case X86::VBROADCASTSDZ256rmk:
1670  case X86::VBROADCASTSDZrmk:
1671  case X86::VBROADCASTSSZ128rmk:
1672  case X86::VBROADCASTSSZ256rmk:
1673  case X86::VBROADCASTSSZrmk:
1674  case X86::VPBROADCASTDZ128rmk:
1675  case X86::VPBROADCASTDZ256rmk:
1676  case X86::VPBROADCASTDZrmk:
1677  case X86::VPBROADCASTQZ128rmk:
1678  case X86::VPBROADCASTQZ256rmk:
1679  case X86::VPBROADCASTQZrmk: {
1680  unsigned Opc;
1681  switch (MIOpc) {
1682  default: llvm_unreachable("Unreachable!");
1683  case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
1684  case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
1685  case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
1686  case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
1687  case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
1688  case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
1689  case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1690  case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1691  case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1692  case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1693  case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1694  case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1695  case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1696  case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1697  case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1698  case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1699  case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1700  case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1701  case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1702  case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1703  case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1704  case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1705  case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1706  case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1707  case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1708  case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1709  case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1710  case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1711  case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1712  case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1713  case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
1714  case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break;
1715  case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
1716  case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
1717  case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break;
1718  case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
1719  case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
1720  case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break;
1721  case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
1722  case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
1723  case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break;
1724  }
1725 
1726  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1727  .add(Dest)
1728  .add(MI.getOperand(2))
1729  .add(Src)
1730  .add(MI.getOperand(3))
1731  .add(MI.getOperand(4))
1732  .add(MI.getOperand(5))
1733  .add(MI.getOperand(6))
1734  .add(MI.getOperand(7));
1735  break;
1736  }
1737 
1738  case X86::VMOVDQU8Z128rrk:
1739  case X86::VMOVDQU8Z256rrk:
1740  case X86::VMOVDQU8Zrrk:
1741  case X86::VMOVDQU16Z128rrk:
1742  case X86::VMOVDQU16Z256rrk:
1743  case X86::VMOVDQU16Zrrk:
1744  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
1745  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
1746  case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
1747  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
1748  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
1749  case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
1750  case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
1751  case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
1752  case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
1753  case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
1754  case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
1755  case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
1756  unsigned Opc;
1757  switch (MIOpc) {
1758  default: llvm_unreachable("Unreachable!");
1759  case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
1760  case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
1761  case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
1762  case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
1763  case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
1764  case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
1765  case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1766  case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1767  case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1768  case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1769  case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1770  case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1771  case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1772  case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1773  case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1774  case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1775  case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1776  case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1777  case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1778  case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1779  case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1780  case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1781  case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1782  case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1783  case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1784  case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1785  case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1786  case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1787  case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1788  case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1789  }
1790 
1791  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1792  .add(Dest)
1793  .add(MI.getOperand(2))
1794  .add(Src)
1795  .add(MI.getOperand(3));
1796  break;
1797  }
1798  }
1799 
1800  if (!NewMI) return nullptr;
1801 
1802  if (LV) { // Update live variables
1803  if (Src.isKill())
1804  LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
1805  if (Dest.isDead())
1806  LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
1807  }
1808 
1809  MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
1810  return NewMI;
1811 }
1812 
1813 /// This determines which of the three possible cases of a three-source commute
1814 /// the source indices correspond to, taking any mask operands into account.
1815 /// None of the cases allows commuting the pass-through operand; the function
1816 /// asserts if the given indices do not match any case.
1817 /// Case 0 - Possible to commute the first and second operands.
1818 /// Case 1 - Possible to commute the first and third operands.
1819 /// Case 2 - Possible to commute the second and third operands.
1820 static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
1821  unsigned SrcOpIdx2) {
1822  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
1823  if (SrcOpIdx1 > SrcOpIdx2)
1824  std::swap(SrcOpIdx1, SrcOpIdx2);
1825 
1826  unsigned Op1 = 1, Op2 = 2, Op3 = 3;
1827  if (X86II::isKMasked(TSFlags)) {
1828  Op2++;
1829  Op3++;
1830  }
1831 
1832  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
1833  return 0;
1834  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
1835  return 1;
1836  if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
1837  return 2;
1838  llvm_unreachable("Unknown three src commute case.");
1839 }
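// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] A
// standalone model of the index mapping above, showing how the commutable
// source positions shift from 1/2/3 to 1/3/4 when a k-mask operand sits at
// index 2. The helper below is hypothetical; compile it as its own
// translation unit (e.g. with -std=c++17) to run the checks.
#include <cassert>

static unsigned threeSrcCommuteCaseModel(bool KMasked, unsigned Idx1,
                                         unsigned Idx2) {
  if (Idx1 > Idx2) {
    unsigned Tmp = Idx1;
    Idx1 = Idx2;
    Idx2 = Tmp;
  }
  unsigned Op1 = 1, Op2 = KMasked ? 3 : 2, Op3 = KMasked ? 4 : 3;
  if (Idx1 == Op1 && Idx2 == Op2)
    return 0;
  if (Idx1 == Op1 && Idx2 == Op3)
    return 1;
  assert(Idx1 == Op2 && Idx2 == Op3 && "unknown three-source commute case");
  return 2;
}

static void threeSrcCommuteCaseDemo() {
  assert(threeSrcCommuteCaseModel(false, 1, 2) == 0); // unmasked: sources 1 and 2
  assert(threeSrcCommuteCaseModel(true, 1, 3) == 0);  // masked: same pair, shifted
  assert(threeSrcCommuteCaseModel(true, 4, 3) == 2);  // index order does not matter
}
// [End of editor's sketch.]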
1840 
1841 unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
1842  const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
1843  const X86InstrFMA3Group &FMA3Group) const {
1844 
1845  unsigned Opc = MI.getOpcode();
1846 
1847  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
1848  // analysis. The commute optimization is legal only if all users of FMA*_Int
1849  // use only the lowest element of the FMA*_Int instruction. Such an analysis
1850  // is not implemented yet, so commuting operand 1 is simply rejected here.
1851  // When such an analysis becomes available, this will be the right place to
1852  // call it.
1853  assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
1854  "Intrinsic instructions can't commute operand 1");
1855 
1856  // Determine which case this commute is or if it can't be done.
1857  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1858  SrcOpIdx2);
1859  assert(Case < 3 && "Unexpected case number!");
1860 
1861  // Define the FMA forms mapping array that helps to map input FMA form
1862  // to output FMA form to preserve the operation semantics after
1863  // commuting the operands.
1864  const unsigned Form132Index = 0;
1865  const unsigned Form213Index = 1;
1866  const unsigned Form231Index = 2;
1867  static const unsigned FormMapping[][3] = {
1868  // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
1869  // FMA132 A, C, b; ==> FMA231 C, A, b;
1870  // FMA213 B, A, c; ==> FMA213 A, B, c;
1871  // FMA231 C, A, b; ==> FMA132 A, C, b;
1872  { Form231Index, Form213Index, Form132Index },
1873  // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
1874  // FMA132 A, c, B; ==> FMA132 B, c, A;
1875  // FMA213 B, a, C; ==> FMA231 C, a, B;
1876  // FMA231 C, a, B; ==> FMA213 B, a, C;
1877  { Form132Index, Form231Index, Form213Index },
1878  // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
1879  // FMA132 a, C, B; ==> FMA213 a, B, C;
1880  // FMA213 b, A, C; ==> FMA132 b, C, A;
1881  // FMA231 c, A, B; ==> FMA231 c, B, A;
1882  { Form213Index, Form132Index, Form231Index }
1883  };
1884 
1885  unsigned FMAForms[3];
1886  FMAForms[0] = FMA3Group.get132Opcode();
1887  FMAForms[1] = FMA3Group.get213Opcode();
1888  FMAForms[2] = FMA3Group.get231Opcode();
1889  unsigned FormIndex;
1890  for (FormIndex = 0; FormIndex < 3; FormIndex++)
1891  if (Opc == FMAForms[FormIndex])
1892  break;
1893 
1894  // Everything is ready, just adjust the FMA opcode and return it.
1895  FormIndex = FormMapping[Case][FormIndex];
1896  return FMAForms[FormIndex];
1897 }
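// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] A scalar
// model of the three FMA forms that checks the first FormMapping row
// numerically: commuting the first two sources is equivalent to keeping the
// operands in place and switching 132 <-> 231 (213 maps to itself because the
// multiply is commutative). The fma* helpers are hypothetical stand-ins for
// the real instructions; compile separately to run the checks.
#include <cassert>

// Operand 1 is the destination/source, operands 2 and 3 are the other sources.
static double fma132(double Op1, double Op2, double Op3) { return Op1 * Op3 + Op2; }
static double fma213(double Op1, double Op2, double Op3) { return Op2 * Op1 + Op3; }
static double fma231(double Op1, double Op2, double Op3) { return Op2 * Op3 + Op1; }

static void fmaFormMappingDemo() {
  double A = 2.0, B = 7.0, C = 5.0;
  // FMA132 A, C, b  ==>  FMA231 C, A, b
  assert(fma132(A, C, B) == fma231(C, A, B));
  // FMA213 B, A, c  ==>  FMA213 A, B, c
  assert(fma213(B, A, C) == fma213(A, B, C));
  // FMA231 C, A, b  ==>  FMA132 A, C, b
  assert(fma231(C, A, B) == fma132(A, C, B));
}
// [End of editor's sketch.]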
1898 
1899 static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
1900  unsigned SrcOpIdx2) {
1901  // Determine which case this commute is or if it can't be done.
1902  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1903  SrcOpIdx2);
1904  assert(Case < 3 && "Unexpected case value!");
1905 
1906  // For each case we need to swap two pairs of bits in the final immediate.
1907  static const uint8_t SwapMasks[3][4] = {
1908  { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
1909  { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
1910  { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
1911  };
1912 
1913  uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
1914  // Clear out the bits we are swapping.
1915  uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
1916  SwapMasks[Case][2] | SwapMasks[Case][3]);
1917  // If the immediate had a bit of the pair set, then set the opposite bit.
1918  if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
1919  if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
1920  if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
1921  if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
1922  MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
1923 }
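// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] A
// standalone check of the SwapMasks table above: for every 8-bit ternlog
// immediate and each of the three commute cases, the mask-based rewrite
// produces the same immediate as recomputing the truth table with the
// corresponding pair of inputs exchanged. Compile separately to run it.
#include <cassert>
#include <cstdint>
#include <utility>

// Reference: bit ((a << 2) | (b << 1) | c) of the immediate is the result for
// inputs (a, b, c); permute the inputs and rebuild the immediate.
static uint8_t ternlogSwapReference(uint8_t Imm, unsigned Case) {
  uint8_t New = 0;
  for (unsigned Idx = 0; Idx < 8; ++Idx) {
    unsigned A = (Idx >> 2) & 1, B = (Idx >> 1) & 1, C = Idx & 1;
    if (Case == 0)
      std::swap(A, B); // commute sources 1 and 2
    else if (Case == 1)
      std::swap(A, C); // commute sources 1 and 3
    else
      std::swap(B, C); // commute sources 2 and 3
    unsigned From = (A << 2) | (B << 1) | C;
    New |= ((Imm >> From) & 1) << Idx;
  }
  return New;
}

static uint8_t ternlogSwapWithMasks(uint8_t Imm, unsigned Case) {
  static const uint8_t SwapMasks[3][4] = {
      {0x04, 0x10, 0x08, 0x20}, {0x02, 0x10, 0x08, 0x40}, {0x02, 0x04, 0x20, 0x40}};
  uint8_t New = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
                        SwapMasks[Case][2] | SwapMasks[Case][3]);
  if (Imm & SwapMasks[Case][0]) New |= SwapMasks[Case][1];
  if (Imm & SwapMasks[Case][1]) New |= SwapMasks[Case][0];
  if (Imm & SwapMasks[Case][2]) New |= SwapMasks[Case][3];
  if (Imm & SwapMasks[Case][3]) New |= SwapMasks[Case][2];
  return New;
}

static void ternlogSwapDemo() {
  for (unsigned Case = 0; Case < 3; ++Case)
    for (unsigned Imm = 0; Imm < 256; ++Imm)
      assert(ternlogSwapWithMasks(uint8_t(Imm), Case) ==
             ternlogSwapReference(uint8_t(Imm), Case));
}
// [End of editor's sketch.]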
1924 
1925 // Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
1926 // commuted.
1927 static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
1928 #define VPERM_CASES(Suffix) \
1929  case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
1930  case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
1931  case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
1932  case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
1933  case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
1934  case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
1935  case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
1936  case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
1937  case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
1938  case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
1939  case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
1940  case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
1941 
1942 #define VPERM_CASES_BROADCAST(Suffix) \
1943  VPERM_CASES(Suffix) \
1944  case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
1945  case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
1946  case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
1947  case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
1948  case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
1949  case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
1950 
1951  switch (Opcode) {
1952  default: return false;
1953  VPERM_CASES(B)
1954  VPERM_CASES_BROADCAST(D)
1955  VPERM_CASES_BROADCAST(PD)
1956  VPERM_CASES_BROADCAST(PS)
1957  VPERM_CASES_BROADCAST(Q)
1958  VPERM_CASES(W)
1959  return true;
1960  }
1961 #undef VPERM_CASES_BROADCAST
1962 #undef VPERM_CASES
1963 }
1964 
1965 // Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
1966 // from the I opcode to the T opcode and vice versa.
1967 static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
1968 #define VPERM_CASES(Orig, New) \
1969  case X86::Orig##128rr: return X86::New##128rr; \
1970  case X86::Orig##128rrkz: return X86::New##128rrkz; \
1971  case X86::Orig##128rm: return X86::New##128rm; \
1972  case X86::Orig##128rmkz: return X86::New##128rmkz; \
1973  case X86::Orig##256rr: return X86::New##256rr; \
1974  case X86::Orig##256rrkz: return X86::New##256rrkz; \
1975  case X86::Orig##256rm: return X86::New##256rm; \
1976  case X86::Orig##256rmkz: return X86::New##256rmkz; \
1977  case X86::Orig##rr: return X86::New##rr; \
1978  case X86::Orig##rrkz: return X86::New##rrkz; \
1979  case X86::Orig##rm: return X86::New##rm; \
1980  case X86::Orig##rmkz: return X86::New##rmkz;
1981 
1982 #define VPERM_CASES_BROADCAST(Orig, New) \
1983  VPERM_CASES(Orig, New) \
1984  case X86::Orig##128rmb: return X86::New##128rmb; \
1985  case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
1986  case X86::Orig##256rmb: return X86::New##256rmb; \
1987  case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
1988  case X86::Orig##rmb: return X86::New##rmb; \
1989  case X86::Orig##rmbkz: return X86::New##rmbkz;
1990 
1991  switch (Opcode) {
1992  VPERM_CASES(VPERMI2B, VPERMT2B)
1993  VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
1994  VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
1995  VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
1996  VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
1997  VPERM_CASES(VPERMI2W, VPERMT2W)
1998  VPERM_CASES(VPERMT2B, VPERMI2B)
1999  VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2000  VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2001  VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2002  VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2003  VPERM_CASES(VPERMT2W, VPERMI2W)
2004  }
2005 
2006  llvm_unreachable("Unreachable!");
2007 #undef VPERM_CASES_BROADCAST
2008 #undef VPERM_CASES
2009 }
2010 
2011 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2012  unsigned OpIdx1,
2013  unsigned OpIdx2) const {
2014  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
2015  if (NewMI)
2016  return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
2017  return MI;
2018  };
2019 
2020  switch (MI.getOpcode()) {
2021  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
2022  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
2023  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
2024  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
2025  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
2026  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
2027  unsigned Opc;
2028  unsigned Size;
2029  switch (MI.getOpcode()) {
2030  default: llvm_unreachable("Unreachable!");
2031  case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
2032  case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
2033  case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
2034  case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
2035  case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
2036  case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
2037  }
2038  unsigned Amt = MI.getOperand(3).getImm();
2039  auto &WorkingMI = cloneIfNew(MI);
2040  WorkingMI.setDesc(get(Opc));
2041  WorkingMI.getOperand(3).setImm(Size - Amt);
2042  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2043  OpIdx1, OpIdx2);
2044  }
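// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] A scalar
// check of the double-shift rewrite used above: for a 16-bit double shift and
// a shift amount in 1..15, SHRD(B, C, I) equals SHLD(C, B, 16 - I) (and
// symmetrically for SHLD). The helpers are hypothetical; compile separately.
#include <cassert>
#include <cstdint>

static uint16_t shrd16(uint16_t Dst, uint16_t Src, unsigned Amt) {
  return uint16_t((Dst >> Amt) | (Src << (16 - Amt)));
}
static uint16_t shld16(uint16_t Dst, uint16_t Src, unsigned Amt) {
  return uint16_t((Dst << Amt) | (Src >> (16 - Amt)));
}

static void doubleShiftCommuteDemo() {
  const uint16_t B = 0x1234, C = 0xABCD;
  for (unsigned Amt = 1; Amt < 16; ++Amt) {
    assert(shrd16(B, C, Amt) == shld16(C, B, 16 - Amt));
    assert(shld16(B, C, Amt) == shrd16(C, B, 16 - Amt));
  }
}
// [End of editor's sketch.]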
2045  case X86::PFSUBrr:
2046  case X86::PFSUBRrr: {
2047  // PFSUB x, y: x = x - y
2048  // PFSUBR x, y: x = y - x
2049  unsigned Opc =
2050  (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
2051  auto &WorkingMI = cloneIfNew(MI);
2052  WorkingMI.setDesc(get(Opc));
2053  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2054  OpIdx1, OpIdx2);
2055  }
2056  case X86::BLENDPDrri:
2057  case X86::BLENDPSrri:
2058  case X86::VBLENDPDrri:
2059  case X86::VBLENDPSrri:
2060  // If we're optimizing for size, try to use MOVSD/MOVSS.
2061  if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
2062  unsigned Mask, Opc;
2063  switch (MI.getOpcode()) {
2064  default: llvm_unreachable("Unreachable!");
2065  case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
2066  case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
2067  case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
2068  case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
2069  }
2070  if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
2071  auto &WorkingMI = cloneIfNew(MI);
2072  WorkingMI.setDesc(get(Opc));
2073  WorkingMI.RemoveOperand(3);
2074  return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
2075  /*NewMI=*/false,
2076  OpIdx1, OpIdx2);
2077  }
2078  }
2079  LLVM_FALLTHROUGH;
2080  case X86::PBLENDWrri:
2081  case X86::VBLENDPDYrri:
2082  case X86::VBLENDPSYrri:
2083  case X86::VPBLENDDrri:
2084  case X86::VPBLENDWrri:
2085  case X86::VPBLENDDYrri:
2086  case X86::VPBLENDWYrri:{
2087  int8_t Mask;
2088  switch (MI.getOpcode()) {
2089  default: llvm_unreachable("Unreachable!");
2090  case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
2091  case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
2092  case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
2093  case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
2094  case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
2095  case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
2096  case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
2097  case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
2098  case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
2099  case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
2100  case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
2101  }
2102  // Only the least significant bits of Imm are used.
2103  // Using int8_t to ensure it will be sign extended to the int64_t that
2104  // setImm takes in order to match isel behavior.
2105  int8_t Imm = MI.getOperand(3).getImm() & Mask;
2106  auto &WorkingMI = cloneIfNew(MI);
2107  WorkingMI.getOperand(3).setImm(Mask ^ Imm);
2108  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2109  OpIdx1, OpIdx2);
2110  }
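// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] An
// element-wise model of the blend commute above: selecting with mask Imm from
// (Src1, Src2) gives the same vector as selecting with Imm ^ FullMask from
// (Src2, Src1). Four elements stand in for a BLENDPS/VPBLENDD-style blend;
// compile separately to run the check.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<int, 4> blend4(const std::array<int, 4> &Src1,
                                 const std::array<int, 4> &Src2, uint8_t Imm) {
  std::array<int, 4> Result{};
  for (unsigned I = 0; I < 4; ++I)
    Result[I] = (Imm & (1u << I)) ? Src2[I] : Src1[I]; // a set bit selects Src2
  return Result;
}

static void blendCommuteDemo() {
  const std::array<int, 4> A{1, 2, 3, 4}, B{5, 6, 7, 8};
  const uint8_t FullMask = 0x0F; // a four-element blend only uses the low 4 bits
  for (unsigned Imm = 0; Imm <= FullMask; ++Imm)
    assert(blend4(A, B, uint8_t(Imm)) == blend4(B, A, uint8_t(Imm ^ FullMask)));
}
// [End of editor's sketch.]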
2111  case X86::INSERTPSrr:
2112  case X86::VINSERTPSrr:
2113  case X86::VINSERTPSZrr: {
2114  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2115  unsigned ZMask = Imm & 15;
2116  unsigned DstIdx = (Imm >> 4) & 3;
2117  unsigned SrcIdx = (Imm >> 6) & 3;
2118 
2119  // We can commute insertps if we zero 2 of the elements, the insertion is
2120  // "inline" and we don't override the insertion with a zero.
2121  if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2122  countPopulation(ZMask) == 2) {
2123  unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
2124  assert(AltIdx < 4 && "Illegal insertion index");
2125  unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2126  auto &WorkingMI = cloneIfNew(MI);
2127  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2128  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2129  OpIdx1, OpIdx2);
2130  }
2131  return nullptr;
2132  }
2133  case X86::MOVSDrr:
2134  case X86::MOVSSrr:
2135  case X86::VMOVSDrr:
2136  case X86::VMOVSSrr:{
2137  // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2138  if (Subtarget.hasSSE41()) {
2139  unsigned Mask, Opc;
2140  switch (MI.getOpcode()) {
2141  default: llvm_unreachable("Unreachable!");
2142  case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
2143  case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
2144  case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
2145  case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
2146  }
2147 
2148  auto &WorkingMI = cloneIfNew(MI);
2149  WorkingMI.setDesc(get(Opc));
2150  WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
2151  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2152  OpIdx1, OpIdx2);
2153  }
2154 
2155  // Convert to SHUFPD.
2156  assert(MI.getOpcode() == X86::MOVSDrr &&
2157  "Can only commute MOVSDrr without SSE4.1");
2158 
2159  auto &WorkingMI = cloneIfNew(MI);
2160  WorkingMI.setDesc(get(X86::SHUFPDrri));
2161  WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
2162  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2163  OpIdx1, OpIdx2);
2164  }
2165  case X86::SHUFPDrri: {
2166  // Commute to MOVSD.
2167  assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2168  auto &WorkingMI = cloneIfNew(MI);
2169  WorkingMI.setDesc(get(X86::MOVSDrr));
2170  WorkingMI.RemoveOperand(3);
2171  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2172  OpIdx1, OpIdx2);
2173  }
2174  case X86::PCLMULQDQrr:
2175  case X86::VPCLMULQDQrr:
2176  case X86::VPCLMULQDQYrr:
2177  case X86::VPCLMULQDQZrr:
2178  case X86::VPCLMULQDQZ128rr:
2179  case X86::VPCLMULQDQZ256rr: {
2180  // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2181  // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2182  unsigned Imm = MI.getOperand(3).getImm();
2183  unsigned Src1Hi = Imm & 0x01;
2184  unsigned Src2Hi = Imm & 0x10;
2185  auto &WorkingMI = cloneIfNew(MI);
2186  WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2187  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2188  OpIdx1, OpIdx2);
2189  }
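// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] The
// PCLMULQDQ rewrite above only moves the two half-select bits: bit 0 picks the
// 64-bit half of the first source and bit 4 that of the second. Since the
// carry-less multiply itself is commutative, checking that the rewritten
// immediate selects the same (unordered) pair of halves after the sources are
// exchanged is enough. Hypothetical standalone helpers; compile separately.
#include <cassert>
#include <utility>

struct Halves128 { int Lo, Hi; }; // stand-ins for the two 64-bit halves

static std::pair<int, int> clmulSelect(Halves128 Src1, Halves128 Src2,
                                       unsigned Imm) {
  int FromSrc1 = (Imm & 0x01) ? Src1.Hi : Src1.Lo;
  int FromSrc2 = (Imm & 0x10) ? Src2.Hi : Src2.Lo;
  return {FromSrc1, FromSrc2};
}

static void clmulCommuteDemo() {
  const Halves128 A{10, 11}, B{20, 21};
  const unsigned Imms[] = {0x00, 0x01, 0x10, 0x11};
  for (unsigned Imm : Imms) {
    unsigned Src1Hi = Imm & 0x01, Src2Hi = Imm & 0x10;
    unsigned NewImm = (Src1Hi << 4) | (Src2Hi >> 4);
    auto Before = clmulSelect(A, B, Imm);
    auto After = clmulSelect(B, A, NewImm);
    assert(Before.first == After.second && Before.second == After.first);
  }
}
// [End of editor's sketch.]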
2190  case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
2191  case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
2192  case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
2193  case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
2194  case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
2195  case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
2196  case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
2197  case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
2198  case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
2199  case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
2200  case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
2201  case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
2202  case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
2203  case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
2204  case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
2205  case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
2206  case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
2207  case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
2208  case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
2209  case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
2210  case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
2211  case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
2212  case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
2213  case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
2214  // Flip comparison mode immediate (if necessary).
2215  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
2216  Imm = X86::getSwappedVPCMPImm(Imm);
2217  auto &WorkingMI = cloneIfNew(MI);
2218  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
2219  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2220  OpIdx1, OpIdx2);
2221  }
2222  case X86::VPCOMBri: case X86::VPCOMUBri:
2223  case X86::VPCOMDri: case X86::VPCOMUDri:
2224  case X86::VPCOMQri: case X86::VPCOMUQri:
2225  case X86::VPCOMWri: case X86::VPCOMUWri: {
2226  // Flip comparison mode immediate (if necessary).
2227  unsigned Imm = MI.getOperand(3).getImm() & 0x7;
2228  Imm = X86::getSwappedVPCOMImm(Imm);
2229  auto &WorkingMI = cloneIfNew(MI);
2230  WorkingMI.getOperand(3).setImm(Imm);
2231  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2232  OpIdx1, OpIdx2);
2233  }
2234  case X86::VCMPSDZrr:
2235  case X86::VCMPSSZrr:
2236  case X86::VCMPPDZrri:
2237  case X86::VCMPPSZrri:
2238  case X86::VCMPPDZ128rri:
2239  case X86::VCMPPSZ128rri:
2240  case X86::VCMPPDZ256rri:
2241  case X86::VCMPPSZ256rri:
2242  case X86::VCMPPDZrrik:
2243  case X86::VCMPPSZrrik:
2244  case X86::VCMPPDZ128rrik:
2245  case X86::VCMPPSZ128rrik:
2246  case X86::VCMPPDZ256rrik:
2247  case X86::VCMPPSZ256rrik: {
2248  unsigned Imm =
2249  MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
2250  Imm = X86::getSwappedVCMPImm(Imm);
2251  auto &WorkingMI = cloneIfNew(MI);
2252  WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
2253  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2254  OpIdx1, OpIdx2);
2255  }
2256  case X86::VPERM2F128rr:
2257  case X86::VPERM2I128rr: {
2258  // Flip permute source immediate.
2259  // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2260  // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2261  int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
2262  auto &WorkingMI = cloneIfNew(MI);
2263  WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
2264  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2265  OpIdx1, OpIdx2);
2266  }
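// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] A
// lane-level model of the VPERM2F128/VPERM2I128 rewrite above. Each 2-bit
// selector (Imm[1:0] for the low result lane, Imm[5:4] for the high one) uses
// its upper bit to pick the operand and its lower bit to pick that operand's
// lane, so swapping the operands and XOR-ing the immediate with 0x22 leaves
// the result unchanged. The zeroing bits (3 and 7) are not modelled here.
// Hypothetical standalone helpers; compile separately.
#include <cassert>
#include <cstdint>

struct Lanes256 { int Lo, Hi; }; // one int stands in for each 128-bit lane

static Lanes256 perm2x128(Lanes256 Op0, Lanes256 Op1, uint8_t Imm) {
  auto Pick = [&](unsigned Sel) {
    const Lanes256 &Src = (Sel & 2) ? Op1 : Op0;
    return (Sel & 1) ? Src.Hi : Src.Lo;
  };
  return {Pick(Imm & 3), Pick((Imm >> 4) & 3)};
}

static void perm2x128CommuteDemo() {
  const Lanes256 A{1, 2}, B{3, 4};
  for (unsigned Imm = 0; Imm < 0x40; ++Imm) {
    Lanes256 X = perm2x128(A, B, uint8_t(Imm));
    Lanes256 Y = perm2x128(B, A, uint8_t(Imm ^ 0x22));
    assert(X.Lo == Y.Lo && X.Hi == Y.Hi);
  }
}
// [End of editor's sketch.]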
2267  case X86::MOVHLPSrr:
2268  case X86::UNPCKHPDrr:
2269  case X86::VMOVHLPSrr:
2270  case X86::VUNPCKHPDrr:
2271  case X86::VMOVHLPSZrr:
2272  case X86::VUNPCKHPDZ128rr: {
2273  assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2274 
2275  unsigned Opc = MI.getOpcode();
2276  switch (Opc) {
2277  default: llvm_unreachable("Unreachable!");
2278  case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
2279  case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
2280  case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
2281  case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
2282  case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
2283  case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
2284  }
2285  auto &WorkingMI = cloneIfNew(MI);
2286  WorkingMI.setDesc(get(Opc));
2287  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2288  OpIdx1, OpIdx2);
2289  }
2290  case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
2291  auto &WorkingMI = cloneIfNew(MI);
2292  unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2293  X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2294  WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2295  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2296  OpIdx1, OpIdx2);
2297  }
2298  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2299  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2300  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2301  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2302  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2303  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2304  case X86::VPTERNLOGDZrrik:
2305  case X86::VPTERNLOGDZ128rrik:
2306  case X86::VPTERNLOGDZ256rrik:
2307  case X86::VPTERNLOGQZrrik:
2308  case X86::VPTERNLOGQZ128rrik:
2309  case X86::VPTERNLOGQZ256rrik:
2310  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2311  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2312  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2313  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2314  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2315  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2316  case X86::VPTERNLOGDZ128rmbi:
2317  case X86::VPTERNLOGDZ256rmbi:
2318  case X86::VPTERNLOGDZrmbi:
2319  case X86::VPTERNLOGQZ128rmbi:
2320  case X86::VPTERNLOGQZ256rmbi:
2321  case X86::VPTERNLOGQZrmbi:
2322  case X86::VPTERNLOGDZ128rmbikz:
2323  case X86::VPTERNLOGDZ256rmbikz:
2324  case X86::VPTERNLOGDZrmbikz:
2325  case X86::VPTERNLOGQZ128rmbikz:
2326  case X86::VPTERNLOGQZ256rmbikz:
2327  case X86::VPTERNLOGQZrmbikz: {
2328  auto &WorkingMI = cloneIfNew(MI);
2329  commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
2330  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2331  OpIdx1, OpIdx2);
2332  }
2333  default: {
2334  if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
2335  unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
2336  auto &WorkingMI = cloneIfNew(MI);
2337  WorkingMI.setDesc(get(Opc));
2338  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2339  OpIdx1, OpIdx2);
2340  }
2341 
2342  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2343  MI.getDesc().TSFlags);
2344  if (FMA3Group) {
2345  unsigned Opc =
2346  getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
2347  auto &WorkingMI = cloneIfNew(MI);
2348  WorkingMI.setDesc(get(Opc));
2349  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2350  OpIdx1, OpIdx2);
2351  }
2352 
2353  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2354  }
2355  }
2356 }
2357 
2358 bool
2359 X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2360  unsigned &SrcOpIdx1,
2361  unsigned &SrcOpIdx2,
2362  bool IsIntrinsic) const {
2363  uint64_t TSFlags = MI.getDesc().TSFlags;
2364 
2365  unsigned FirstCommutableVecOp = 1;
2366  unsigned LastCommutableVecOp = 3;
2367  unsigned KMaskOp = -1U;
2368  if (X86II::isKMasked(TSFlags)) {
2369  // For k-zero-masked operations it is OK to commute the first vector
2370  // operand, unless this is an intrinsic instruction.
2371  // For regular k-masked operations a conservative choice is done as the
2372  // elements of the first vector operand, for which the corresponding bit
2373  // in the k-mask operand is set to 0, are copied to the result of the
2374  // instruction.
2375  // TODO/FIXME: The commute still may be legal if it is known that the
2376  // k-mask operand is set to either all ones or all zeroes.
2377  // It is also Ok to commute the 1st operand if all users of MI use only
2378  // the elements enabled by the k-mask operand. For example,
2379  // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2380  // : v1[i];
2381  // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2382  // // Ok, to commute v1 in FMADD213PSZrk.
2383 
2384  // The k-mask operand has index = 2 for masked and zero-masked operations.
2385  KMaskOp = 2;
2386 
2387  // The operand with index = 1 is used as a source for those elements for
2388  // which the corresponding bit in the k-mask is set to 0.
2389  if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2390  FirstCommutableVecOp = 3;
2391 
2392  LastCommutableVecOp++;
2393  } else if (IsIntrinsic) {
2394  // Commuting the first operand of an intrinsic instruction isn't possible
2395  // unless we can prove that only the lowest element of the result is used.
2396  FirstCommutableVecOp = 2;
2397  }
2398 
2399  if (isMem(MI, LastCommutableVecOp))
2400  LastCommutableVecOp--;
2401 
2402  // Only the first RegOpsNum operands are commutable.
2403  // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2404  // that the operand is not specified/fixed.
2405  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2406  (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2407  SrcOpIdx1 == KMaskOp))
2408  return false;
2409  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2410  (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2411  SrcOpIdx2 == KMaskOp))
2412  return false;
2413 
2414  // Look for two different register operands assumed to be commutable
2415  // regardless of the FMA opcode. The FMA opcode is adjusted later.
2416  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2417  SrcOpIdx2 == CommuteAnyOperandIndex) {
2418  unsigned CommutableOpIdx2 = SrcOpIdx2;
2419 
2420  // At least one of the operands to be commuted is not specified, and
2421  // this method is free to choose appropriate commutable operands.
2422  if (SrcOpIdx1 == SrcOpIdx2)
2423  // Neither operand is fixed. By default, set one of the commutable
2424  // operands to the last register operand of the instruction.
2425  CommutableOpIdx2 = LastCommutableVecOp;
2426  else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2427  // Only one of operands is not fixed.
2428  CommutableOpIdx2 = SrcOpIdx1;
2429 
2430  // CommutableOpIdx2 is well defined now. Let's choose another commutable
2431  // operand and assign its index to CommutableOpIdx1.
2432  Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2433 
2434  unsigned CommutableOpIdx1;
2435  for (CommutableOpIdx1 = LastCommutableVecOp;
2436  CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2437  // Just ignore and skip the k-mask operand.
2438  if (CommutableOpIdx1 == KMaskOp)
2439  continue;
2440 
2441  // The commuted operands must have different registers.
2442  // Otherwise, the commute transformation does not change anything and
2443  // is useless then.
2444  if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2445  break;
2446  }
2447 
2448  // No appropriate commutable operands were found.
2449  if (CommutableOpIdx1 < FirstCommutableVecOp)
2450  return false;
2451 
2452  // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
2453  // to return those values.
2454  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2455  CommutableOpIdx1, CommutableOpIdx2))
2456  return false;
2457  }
2458 
2459  return true;
2460 }
2461 
2462 bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2463  unsigned &SrcOpIdx1,
2464  unsigned &SrcOpIdx2) const {
2465  const MCInstrDesc &Desc = MI.getDesc();
2466  if (!Desc.isCommutable())
2467  return false;
2468 
2469  switch (MI.getOpcode()) {
2470  case X86::CMPSDrr:
2471  case X86::CMPSSrr:
2472  case X86::CMPPDrri:
2473  case X86::CMPPSrri:
2474  case X86::VCMPSDrr:
2475  case X86::VCMPSSrr:
2476  case X86::VCMPPDrri:
2477  case X86::VCMPPSrri:
2478  case X86::VCMPPDYrri:
2479  case X86::VCMPPSYrri:
2480  case X86::VCMPSDZrr:
2481  case X86::VCMPSSZrr:
2482  case X86::VCMPPDZrri:
2483  case X86::VCMPPSZrri:
2484  case X86::VCMPPDZ128rri:
2485  case X86::VCMPPSZ128rri:
2486  case X86::VCMPPDZ256rri:
2487  case X86::VCMPPSZ256rri:
2488  case X86::VCMPPDZrrik:
2489  case X86::VCMPPSZrrik:
2490  case X86::VCMPPDZ128rrik:
2491  case X86::VCMPPSZ128rrik:
2492  case X86::VCMPPDZ256rrik:
2493  case X86::VCMPPSZ256rrik: {
2494  unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2495 
2496  // Float comparison can be safely commuted for
2497  // Ordered/Unordered/Equal/NotEqual tests
2498  unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2499  switch (Imm) {
2500  default:
2501  // EVEX versions can be commuted.
2502  if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2503  break;
2504  return false;
2505  case 0x00: // EQUAL
2506  case 0x03: // UNORDERED
2507  case 0x04: // NOT EQUAL
2508  case 0x07: // ORDERED
2509  break;
2510  }
2511 
2512  // The indices of the commutable operands are 1 and 2 (or 2 and 3
2513  // when masked).
2514  // Assign them to the returned operand indices here.
2515  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2516  2 + OpOffset);
2517  }
2518  case X86::MOVSSrr:
2519  // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2520  // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2521  // AVX implies sse4.1.
2522  if (Subtarget.hasSSE41())
2523  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2524  return false;
2525  case X86::SHUFPDrri:
2526  // We can commute this to MOVSD.
2527  if (MI.getOperand(3).getImm() == 0x02)
2528  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2529  return false;
2530  case X86::MOVHLPSrr:
2531  case X86::UNPCKHPDrr:
2532  case X86::VMOVHLPSrr:
2533  case X86::VUNPCKHPDrr:
2534  case X86::VMOVHLPSZrr:
2535  case X86::VUNPCKHPDZ128rr:
2536  if (Subtarget.hasSSE2())
2537  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2538  return false;
2539  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2540  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2541  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2542  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2543  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2544  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2545  case X86::VPTERNLOGDZrrik:
2546  case X86::VPTERNLOGDZ128rrik:
2547  case X86::VPTERNLOGDZ256rrik:
2548  case X86::VPTERNLOGQZrrik:
2549  case X86::VPTERNLOGQZ128rrik:
2550  case X86::VPTERNLOGQZ256rrik:
2551  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2552  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2553  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2554  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2555  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2556  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2557  case X86::VPTERNLOGDZ128rmbi:
2558  case X86::VPTERNLOGDZ256rmbi:
2559  case X86::VPTERNLOGDZrmbi:
2560  case X86::VPTERNLOGQZ128rmbi:
2561  case X86::VPTERNLOGQZ256rmbi:
2562  case X86::VPTERNLOGQZrmbi:
2563  case X86::VPTERNLOGDZ128rmbikz:
2564  case X86::VPTERNLOGDZ256rmbikz:
2565  case X86::VPTERNLOGDZrmbikz:
2566  case X86::VPTERNLOGQZ128rmbikz:
2567  case X86::VPTERNLOGQZ256rmbikz:
2568  case X86::VPTERNLOGQZrmbikz:
2569  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2570  case X86::VPDPWSSDYrr:
2571  case X86::VPDPWSSDrr:
2572  case X86::VPDPWSSDSYrr:
2573  case X86::VPDPWSSDSrr:
2574  case X86::VPDPWSSDZ128r:
2575  case X86::VPDPWSSDZ128rk:
2576  case X86::VPDPWSSDZ128rkz:
2577  case X86::VPDPWSSDZ256r:
2578  case X86::VPDPWSSDZ256rk:
2579  case X86::VPDPWSSDZ256rkz:
2580  case X86::VPDPWSSDZr:
2581  case X86::VPDPWSSDZrk:
2582  case X86::VPDPWSSDZrkz:
2583  case X86::VPDPWSSDSZ128r:
2584  case X86::VPDPWSSDSZ128rk:
2585  case X86::VPDPWSSDSZ128rkz:
2586  case X86::VPDPWSSDSZ256r:
2587  case X86::VPDPWSSDSZ256rk:
2588  case X86::VPDPWSSDSZ256rkz:
2589  case X86::VPDPWSSDSZr:
2590  case X86::VPDPWSSDSZrk:
2591  case X86::VPDPWSSDSZrkz:
2592  case X86::VPMADD52HUQZ128r:
2593  case X86::VPMADD52HUQZ128rk:
2594  case X86::VPMADD52HUQZ128rkz:
2595  case X86::VPMADD52HUQZ256r:
2596  case X86::VPMADD52HUQZ256rk:
2597  case X86::VPMADD52HUQZ256rkz:
2598  case X86::VPMADD52HUQZr:
2599  case X86::VPMADD52HUQZrk:
2600  case X86::VPMADD52HUQZrkz:
2601  case X86::VPMADD52LUQZ128r:
2602  case X86::VPMADD52LUQZ128rk:
2603  case X86::VPMADD52LUQZ128rkz:
2604  case X86::VPMADD52LUQZ256r:
2605  case X86::VPMADD52LUQZ256rk:
2606  case X86::VPMADD52LUQZ256rkz:
2607  case X86::VPMADD52LUQZr:
2608  case X86::VPMADD52LUQZrk:
2609  case X86::VPMADD52LUQZrkz: {
2610  unsigned CommutableOpIdx1 = 2;
2611  unsigned CommutableOpIdx2 = 3;
2612  if (X86II::isKMasked(Desc.TSFlags)) {
2613  // Skip the mask register.
2614  ++CommutableOpIdx1;
2615  ++CommutableOpIdx2;
2616  }
2617  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2618  CommutableOpIdx1, CommutableOpIdx2))
2619  return false;
2620  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2621  !MI.getOperand(SrcOpIdx2).isReg())
2622  // No idea.
2623  return false;
2624  return true;
2625  }
2626 
2627  default:
2628  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2629  MI.getDesc().TSFlags);
2630  if (FMA3Group)
2631  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
2632  FMA3Group->isIntrinsic());
2633 
2634  // Handle masked instructions, since we need to skip over the mask input
2635  // and the preserved input.
2636  if (X86II::isKMasked(Desc.TSFlags)) {
2637  // First assume that the first input is the mask operand and skip past it.
2638  unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
2639  unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
2640  // Check if the first input is tied. If there isn't one then we only
2641  // need to skip the mask operand which we did above.
2642  if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
2643  MCOI::TIED_TO) != -1)) {
2644  // If this is zero masking instruction with a tied operand, we need to
2645  // move the first index back to the first input since this must
2646  // be a 3 input instruction and we want the first two non-mask inputs.
2647  // Otherwise this is a 2 input instruction with a preserved input and
2648  // mask, so we need to move the indices to skip one more input.
2649  if (X86II::isKMergeMasked(Desc.TSFlags)) {
2650  ++CommutableOpIdx1;
2651  ++CommutableOpIdx2;
2652  } else {
2653  --CommutableOpIdx1;
2654  }
2655  }
2656 
2657  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2658  CommutableOpIdx1, CommutableOpIdx2))
2659  return false;
2660 
2661  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2662  !MI.getOperand(SrcOpIdx2).isReg())
2663  // No idea.
2664  return false;
2665  return true;
2666  }
2667 
2668  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2669  }
2670  return false;
2671 }
2672 
2673 static bool isConvertibleLEA(MachineInstr *MI) {
2674  unsigned Opcode = MI->getOpcode();
2675  if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
2676  Opcode != X86::LEA64_32r)
2677  return false;
2678 
2679  const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
2680  const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
2681  const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
2682 
2683  if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
2684  Scale.getImm() > 1)
2685  return false;
2686 
2687  return true;
2688 }
2689 
2690 bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
2691  // Currently we're interested in the following sequence only.
2692  // r3 = lea r1, r2
2693  // r5 = add r3, r4
2694  // Both r3 and r4 are killed in the add; we hope the add instruction has the
2695  // operand order
2696  // r5 = add r4, r3
2697  // so that later in X86FixupLEAs the lea instruction can be rewritten as add.
2698  unsigned Opcode = MI.getOpcode();
2699  if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
2700  return false;
2701 
2702  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2703  Register Reg1 = MI.getOperand(1).getReg();
2704  Register Reg2 = MI.getOperand(2).getReg();
2705 
2706  // Check if Reg1 comes from LEA in the same MBB.
2707  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
2708  if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2709  Commute = true;
2710  return true;
2711  }
2712  }
2713 
2714  // Check if Reg2 comes from LEA in the same MBB.
2715  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
2716  if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2717  Commute = false;
2718  return true;
2719  }
2720  }
2721 
2722  return false;
2723 }
2724 
2725 X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
2726  switch (MI.getOpcode()) {
2727  default: return X86::COND_INVALID;
2728  case X86::JCC_1:
2729  return static_cast<X86::CondCode>(
2730  MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
2731  }
2732 }
2733 
2734 /// Return condition code of a SETCC opcode.
2735 X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
2736  switch (MI.getOpcode()) {
2737  default: return X86::COND_INVALID;
2738  case X86::SETCCr: case X86::SETCCm:
2739  return static_cast<X86::CondCode>(
2740  MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
2741  }
2742 }
2743 
2744 /// Return condition code of a CMov opcode.
2745 X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
2746  switch (MI.getOpcode()) {
2747  default: return X86::COND_INVALID;
2748  case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr:
2749  case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm:
2750  return static_cast<X86::CondCode>(
2751  MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
2752  }
2753 }
2754 
2755 /// Return the inverse of the specified condition,
2756 /// e.g. turning COND_E to COND_NE.
2757 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
2758  switch (CC) {
2759  default: llvm_unreachable("Illegal condition code!");
2760  case X86::COND_E: return X86::COND_NE;
2761  case X86::COND_NE: return X86::COND_E;
2762  case X86::COND_L: return X86::COND_GE;
2763  case X86::COND_LE: return X86::COND_G;
2764  case X86::COND_G: return X86::COND_LE;
2765  case X86::COND_GE: return X86::COND_L;
2766  case X86::COND_B: return X86::COND_AE;
2767  case X86::COND_BE: return X86::COND_A;
2768  case X86::COND_A: return X86::COND_BE;
2769  case X86::COND_AE: return X86::COND_B;
2770  case X86::COND_S: return X86::COND_NS;
2771  case X86::COND_NS: return X86::COND_S;
2772  case X86::COND_P: return X86::COND_NP;
2773  case X86::COND_NP: return X86::COND_P;
2774  case X86::COND_O: return X86::COND_NO;
2775  case X86::COND_NO: return X86::COND_O;
2776  case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
2777  case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
2778  }
2779 }
2780 
2781 /// Assuming the flags are set by MI(a,b), return the condition code if we
2782 /// modify the instructions such that flags are set by MI(b,a).
2783 X86::CondCode X86::getSwappedCondition(X86::CondCode CC) {
2784  switch (CC) {
2785  default: return X86::COND_INVALID;
2786  case X86::COND_E: return X86::COND_E;
2787  case X86::COND_NE: return X86::COND_NE;
2788  case X86::COND_L: return X86::COND_G;
2789  case X86::COND_LE: return X86::COND_GE;
2790  case X86::COND_G: return X86::COND_L;
2791  case X86::COND_GE: return X86::COND_LE;
2792  case X86::COND_B: return X86::COND_A;
2793  case X86::COND_BE: return X86::COND_AE;
2794  case X86::COND_A: return X86::COND_B;
2795  case X86::COND_AE: return X86::COND_BE;
2796  }
2797 }
2798 
2799 std::pair<X86::CondCode, bool>
2800 X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
2801  X86::CondCode CC = X86::COND_INVALID;
2802  bool NeedSwap = false;
2803  switch (Predicate) {
2804  default: break;
2805  // Floating-point Predicates
2806  case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
2807  case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
2808  case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
2809  case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
2810  case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
2811  case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
2812  case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
2813  case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
2814  case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
2815  case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
2816  case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
2817  case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
2818  case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
2819  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
2820 
2821  // Integer Predicates
2822  case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
2823  case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
2824  case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
2825  case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
2826  case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
2827  case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
2828  case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
2829  case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
2830  case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
2831  case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
2832  }
2833 
2834  return std::make_pair(CC, NeedSwap);
2835 }
2836 
2837 /// Return a setcc opcode based on whether it has memory operand.
2838 unsigned X86::getSETOpc(bool HasMemoryOperand) {
2839  return HasMemoryOperand ? X86::SETCCr : X86::SETCCm;
2840 }
2841 
2842 /// Return a cmov opcode for the given register size in bytes, and operand type.
2843 unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
2844  switch(RegBytes) {
2845  default: llvm_unreachable("Illegal register size!");
2846  case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
2847  case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
2848  case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
2849  }
2850 }
2851 
2852 /// Get the VPCMP immediate for the given condition.
2853 unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
2854  switch (CC) {
2855  default: llvm_unreachable("Unexpected SETCC condition");
2856  case ISD::SETNE: return 4;
2857  case ISD::SETEQ: return 0;
2858  case ISD::SETULT:
2859  case ISD::SETLT: return 1;
2860  case ISD::SETUGT:
2861  case ISD::SETGT: return 6;
2862  case ISD::SETUGE:
2863  case ISD::SETGE: return 5;
2864  case ISD::SETULE:
2865  case ISD::SETLE: return 2;
2866  }
2867 }
2868 
2869 /// Get the VPCMP immediate if the operands are swapped.
2870 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
2871  switch (Imm) {
2872  default: llvm_unreachable("Unreachable!");
2873  case 0x01: Imm = 0x06; break; // LT -> NLE
2874  case 0x02: Imm = 0x05; break; // LE -> NLT
2875  case 0x05: Imm = 0x02; break; // NLT -> LE
2876  case 0x06: Imm = 0x01; break; // NLE -> LT
2877  case 0x00: // EQ
2878  case 0x03: // FALSE
2879  case 0x04: // NE
2880  case 0x07: // TRUE
2881  break;
2882  }
2883 
2884  return Imm;
2885 }
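// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] A scalar
// check of the swap table above, using the AVX-512 VPCMP predicate encoding
// implied by the comments (0=EQ, 1=LT, 2=LE, 3=FALSE, 4=NE, 5=NLT, 6=NLE,
// 7=TRUE): evaluating immediate Imm on (a, b) matches evaluating the swapped
// immediate on (b, a). Hypothetical standalone helpers; compile separately.
#include <cassert>

static bool vpcmpPredicate(unsigned Imm, int A, int B) {
  switch (Imm & 0x7) {
  case 0: return A == B; // EQ
  case 1: return A < B;  // LT
  case 2: return A <= B; // LE
  case 3: return false;  // FALSE
  case 4: return A != B; // NE
  case 5: return A >= B; // NLT
  case 6: return A > B;  // NLE
  default: return true;  // TRUE
  }
}

static void vpcmpSwapDemo() {
  static const unsigned Swapped[8] = {0, 6, 5, 3, 4, 2, 1, 7}; // as computed above
  for (unsigned Imm = 0; Imm < 8; ++Imm)
    for (int A = -1; A <= 1; ++A)
      for (int B = -1; B <= 1; ++B)
        assert(vpcmpPredicate(Imm, A, B) == vpcmpPredicate(Swapped[Imm], B, A));
}
// [End of editor's sketch.]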
2886 
2887 /// Get the VPCOM immediate if the operands are swapped.
2888 unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
2889  switch (Imm) {
2890  default: llvm_unreachable("Unreachable!");
2891  case 0x00: Imm = 0x02; break; // LT -> GT
2892  case 0x01: Imm = 0x03; break; // LE -> GE
2893  case 0x02: Imm = 0x00; break; // GT -> LT
2894  case 0x03: Imm = 0x01; break; // GE -> LE
2895  case 0x04: // EQ
2896  case 0x05: // NE
2897  case 0x06: // FALSE
2898  case 0x07: // TRUE
2899  break;
2900  }
2901 
2902  return Imm;
2903 }
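// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] The same
// exercise for the XOP VPCOM encoding implied by the comments above (0=LT,
// 1=LE, 2=GT, 3=GE, 4=EQ, 5=NE, 6=FALSE, 7=TRUE): the swapped immediate on
// reversed operands reproduces the original comparison. Compile separately.
#include <cassert>

static bool vpcomPredicate(unsigned Imm, int A, int B) {
  switch (Imm & 0x7) {
  case 0: return A < B;   // LT
  case 1: return A <= B;  // LE
  case 2: return A > B;   // GT
  case 3: return A >= B;  // GE
  case 4: return A == B;  // EQ
  case 5: return A != B;  // NE
  case 6: return false;   // FALSE
  default: return true;   // TRUE
  }
}

static void vpcomSwapDemo() {
  static const unsigned Swapped[8] = {2, 3, 0, 1, 4, 5, 6, 7}; // as computed above
  for (unsigned Imm = 0; Imm < 8; ++Imm)
    for (int A = -1; A <= 1; ++A)
      for (int B = -1; B <= 1; ++B)
        assert(vpcomPredicate(Imm, A, B) == vpcomPredicate(Swapped[Imm], B, A));
}
// [End of editor's sketch.]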
2904 
2905 /// Get the VCMP immediate if the operands are swapped.
2906 unsigned X86::getSwappedVCMPImm(unsigned Imm) {
2907  // Only need the lower 2 bits to distinguish.
2908  switch (Imm & 0x3) {
2909  default: llvm_unreachable("Unreachable!");
2910  case 0x00: case 0x03:
2911  // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
2912  break;
2913  case 0x01: case 0x02:
2914  // Need to toggle bits 3:0. Bit 4 stays the same.
2915  Imm ^= 0xf;
2916  break;
2917  }
2918 
2919  return Imm;
2920 }
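// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] For the
// two immediates the function above actually rewrites, the bit toggle maps LT
// (0x01) to 0x0E and LE (0x02) to 0x0D. Assuming the usual AVX predicate
// encoding, where 0x0D is GE and 0x0E is GT, a tiny scalar check on ordered
// inputs confirms that the swapped immediate on reversed operands reproduces
// the original comparison. Compile separately.
#include <cassert>

static bool vcmpOrderedPredicate(unsigned Imm, double A, double B) {
  switch (Imm) {
  case 0x01: return A < B;  // LT
  case 0x02: return A <= B; // LE
  case 0x0D: return A >= B; // GE (assumed encoding)
  case 0x0E: return A > B;  // GT (assumed encoding)
  default: assert(false && "not modelled in this sketch"); return false;
  }
}

static void vcmpSwapDemo() {
  const double Vals[] = {-1.0, 0.0, 2.5};
  const unsigned Imms[] = {0x01, 0x02};
  for (unsigned Imm : Imms) {
    unsigned Swapped = Imm ^ 0xF; // what getSwappedVCMPImm does for these values
    for (double A : Vals)
      for (double B : Vals)
        assert(vcmpOrderedPredicate(Imm, A, B) ==
               vcmpOrderedPredicate(Swapped, B, A));
  }
}
// [End of editor's sketch.]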
2921 
2922 bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
2923  switch (MI.getOpcode()) {
2924  case X86::TCRETURNdi:
2925  case X86::TCRETURNri:
2926  case X86::TCRETURNmi:
2927  case X86::TCRETURNdi64:
2928  case X86::TCRETURNri64:
2929  case X86::TCRETURNmi64:
2930  return true;
2931  default:
2932  return false;
2933  }
2934 }
2935 
2936 bool X86InstrInfo::canMakeTailCallConditional(
2937  SmallVectorImpl<MachineOperand> &BranchCond,
2938  const MachineInstr &TailCall) const {
2939  if (TailCall.getOpcode() != X86::TCRETURNdi &&
2940  TailCall.getOpcode() != X86::TCRETURNdi64) {
2941  // Only direct calls can be done with a conditional branch.
2942  return false;
2943  }
2944 
2945  const MachineFunction *MF = TailCall.getParent()->getParent();
2946  if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
2947  // Conditional tail calls confuse the Win64 unwinder.
2948  return false;
2949  }
2950 
2951  assert(BranchCond.size() == 1);
2952  if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
2953  // Can't make a conditional tail call with this condition.
2954  return false;
2955  }
2956 
2957  const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
2958  if (X86FI->getTCReturnAddrDelta() != 0 ||
2959  TailCall.getOperand(1).getImm() != 0) {
2960  // A conditional tail call cannot do any stack adjustment.
2961  return false;
2962  }
2963 
2964  return true;
2965 }
2966 
2967 void X86InstrInfo::replaceBranchWithTailCall(
2968  MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
2969  const MachineInstr &TailCall) const {
2970  assert(canMakeTailCallConditional(BranchCond, TailCall));
2971 
2972  MachineBasicBlock::iterator I = MBB.end();
2973  while (I != MBB.begin()) {
2974  --I;
2975  if (I->isDebugInstr())
2976  continue;
2977  if (!I->isBranch())
2978  assert(0 && "Can't find the branch to replace!");
2979 
2980  X86::CondCode CC = X86::getCondFromBranch(*I);
2981  assert(BranchCond.size() == 1);
2982  if (CC != BranchCond[0].getImm())
2983  continue;
2984 
2985  break;
2986  }
2987 
2988  unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
2989  : X86::TCRETURNdi64cc;
2990 
2991  auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
2992  MIB->addOperand(TailCall.getOperand(0)); // Destination.
2993  MIB.addImm(0); // Stack offset (not used).
2994  MIB->addOperand(BranchCond[0]); // Condition.
2995  MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
2996 
2997  // Add implicit uses and defs of all live regs potentially clobbered by the
2998  // call. This way they still appear live across the call.
2999  LivePhysRegs LiveRegs(getRegisterInfo());
3000  LiveRegs.addLiveOuts(MBB);
3001  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
3002  LiveRegs.stepForward(*MIB, Clobbers);
3003  for (const auto &C : Clobbers) {
3004  MIB.addReg(C.first, RegState::Implicit);
3005  MIB.addReg(C.first, RegState::Implicit | RegState::Define);
3006  }
3007 
3008  I->eraseFromParent();
3009 }
3010 
3011 // Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3012 // not be a fallthrough MBB now due to layout changes). Return nullptr if the
3013 // fallthrough MBB cannot be identified.
3014 static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
3015  MachineBasicBlock *TBB) {
3016  // Look for non-EHPad successors other than TBB. If we find exactly one, it
3017  // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3018  // and fallthrough MBB. If we find more than one, we cannot identify the
3019  // fallthrough MBB and should return nullptr.
3020  MachineBasicBlock *FallthroughBB = nullptr;
3021  for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
3022  if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
3023  continue;
3024  // Return a nullptr if we found more than one fallthrough successor.
3025  if (FallthroughBB && FallthroughBB != TBB)
3026  return nullptr;
3027  FallthroughBB = *SI;
3028  }
3029  return FallthroughBB;
3030 }
3031 
3032 bool X86InstrInfo::AnalyzeBranchImpl(
3033  MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
3034  SmallVectorImpl<MachineOperand> &Cond,
3035  SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3036 
3037  // Start from the bottom of the block and work up, examining the
3038  // terminator instructions.
3039  MachineBasicBlock::iterator I = MBB.end();
3040  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3041  while (I != MBB.begin()) {
3042  --I;
3043  if (I->isDebugInstr())
3044  continue;
3045 
3046  // Working from the bottom, when we see a non-terminator instruction, we're
3047  // done.
3048  if (!isUnpredicatedTerminator(*I))
3049  break;
3050 
3051  // A terminator that isn't a branch can't easily be handled by this
3052  // analysis.
3053  if (!I->isBranch())
3054  return true;
3055 
3056  // Handle unconditional branches.
3057  if (I->getOpcode() == X86::JMP_1) {
3058  UnCondBrIter = I;
3059 
3060  if (!AllowModify) {
3061  TBB = I->getOperand(0).getMBB();
3062  continue;
3063  }
3064 
3065  // If the block has any instructions after a JMP, delete them.
3066  while (std::next(I) != MBB.end())
3067  std::next(I)->eraseFromParent();
3068 
3069  Cond.clear();
3070  FBB = nullptr;
3071 
3072  // Delete the JMP if it's equivalent to a fall-through.
3073  if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3074  TBB = nullptr;
3075  I->eraseFromParent();
3076  I = MBB.end();
3077  UnCondBrIter = MBB.end();
3078  continue;
3079  }
3080 
3081  // TBB is used to indicate the unconditional destination.
3082  TBB = I->getOperand(0).getMBB();
3083  continue;
3084  }
3085 
3086  // Handle conditional branches.
3087  X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3088  if (BranchCode == X86::COND_INVALID)
3089  return true; // Can't handle indirect branch.
3090 
3091  // In practice we should never have an undef EFLAGS operand; if we do,
3092  // abort here, as we are not prepared to preserve the flag.
3093  if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
3094  return true;
3095 
3096  // Working from the bottom, handle the first conditional branch.
3097  if (Cond.empty()) {
3098  MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
3099  if (AllowModify && UnCondBrIter != MBB.end() &&
3100  MBB.isLayoutSuccessor(TargetBB)) {
3101  // If we can modify the code and it ends in something like:
3102  //
3103  // jCC L1
3104  // jmp L2
3105  // L1:
3106  // ...
3107  // L2:
3108  //
3109  // Then we can change this to:
3110  //
3111  // jnCC L2
3112  // L1:
3113  // ...
3114  // L2:
3115  //
3116  // Which is a bit more efficient.
3117  // We conditionally jump to the fall-through block.
3118  BranchCode = GetOppositeBranchCondition(BranchCode);
3119  MachineBasicBlock::iterator OldInst = I;
3120 
3121  BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1))
3122  .addMBB(UnCondBrIter->getOperand(0).getMBB())
3123  .addImm(BranchCode);
3124  BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
3125  .addMBB(TargetBB);
3126 
3127  OldInst->eraseFromParent();
3128  UnCondBrIter->eraseFromParent();
3129 
3130  // Restart the analysis.
3131  UnCondBrIter = MBB.end();
3132  I = MBB.end();
3133  continue;
3134  }
3135 
3136  FBB = TBB;
3137  TBB = I->getOperand(0).getMBB();
3138  Cond.push_back(MachineOperand::CreateImm(BranchCode));
3139  CondBranches.push_back(&*I);
3140  continue;
3141  }
3142 
3143  // Handle subsequent conditional branches. Only handle the case where all
3144  // conditional branches branch to the same destination and their condition
3145  // opcodes fit one of the special multi-branch idioms.
3146  assert(Cond.size() == 1);
3147  assert(TBB);
3148 
3149  // If the conditions are the same, we can leave them alone.
3150  X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3151  auto NewTBB = I->getOperand(0).getMBB();
3152  if (OldBranchCode == BranchCode && TBB == NewTBB)
3153  continue;
3154 
3155  // If they differ, see if they fit one of the known patterns. Theoretically,
3156  // we could handle more patterns here, but we shouldn't expect to see them
3157  // if instruction selection has done a reasonable job.
3158  if (TBB == NewTBB &&
3159  ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3160  (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3161  BranchCode = X86::COND_NE_OR_P;
3162  } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3163  (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3164  if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3165  return true;
3166 
3167  // X86::COND_E_AND_NP usually has two different branch destinations.
3168  //
3169  // JP B1
3170  // JE B2
3171  // JMP B1
3172  // B1:
3173  // B2:
3174  //
3175  // Here this condition branches to B2 only if NP && E. It has another
3176  // equivalent form:
3177  //
3178  // JNE B1
3179  // JNP B2
3180  // JMP B1
3181  // B1:
3182  // B2:
3183  //
3184  // Similarly it branches to B2 only if E && NP. That is why this condition
3185  // is named with COND_E_AND_NP.
3186  BranchCode = X86::COND_E_AND_NP;
3187  } else
3188  return true;
3189 
3190  // Update the MachineOperand.
3191  Cond[0].setImm(BranchCode);
3192  CondBranches.push_back(&*I);
3193  }
3194 
3195  return false;
3196 }
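// [Editor's note: illustrative sketch, not part of X86InstrInfo.cpp.] A small
// boolean model of the two COND_E_AND_NP encodings described in the comment
// above: with the intermediate jump back to B1, "JP B1; JE B2; JMP B1" and
// "JNE B1; JNP B2; JMP B1" reach B2 for exactly the same (E, P) flag
// combinations, namely E && !P. Compile separately to run the check.
#include <cassert>

enum class BranchTarget { B1, B2 };

static BranchTarget formJpJe(bool E, bool P) { // JP B1; JE B2; JMP B1
  if (P) return BranchTarget::B1;
  if (E) return BranchTarget::B2;
  return BranchTarget::B1;
}
static BranchTarget formJneJnp(bool E, bool P) { // JNE B1; JNP B2; JMP B1
  if (!E) return BranchTarget::B1;
  if (!P) return BranchTarget::B2;
  return BranchTarget::B1;
}

static void condEAndNPDemo() {
  const bool Flags[] = {false, true};
  for (bool E : Flags)
    for (bool P : Flags) {
      assert(formJpJe(E, P) == formJneJnp(E, P));
      assert((formJpJe(E, P) == BranchTarget::B2) == (E && !P));
    }
}
// [End of editor's sketch.]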
3197 
3198 bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
3199  MachineBasicBlock *&TBB,
3200  MachineBasicBlock *&FBB,
3201  SmallVectorImpl<MachineOperand> &Cond,
3202  bool AllowModify) const {
3203  SmallVector<MachineInstr *, 4> CondBranches;
3204  return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3205 }
3206 
3207 bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
3208  MachineBranchPredicate &MBP,
3209  bool AllowModify) const {
3210  using namespace std::placeholders;
3211 
3212  SmallVector<MachineOperand, 4> Cond;
3213  SmallVector<MachineInstr *, 4> CondBranches;
3214  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
3215  AllowModify))
3216  return true;
3217 
3218  if (Cond.size() != 1)
3219  return true;
3220 
3221  assert(MBP.TrueDest && "expected!");
3222 
3223  if (!MBP.FalseDest)
3224  MBP.FalseDest = MBB.getNextNode();
3225 
3226 
3227  const TargetRegisterInfo *TRI = &getRegisterInfo();
3228  MachineInstr *ConditionDef = nullptr;
3229  bool SingleUseCondition = true;
3230 
3231  for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
3232  if (I->modifiesRegister(X86::EFLAGS, TRI)) {
3233  ConditionDef = &*I;
3234  break;
3235  }
3236 
3237  if (I->readsRegister(X86::EFLAGS, TRI))
3238  SingleUseCondition = false;
3239  }
3240 
3241  if (!ConditionDef)
3242  return true;
3243 
3244  if (SingleUseCondition) {
3245  for (auto *Succ : MBB.successors())
3246  if (Succ->isLiveIn(X86::EFLAGS))
3247  SingleUseCondition = false;
3248  }
3249 
3250  MBP.ConditionDef = ConditionDef;
3251  MBP.SingleUseCondition = SingleUseCondition;
3252 
3253  // Currently we only recognize the simple pattern:
3254  //
3255  // test %reg, %reg
3256  // je %label
3257  //
3258  const unsigned TestOpcode =
3259  Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
3260 
3261  if (ConditionDef->getOpcode() == TestOpcode &&
3262  ConditionDef->getNumOperands() == 3 &&
3263  ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
3264  (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
3265  MBP.LHS = ConditionDef->getOperand(0);
3266  MBP.RHS = MachineOperand::CreateImm(0);
3267  MBP.Predicate = Cond[0].getImm() == X86::COND_NE
3268  ? MachineBranchPredicate::PRED_NE
3269  : MachineBranchPredicate::PRED_EQ;
3270  return false;
3271  }
3272 
3273  return true;
3274 }
3275 
3276 unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
3277  int *BytesRemoved) const {
3278  assert(!BytesRemoved && "code size not handled");
3279 
3280  MachineBasicBlock::iterator I = MBB.end();
3281  unsigned Count = 0;
3282 
3283  while (I != MBB.begin()) {
3284  --I;
3285  if (I->isDebugInstr())
3286  continue;
3287  if (I->getOpcode() != X86::JMP_1 &&
3288  X86::getCondFromBranch(*I) == X86::COND_INVALID)
3289  break;
3290  // Remove the branch.
3291  I->eraseFromParent();
3292  I = MBB.end();
3293  ++Count;
3294  }
3295 
3296  return Count;
3297 }
3298 
3299 unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
3300  MachineBasicBlock *TBB,
3301  MachineBasicBlock *FBB,
3302  ArrayRef<MachineOperand> Cond,
3303  const DebugLoc &DL,
3304  int *BytesAdded) const {
3305  // Shouldn't be a fall through.
3306  assert(TBB && "insertBranch must not be told to insert a fallthrough");
3307  assert((Cond.size() == 1 || Cond.size() == 0) &&
3308  "X86 branch conditions have one component!");
3309  assert(!BytesAdded && "code size not handled");
3310 
3311  if (Cond.empty()) {
3312  // Unconditional branch?
3313  assert(!FBB && "Unconditional branch with multiple successors!");
3314  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
3315  return 1;
3316  }
3317 
3318  // If FBB is null, it is implied to be a fall-through block.
3319  bool FallThru = FBB == nullptr;
3320 
3321  // Conditional branch.
3322  unsigned Count = 0;
3323  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
3324  switch (CC) {
3325  case X86::COND_NE_OR_P:
3326  // Synthesize NE_OR_P with two branches.
3327  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
3328  ++Count;
3329  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
3330  ++Count;
3331  break;
3332  case X86::COND_E_AND_NP:
3333  // Use the next block of MBB as FBB if it is null.
3334  if (FBB == nullptr) {
3335  FBB = getFallThroughMBB(&MBB, TBB);
3336  assert(FBB && "MBB cannot be the last block in function when the false "
3337  "body is a fall-through.");
3338  }
3339  // Synthesize COND_E_AND_NP with two branches.
3340  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
3341  ++Count;
3342  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
3343  ++Count;
3344  break;
3345  default: {
3346  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
3347  ++Count;
3348  }
3349  }
3350  if (!FallThru) {
3351  // Two-way conditional branch. Insert the second branch.
3352  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
3353  ++Count;
3354  }
3355  return Count;
3356 }
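
// Illustrative example (block names are made up): with Cond = { COND_NE_OR_P },
// TBB = %bb.target and FBB = %bb.other, the switch above emits
//   JCC_1 %bb.target, COND_NE
//   JCC_1 %bb.target, COND_P
//   JMP_1 %bb.other
// and returns a Count of 3.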
3357 
3360  Register DstReg, Register TrueReg,
3361  Register FalseReg, int &CondCycles,
3362  int &TrueCycles, int &FalseCycles) const {
3363  // Not all subtargets have cmov instructions.
3364  if (!Subtarget.hasCMov())
3365  return false;
3366  if (Cond.size() != 1)
3367  return false;
3368  // We cannot do the composite conditions, at least not in SSA form.
3369  if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
3370  return false;
3371 
3372  // Check register classes.
3374  const TargetRegisterClass *RC =
3375  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
3376  if (!RC)
3377  return false;
3378 
3379  // We have cmov instructions for 16-, 32-, and 64-bit general-purpose registers.
3380  if (X86::GR16RegClass.hasSubClassEq(RC) ||
3381  X86::GR32RegClass.hasSubClassEq(RC) ||
3382  X86::GR64RegClass.hasSubClassEq(RC)) {
3383  // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
3384  // Bridge. Probably Ivy Bridge as well.
3385  CondCycles = 2;
3386  TrueCycles = 2;
3387  FalseCycles = 2;
3388  return true;
3389  }
3390 
3391  // Can't do vectors.
3392  return false;
3393 }
3394 
3397  const DebugLoc &DL, Register DstReg,
3399  Register FalseReg) const {
3402  const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
3403  assert(Cond.size() == 1 && "Invalid Cond array");
3404  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
3405  false /*HasMemoryOperand*/);
3406  BuildMI(MBB, I, DL, get(Opc), DstReg)
3407  .addReg(FalseReg)
3408  .addReg(TrueReg)
3409  .addImm(Cond[0].getImm());
3410 }
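
// Illustrative example (register names are made up): selecting between two
// 32-bit virtual registers with Cond = { COND_NE } produces
//   %dst = CMOV32rr %false, %true, COND_NE
// i.e. the false value is listed first and the true value second, as built by
// the BuildMI call above.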
3411 
3412 /// Test if the given register is a physical H register.
3413 static bool isHReg(unsigned Reg) {
3414  return X86::GR8_ABCD_HRegClass.contains(Reg);
3415 }
3416 
3417 // Try to copy between VR128/VR64 and GR64 registers.
3418 static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
3419  const X86Subtarget &Subtarget) {
3420  bool HasAVX = Subtarget.hasAVX();
3421  bool HasAVX512 = Subtarget.hasAVX512();
3422 
3423  // SrcReg(MaskReg) -> DestReg(GR64)
3424  // SrcReg(MaskReg) -> DestReg(GR32)
3425 
3426  // All KMASK RegClasses hold the same k registers, so the test can be done against any one of them.
3427  if (X86::VK16RegClass.contains(SrcReg)) {
3428  if (X86::GR64RegClass.contains(DestReg)) {
3429  assert(Subtarget.hasBWI());
3430  return X86::KMOVQrk;
3431  }
3432  if (X86::GR32RegClass.contains(DestReg))
3433  return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
3434  }
3435 
3436  // SrcReg(GR64) -> DestReg(MaskReg)
3437  // SrcReg(GR32) -> DestReg(MaskReg)
3438 
3439  // All KMASK RegClasses hold the same k registers, so the test can be done against any one of them.
3440  if (X86::VK16RegClass.contains(DestReg)) {
3441  if (X86::GR64RegClass.contains(SrcReg)) {
3442  assert(Subtarget.hasBWI());
3443  return X86::KMOVQkr;
3444  }
3445  if (X86::GR32RegClass.contains(SrcReg))
3446  return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
3447  }
3448 
3449 
3450  // SrcReg(VR128) -> DestReg(GR64)
3451  // SrcReg(VR64) -> DestReg(GR64)
3452  // SrcReg(GR64) -> DestReg(VR128)
3453  // SrcReg(GR64) -> DestReg(VR64)
3454 
3455  if (X86::GR64RegClass.contains(DestReg)) {
3456  if (X86::VR128XRegClass.contains(SrcReg))
3457  // Copy from a VR128 register to a GR64 register.
3458  return HasAVX512 ? X86::VMOVPQIto64Zrr :
3459  HasAVX ? X86::VMOVPQIto64rr :
3460  X86::MOVPQIto64rr;
3461  if (X86::VR64RegClass.contains(SrcReg))
3462  // Copy from a VR64 register to a GR64 register.
3463  return X86::MMX_MOVD64from64rr;
3464  } else if (X86::GR64RegClass.contains(SrcReg)) {
3465  // Copy from a GR64 register to a VR128 register.
3466  if (X86::VR128XRegClass.contains(DestReg))
3467  return HasAVX512 ? X86::VMOV64toPQIZrr :
3468  HasAVX ? X86::VMOV64toPQIrr :
3469  X86::MOV64toPQIrr;
3470  // Copy from a GR64 register to a VR64 register.
3471  if (X86::VR64RegClass.contains(DestReg))
3472  return X86::MMX_MOVD64to64rr;
3473  }
3474 
3475  // SrcReg(VR128) -> DestReg(GR32)
3476  // SrcReg(GR32) -> DestReg(VR128)
3477 
3478  if (X86::GR32RegClass.contains(DestReg) &&
3479  X86::VR128XRegClass.contains(SrcReg))
3480  // Copy from a VR128 register to a GR32 register.
3481  return HasAVX512 ? X86::VMOVPDI2DIZrr :
3482  HasAVX ? X86::VMOVPDI2DIrr :
3483  X86::MOVPDI2DIrr;
3484 
3485  if (X86::VR128XRegClass.contains(DestReg) &&
3486  X86::GR32RegClass.contains(SrcReg))
3487  // Copy from a GR32 register to a VR128 register.
3488  return HasAVX512 ? X86::VMOVDI2PDIZrr :
3489  HasAVX ? X86::VMOVDI2PDIrr :
3490  X86::MOVDI2PDIrr;
3491  return 0;
3492 }
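
// Illustrative example: on a target with AVX but without AVX-512, a copy from
// XMM0 to RAX maps to VMOVPQIto64rr and the reverse copy from RAX to XMM0
// maps to VMOV64toPQIrr, per the GR64/VR128 cases above.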
3493 
3496  const DebugLoc &DL, MCRegister DestReg,
3497  MCRegister SrcReg, bool KillSrc) const {
3498  // First deal with the normal symmetric copies.
3499  bool HasAVX = Subtarget.hasAVX();
3500  bool HasVLX = Subtarget.hasVLX();
3501  unsigned Opc = 0;
3502  if (X86::GR64RegClass.contains(DestReg, SrcReg))
3503  Opc = X86::MOV64rr;
3504  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
3505  Opc = X86::MOV32rr;
3506  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
3507  Opc = X86::MOV16rr;
3508  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
3509  // Copying to or from a physical H register on x86-64 requires a NOREX
3510  // move. Otherwise use a normal move.
3511  if ((isHReg(DestReg) || isHReg(SrcReg)) &&
3512  Subtarget.is64Bit()) {
3513  Opc = X86::MOV8rr_NOREX;
3514  // Both operands must be encodable without a REX prefix.
3515  assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
3516  "8-bit H register can not be copied outside GR8_NOREX");
3517  } else
3518  Opc = X86::MOV8rr;
3519  }
3520  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
3521  Opc = X86::MMX_MOVQ64rr;
3522  else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
3523  if (HasVLX)
3524  Opc = X86::VMOVAPSZ128rr;
3525  else if (X86::VR128RegClass.contains(DestReg, SrcReg))
3526  Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
3527  else {
3528  // If this is an extended register and we don't have VLX, we need to use a
3529  // 512-bit move.
3530  Opc = X86::VMOVAPSZrr;
3532  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
3533  &X86::VR512RegClass);
3534  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
3535  &X86::VR512RegClass);
3536  }
3537  } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
3538  if (HasVLX)
3539  Opc = X86::VMOVAPSZ256rr;
3540  else if (X86::VR256RegClass.contains(DestReg, SrcReg))
3541  Opc = X86::VMOVAPSYrr;
3542  else {
3543  // If this is an extended register and we don't have VLX, we need to use a
3544  // 512-bit move.
3545  Opc = X86::VMOVAPSZrr;
3547  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
3548  &X86::VR512RegClass);
3549  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
3550  &X86::VR512RegClass);
3551  }
3552  } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
3553  Opc = X86::VMOVAPSZrr;
3554  // All KMASK RegClasses hold the same k registers, so the test can be done against any one of them.
3555  else if (X86::VK16RegClass.contains(DestReg, SrcReg))
3556  Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
3557  if (!Opc)
3558  Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
3559 
3560  if (Opc) {
3561  BuildMI(MBB, MI, DL, get(Opc), DestReg)
3562  .addReg(SrcReg, getKillRegState(KillSrc));
3563  return;
3564  }
3565 
3566  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
3567  // FIXME: We use a fatal error here because historically LLVM has tried
3568  // to lower some of these physreg copies and we want to ensure we get
3569  // reasonable bug reports if someone encounters a case no other testing
3570  // found. This path should be removed after the LLVM 7 release.
3571  report_fatal_error("Unable to copy EFLAGS physical register!");
3572  }
3573 
3574  LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
3575  << RI.getName(DestReg) << '\n');
3576  report_fatal_error("Cannot emit physreg copy instruction");
3577 }
3578 
3581  if (MI.isMoveReg())
3582  return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
3583  return None;
3584 }
3585 
3587  const TargetRegisterClass *RC,
3588  bool IsStackAligned,
3589  const X86Subtarget &STI, bool load) {
3590  bool HasAVX = STI.hasAVX();
3591  bool HasAVX512 = STI.hasAVX512();
3592  bool HasVLX = STI.hasVLX();
3593 
3594  switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
3595  default:
3596  llvm_unreachable("Unknown spill size");
3597  case 1:
3598  assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
3599  if (STI.is64Bit())
3600  // Copying to or from a physical H register on x86-64 requires a NOREX
3601  // move. Otherwise use a normal move.
3602  if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
3603  return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
3604  return load ? X86::MOV8rm : X86::MOV8mr;
3605  case 2:
3606  if (X86::VK16RegClass.hasSubClassEq(RC))
3607  return load ? X86::KMOVWkm : X86::KMOVWmk;
3608  assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
3609  return load ? X86::MOV16rm : X86::MOV16mr;
3610  case 4:
3611  if (X86::GR32RegClass.hasSubClassEq(RC))
3612  return load ? X86::MOV32rm : X86::MOV32mr;
3613  if (X86::FR32XRegClass.hasSubClassEq(RC))
3614  return load ?
3615  (HasAVX512 ? X86::VMOVSSZrm_alt :
3616  HasAVX ? X86::VMOVSSrm_alt :
3617  X86::MOVSSrm_alt) :
3618  (HasAVX512 ? X86::VMOVSSZmr :
3619  HasAVX ? X86::VMOVSSmr :
3620  X86::MOVSSmr);
3621  if (X86::RFP32RegClass.hasSubClassEq(RC))
3622  return load ? X86::LD_Fp32m : X86::ST_Fp32m;
3623  if (X86::VK32RegClass.hasSubClassEq(RC)) {
3624  assert(STI.hasBWI() && "KMOVD requires BWI");
3625  return load ? X86::KMOVDkm : X86::KMOVDmk;
3626  }
3627  // All of these mask pair classes have the same spill size, so the same kind
3628  // of kmov instructions can be used with all of them.
3629  if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
3630  X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
3631  X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
3632  X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
3633  X86::VK16PAIRRegClass.hasSubClassEq(RC))
3634  return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
3635  llvm_unreachable("Unknown 4-byte regclass");
3636  case 8:
3637  if (X86::GR64RegClass.hasSubClassEq(RC))
3638  return load ? X86::MOV64rm : X86::MOV64mr;
3639  if (X86::FR64XRegClass.hasSubClassEq(RC))
3640  return load ?
3641  (HasAVX512 ? X86::VMOVSDZrm_alt :
3642  HasAVX ? X86::VMOVSDrm_alt :
3643  X86::MOVSDrm_alt) :
3644  (HasAVX512 ? X86::VMOVSDZmr :
3645  HasAVX ? X86::VMOVSDmr :
3646  X86::MOVSDmr);
3647  if (X86::VR64RegClass.hasSubClassEq(RC))
3648  return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
3649  if (X86::RFP64RegClass.hasSubClassEq(RC))
3650  return load ? X86::LD_Fp64m : X86::ST_Fp64m;
3651  if (X86::VK64RegClass.hasSubClassEq(RC)) {
3652  assert(STI.hasBWI() && "KMOVQ requires BWI");
3653  return load ? X86::KMOVQkm : X86::KMOVQmk;
3654  }
3655  llvm_unreachable("Unknown 8-byte regclass");
3656  case 10:
3657  assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
3658  return load ? X86::LD_Fp80m : X86::ST_FpP80m;
3659  case 16: {
3660  if (X86::VR128XRegClass.hasSubClassEq(RC)) {
3661  // If the stack is realigned we can use aligned stores.
3662  if (IsStackAligned)
3663  return load ?
3664  (HasVLX ? X86::VMOVAPSZ128rm :
3665  HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
3666  HasAVX ? X86::VMOVAPSrm :
3667  X86::MOVAPSrm):
3668  (HasVLX ? X86::VMOVAPSZ128mr :
3669  HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
3670  HasAVX ? X86::VMOVAPSmr :
3671  X86::MOVAPSmr);
3672  else
3673  return load ?
3674  (HasVLX ? X86::VMOVUPSZ128rm :
3675  HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
3676  HasAVX ? X86::VMOVUPSrm :
3677  X86::MOVUPSrm):
3678  (HasVLX ? X86::VMOVUPSZ128mr :
3679  HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
3680  HasAVX ? X86::VMOVUPSmr :
3681  X86::MOVUPSmr);
3682  }
3683  if (X86::BNDRRegClass.hasSubClassEq(RC)) {
3684  if (STI.is64Bit())
3685  return load ? X86::BNDMOV64rm : X86::BNDMOV64mr;
3686  else
3687  return load ? X86::BNDMOV32rm : X86::BNDMOV32mr;
3688  }
3689  llvm_unreachable("Unknown 16-byte regclass");
3690  }
3691  case 32:
3692  assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
3693  // If the stack is realigned we can use aligned stores.
3694  if (IsStackAligned)
3695  return load ?
3696  (HasVLX ? X86::VMOVAPSZ256rm :
3697  HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
3698  X86::VMOVAPSYrm) :
3699  (HasVLX ? X86::VMOVAPSZ256mr :
3700  HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
3701  X86::VMOVAPSYmr);
3702  else
3703  return load ?
3704  (HasVLX ? X86::VMOVUPSZ256rm :
3705  HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
3706  X86::VMOVUPSYrm) :
3707  (HasVLX ? X86::VMOVUPSZ256mr :
3708  HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
3709  X86::VMOVUPSYmr);
3710  case 64:
3711  assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
3712  assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
3713  if (IsStackAligned)
3714  return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
3715  else
3716  return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
3717  }
3718 }
3719 
3722  const TargetRegisterInfo *TRI) const {
3723  const MCInstrDesc &Desc = MemI.getDesc();
3724  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3725  if (MemRefBegin < 0)
3726  return None;
3727 
3728  MemRefBegin += X86II::getOperandBias(Desc);
3729 
3730  auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
3731  if (!BaseOp.isReg()) // Can be an MO_FrameIndex
3732  return None;
3733 
3734  const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
3735  // Displacement can be symbolic
3736  if (!DispMO.isImm())
3737  return None;
3738 
3739  ExtAddrMode AM;
3740  AM.BaseReg = BaseOp.getReg();
3741  AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
3742  AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
3743  AM.Displacement = DispMO.getImm();
3744  return AM;
3745 }
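
// Illustrative example (operands are made up): for a load such as
//   %eax = MOV32rm %rdi, 4, %rcx, 8, $noreg
// (base %rdi, scale 4, index %rcx, displacement 8), the returned ExtAddrMode
// has BaseReg = RDI, ScaledReg = RCX, Scale = 4 and Displacement = 8.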
3746 
3748  const Register Reg,
3749  int64_t &ImmVal) const {
3750  if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
3751  return false;
3752  // The MOV source operand can be a global address rather than an immediate.
3753  if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
3754  return false;
3755  ImmVal = MI.getOperand(1).getImm();
3756  return true;
3757 }
3758 
3760  const MachineInstr *MI, const Register NullValueReg,
3761  const TargetRegisterInfo *TRI) const {
3762  if (!MI->modifiesRegister(NullValueReg, TRI))
3763  return true;
3764  switch (MI->getOpcode()) {
3765  // Shifting a null value right/left onto itself still yields a null value,
3766  // e.g. rax = shl rax, X.
3767  case X86::SHR64ri:
3768  case X86::SHR32ri:
3769  case X86::SHL64ri:
3770  case X86::SHL32ri:
3771  assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
3772  "expected for shift opcode!");
3773  return MI->getOperand(0).getReg() == NullValueReg &&
3774  MI->getOperand(1).getReg() == NullValueReg;
3775  // Zero-extending a sub-reg of NullValueReg into itself does not change the
3776  // null value.
3777  case X86::MOV32rr:
3778  return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
3779  return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
3780  });
3781  default:
3782  return false;
3783  }
3784  llvm_unreachable("Should be handled above!");
3785 }
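
// Illustrative example: if %rax is known to hold a null value, then
//   %rax = SHL64ri %rax, 3
// and the zero-extending sub-register copy
//   %eax = MOV32rr %eax
// both leave the null value intact, so this hook returns true for them.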
3786 
3789  int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
3790  const TargetRegisterInfo *TRI) const {
3791  const MCInstrDesc &Desc = MemOp.getDesc();
3792  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3793  if (MemRefBegin < 0)
3794  return false;
3795 
3796  MemRefBegin += X86II::getOperandBias(Desc);
3797 
3798  const MachineOperand *BaseOp =
3799  &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
3800  if (!BaseOp->isReg()) // Can be an MO_FrameIndex
3801  return false;
3802 
3803  if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
3804  return false;
3805 
3806  if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
3807  X86::NoRegister)
3808  return false;
3809 
3810  const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
3811 
3812  // Displacement can be symbolic
3813  if (!DispMO.isImm())
3814  return false;
3815 
3816  Offset = DispMO.getImm();
3817 
3818  if (!BaseOp->isReg())
3819  return false;
3820 
3821  OffsetIsScalable = false;
3822  // FIXME: Relying on memoperands() may not be the right thing to do here. Check
3823  // with the X86 maintainers, and fix it accordingly. For now, it is ok, since
3824  // there is no use of `Width` for the X86 back-end at the moment.
3825  Width =
3826  !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
3827  BaseOps.push_back(BaseOp);
3828  return true;
3829 }
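
// Illustrative example (operands are made up): only simple [Base + Disp]
// addresses are decomposed here. For
//   %rax = MOV64rm %rsp, 1, $noreg, 16, $noreg
// the function reports BaseOp = %rsp and Offset = 16, while any access using a
// scaled index register is rejected.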
3830 
3831 static unsigned getStoreRegOpcode(Register SrcReg,
3832  const TargetRegisterClass *RC,
3833  bool IsStackAligned,
3834  const X86Subtarget &STI) {
3835  return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
3836 }
3837 
3838 static unsigned getLoadRegOpcode(Register DestReg,
3839  const TargetRegisterClass *RC,
3840  bool IsStackAligned, const X86Subtarget &STI) {
3841  return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
3842 }
3843 
3846  Register SrcReg, bool isKill, int FrameIdx,
3847  const TargetRegisterClass *RC,
3848  const TargetRegisterInfo *TRI) const {
3849  const MachineFunction &MF = *MBB.getParent();
3850  const MachineFrameInfo &MFI = MF.getFrameInfo();
3851  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
3852  "Stack slot too small for store");
3853  if (RC->getID() == X86::TILERegClassID) {
3854  unsigned Opc = X86::TILESTORED;
3855  // tilestored %tmm, (%sp, %idx)
3856  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3857  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3858  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3859  MachineInstr *NewMI =
3860  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3861  .addReg(SrcReg, getKillRegState(isKill));
3862  MachineOperand &MO = NewMI->getOperand(2);
3863  MO.setReg(VirtReg);
3864  MO.setIsKill(true);
3865  } else {
3866  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3867  bool isAligned =
3868  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3869  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3870  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
3871  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3872  .addReg(SrcReg, getKillRegState(isKill));
3873  }
3874 }
3875 
3878  Register DestReg, int FrameIdx,
3879  const TargetRegisterClass *RC,
3880  const TargetRegisterInfo *TRI) const {
3881  if (RC->getID() == X86::TILERegClassID) {
3882  unsigned Opc = X86::TILELOADD;
3883  // tileloadd (%sp, %idx), %tmm
3884  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3885  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3886  MachineInstr *NewMI =
3887  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3888  NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3889  FrameIdx);
3890  MachineOperand &MO = NewMI->getOperand(3);
3891  MO.setReg(VirtReg);
3892  MO.setIsKill(true);
3893  } else {
3894  const MachineFunction &MF = *MBB.getParent();
3895  const MachineFrameInfo &MFI = MF.getFrameInfo();
3896  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3897  bool isAligned =
3898  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3899  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3900  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
3901  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3902  FrameIdx);
3903  }
3904 }
3905 
3907  Register &SrcReg2, int &CmpMask,
3908  int &CmpValue) const {
3909  switch (MI.getOpcode()) {
3910  default: break;
3911  case X86::CMP64ri32:
3912  case X86::CMP64ri8:
3913  case X86::CMP32ri:
3914  case X86::CMP32ri8:
3915  case X86::CMP16ri:
3916  case X86::CMP16ri8:
3917  case X86::CMP8ri:
3918  SrcReg = MI.getOperand(0).getReg();
3919  SrcReg2 = 0;
3920  if (MI.getOperand(1).isImm()) {
3921  CmpMask = ~0;
3922  CmpValue = MI.getOperand(1).getImm();
3923  } else {
3924  CmpMask = CmpValue = 0;
3925  }
3926  return true;
3927  // A SUB can be used to perform a comparison.
3928  case X86::SUB64rm:
3929  case X86::SUB32rm:
3930  case X86::SUB16rm:
3931  case X86::SUB8rm:
3932  SrcReg = MI.getOperand(1).getReg();
3933  SrcReg2 = 0;
3934  CmpMask = 0;
3935  CmpValue = 0;
3936  return true;
3937  case X86::SUB64rr:
3938  case X86::SUB32rr:
3939  case X86::SUB16rr:
3940  case X86::SUB8rr:
3941  SrcReg = MI.getOperand(1).getReg();
3942  SrcReg2 = MI.getOperand(2).getReg();
3943  CmpMask = 0;
3944  CmpValue = 0;
3945  return true;
3946  case X86::SUB64ri32:
3947  case X86::SUB64ri8:
3948  case X86::SUB32ri:
3949  case X86::SUB32ri8:
3950  case X86::SUB16ri:
3951  case X86::SUB16ri8:
3952  case X86::SUB8ri:
3953  SrcReg = MI.getOperand(1).getReg();
3954  SrcReg2 = 0;
3955  if (MI.getOperand(2).isImm()) {
3956  CmpMask = ~0;
3957  CmpValue = MI.getOperand(2).getImm();
3958  } else {
3959  CmpMask = CmpValue = 0;
3960  }
3961  return true;
3962  case X86::CMP64rr:
3963  case X86::CMP32rr:
3964  case X86::CMP16rr:
3965  case X86::CMP8rr:
3966  SrcReg = MI.getOperand(0).getReg();
3967  SrcReg2 = MI.getOperand(1).getReg();
3968  CmpMask = 0;
3969  CmpValue = 0;
3970  return true;
3971  case X86::TEST8rr:
3972  case X86::TEST16rr:
3973  case X86::TEST32rr:
3974  case X86::TEST64rr:
3975  SrcReg = MI.getOperand(0).getReg();
3976  if (MI.getOperand(1).getReg() != SrcReg)
3977  return false;
3978  // Compare against zero.
3979  SrcReg2 = 0;
3980  CmpMask = ~0;
3981  CmpValue = 0;
3982  return true;
3983  }
3984  return false;
3985 }
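
// Illustrative examples: analyzeCompare decomposes
//   CMP32ri %eax, 42     into SrcReg = EAX, SrcReg2 = 0, CmpMask = ~0, CmpValue = 42
//   TEST64rr %rdi, %rdi  into SrcReg = RDI, SrcReg2 = 0, CmpMask = ~0, CmpValue = 0
// so that optimizeCompareInstr below can reason about compares against zero.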
3986 
3987 /// Check whether the first instruction, whose only
3988 /// purpose is to update flags, can be made redundant.
3989 /// CMPrr can be made redundant by SUBrr if the operands are the same.
3990 /// This function can be extended later on.
3991 /// SrcReg, SrcReg2: register operands for FlagI.
3992 /// ImmValue: immediate for FlagI if it takes an immediate.
3993 inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
3994  Register SrcReg, Register SrcReg2,
3995  int ImmMask, int ImmValue,
3996  const MachineInstr &OI) {
3997  if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
3998  (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
3999  (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
4000  (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) &&
4001  ((OI.getOperand(1).getReg() == SrcReg &&
4002  OI.getOperand(2).getReg() == SrcReg2) ||
4003  (OI.getOperand(1).getReg() == SrcReg2 &&
4004  OI.getOperand(2).getReg() == SrcReg)))
4005  return true;
4006 
4007  if (ImmMask != 0 &&
4008  ((FlagI.getOpcode() == X86::CMP64ri32 &&
4009  OI.getOpcode() == X86::SUB64ri32) ||
4010  (FlagI.getOpcode() == X86::CMP64ri8 &&
4011  OI.getOpcode() == X86::SUB64ri8) ||
4012  (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) ||
4013  (FlagI.getOpcode() == X86::CMP32ri8 &&
4014  OI.getOpcode() == X86::SUB32ri8) ||
4015  (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) ||
4016  (FlagI.getOpcode() == X86::CMP16ri8 &&
4017  OI.getOpcode() == X86::SUB16ri8) ||
4018  (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) &&
4019  OI.getOperand(1).getReg() == SrcReg &&
4020  OI.getOperand(2).getImm() == ImmValue)
4021  return true;
4022  return false;
4023 }
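
// Illustrative example (register names are made up): a flag-only
//   CMP32rr %a, %b
// is redundant after
//   %d = SUB32rr %a, %b
// because the SUB already set EFLAGS for the same operand pair. With the
// operands swapped the compare is still removable, but the users' condition
// codes must then be adjusted via getSwappedCondition in optimizeCompareInstr.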
4024 
4025 /// Check whether the definition can be converted
4026 /// to remove a comparison against zero.
4027 inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4028  bool &ClearsOverflowFlag) {
4029  NoSignFlag = false;
4030  ClearsOverflowFlag = false;
4031 
4032  switch (MI.getOpcode()) {
4033  default: return false;
4034 
4035  // The shift instructions only modify ZF if their shift count is non-zero.
4036  // N.B.: The processor truncates the shift count depending on the encoding.
4037  case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
4038  case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
4039  return getTruncatedShiftCount(MI, 2) != 0;
4040 
4041  // Some left shift instructions can be turned into LEA instructions but only
4042  // if their flags aren't used. Avoid transforming such instructions.
4043  case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
4044  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
4045  if (isTruncatedShiftCountForLEA(ShAmt)) return false;
4046  return ShAmt != 0;
4047  }
4048 
4049  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
4050  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
4051  return getTruncatedShiftCount(MI, 3) != 0;
4052 
4053  case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
4054  case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
4055  case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
4056  case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
4057  case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
4058  case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
4059  case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
4060  case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
4061  case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
4062  case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
4063  case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
4064  case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
4065  case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
4066  case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8:
4067  case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr:
4068  case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm:
4069  case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm:
4070  case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri:
4071  case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8:
4072  case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr:
4073  case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm:
4074  case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm:
4075  case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
4076  case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
4077  case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
4078  case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
4079  case X86::LZCNT16rr: case X86::LZCNT16rm:
4080  case X86::LZCNT32rr: case X86::LZCNT32rm:
4081  case X86::LZCNT64rr: case X86::LZCNT64rm:
4082  case X86::POPCNT16rr:case X86::POPCNT16rm:
4083  case X86::POPCNT32rr:case X86::POPCNT32rm:
4084  case X86::POPCNT64rr:case X86::POPCNT64rm:
4085  case X86::TZCNT16rr: case X86::TZCNT16rm:
4086  case X86::TZCNT32rr: case X86::TZCNT32rm:
4087  case X86::TZCNT64rr: case X86::TZCNT64rm:
4088  return true;
4089  case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
4090  case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
4091  case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
4092  case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
4093  case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
4094  case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
4095  case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
4096  case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
4097  case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
4098  case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
4099  case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
4100  case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
4101  case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
4102  case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
4103  case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
4104  case X86::ANDN32rr: case X86::ANDN32rm:
4105  case X86::ANDN64rr: case X86::ANDN64rm:
4106  case X86::BLSI32rr: case X86::BLSI32rm:
4107  case X86::BLSI64rr: case X86::BLSI64rm:
4108  case X86::BLSMSK32rr: case X86::BLSMSK32rm:
4109  case X86::BLSMSK64rr: case X86::BLSMSK64rm:
4110  case X86::BLSR32rr: case X86::BLSR32rm:
4111  case X86::BLSR64rr: case X86::BLSR64rm:
4112  case X86::BLCFILL32rr: case X86::BLCFILL32rm:
4113  case X86::BLCFILL64rr: case X86::BLCFILL64rm:
4114  case X86::BLCI32rr: case X86::BLCI32rm:
4115  case X86::BLCI64rr: case X86::BLCI64rm:
4116  case X86::BLCIC32rr: case X86::BLCIC32rm:
4117  case X86::BLCIC64rr: case X86::BLCIC64rm:
4118  case X86::BLCMSK32rr: case X86::BLCMSK32rm:
4119  case X86::BLCMSK64rr: case X86::BLCMSK64rm:
4120  case X86::BLCS32rr: case X86::BLCS32rm:
4121  case X86::BLCS64rr: case X86::BLCS64rm:
4122  case X86::BLSFILL32rr: case X86::BLSFILL32rm:
4123  case X86::BLSFILL64rr: case X86::BLSFILL64rm:
4124  case X86::BLSIC32rr: case X86::BLSIC32rm:
4125  case X86::BLSIC64rr: case X86::BLSIC64rm:
4126  case X86::BZHI32rr: case X86::BZHI32rm:
4127  case X86::BZHI64rr: case X86::BZHI64rm:
4128  case X86::T1MSKC32rr: case X86::T1MSKC32rm:
4129  case X86::T1MSKC64rr: case X86::T1MSKC64rm:
4130  case X86::TZMSK32rr: case X86::TZMSK32rm:
4131  case X86::TZMSK64rr: case X86::TZMSK64rm:
4132  // These instructions clear the overflow flag just like TEST.
4133  // FIXME: These are not the only instructions in this switch that clear the
4134  // overflow flag.
4135  ClearsOverflowFlag = true;
4136  return true;
4137  case X86::BEXTR32rr: case X86::BEXTR64rr:
4138  case X86::BEXTR32rm: case X86::BEXTR64rm:
4139  case X86::BEXTRI32ri: case X86::BEXTRI32mi:
4140  case X86::BEXTRI64ri: case X86::BEXTRI64mi:
4141  // BEXTR doesn't update the sign flag so we can't use it. It does clear
4142  // the overflow flag, but that's not useful without the sign flag.
4143  NoSignFlag = true;
4144  return true;
4145  }
4146 }
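
// Illustrative example: after
//   %eax = AND32ri %eax, 7
// a following TEST32rr %eax, %eax (a compare against zero) is redundant: AND
// sets ZF and SF from its result and, like TEST, clears OF, which is why the
// AND group above reports ClearsOverflowFlag = true.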
4147 
4148 /// Check whether the use can be converted to remove a comparison against zero.
4150  switch (MI.getOpcode()) {
4151  default: return X86::COND_INVALID;
4152  case X86::NEG8r:
4153  case X86::NEG16r:
4154  case X86::NEG32r:
4155  case X86::NEG64r:
4156  return X86::COND_AE;
4157  case X86::LZCNT16rr:
4158  case X86::LZCNT32rr:
4159  case X86::LZCNT64rr:
4160  return X86::COND_B;
4161  case X86::POPCNT16rr:
4162  case X86::POPCNT32rr:
4163  case X86::POPCNT64rr:
4164  return X86::COND_E;
4165  case X86::TZCNT16rr:
4166  case X86::TZCNT32rr:
4167  case X86::TZCNT64rr:
4168  return X86::COND_B;
4169  case X86::BSF16rr:
4170  case X86::BSF32rr:
4171  case X86::BSF64rr:
4172  case X86::BSR16rr:
4173  case X86::BSR32rr:
4174  case X86::BSR64rr:
4175  return X86::COND_E;
4176  case X86::BLSI32rr:
4177  case X86::BLSI64rr:
4178  return X86::COND_AE;
4179  case X86::BLSR32rr:
4180  case X86::BLSR64rr:
4181  case X86::BLSMSK32rr:
4182  case X86::BLSMSK64rr:
4183  return X86::COND_B;
4184  // TODO: TBM instructions.
4185  }
4186 }
4187 
4188 /// Check if there exists an earlier instruction that
4189 /// operates on the same source operands and sets flags in the same way as
4190 /// Compare; remove Compare if possible.
4192  Register SrcReg2, int CmpMask,
4193  int CmpValue,
4194  const MachineRegisterInfo *MRI) const {
4195  // Check whether we can replace SUB with CMP.
4196  switch (CmpInstr.getOpcode()) {
4197  default: break;
4198  case X86::SUB64ri32:
4199  case X86::SUB64ri8:
4200  case X86::SUB32ri:
4201  case X86::SUB32ri8:
4202  case X86::SUB16ri:
4203  case X86::SUB16ri8:
4204  case X86::SUB8ri:
4205  case X86::SUB64rm:
4206  case X86::SUB32rm:
4207  case X86::SUB16rm:
4208  case X86::SUB8rm:
4209  case X86::SUB64rr:
4210  case X86::SUB32rr:
4211  case X86::SUB16rr:
4212  case X86::SUB8rr: {
4213  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
4214  return false;
4215  // There is no use of the destination register, so we can replace SUB with CMP.
4216  unsigned NewOpcode = 0;
4217  switch (CmpInstr.getOpcode()) {
4218  default: llvm_unreachable("Unreachable!");
4219  case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
4220  case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
4221  case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
4222  case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
4223  case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
4224  case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
4225  case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
4226  case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
4227  case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
4228  case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
4229  case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
4230  case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
4231  case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
4232  case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
4233  case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
4234  }
4235  CmpInstr.setDesc(get(NewOpcode));
4236  CmpInstr.RemoveOperand(0);
4237  // Mutating this instruction invalidates any debug data associated with it.
4238  CmpInstr.dropDebugNumber();
4239  // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
4240  if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
4241  NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
4242  return false;
4243  }
4244  }
4245 
4246  // Get the unique definition of SrcReg.
4247  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
4248  if (!MI) return false;
4249 
4250  // CmpInstr is the first instruction of the BB.
4251  MachineBasicBlock::iterator I = CmpInstr, Def = MI;
4252 
4253  // If we are comparing against zero, check whether we can use MI to update
4254  // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
4255  bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
4256  if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
4257  return false;
4258 
4259  // If we have a use of the source register between the def and our compare
4260  // instruction, we can eliminate the compare iff the use sets EFLAGS in the
4261  // right way.
4262  bool ShouldUpdateCC = false;
4263  bool NoSignFlag = false;
4264  bool ClearsOverflowFlag = false;
4266  if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag, ClearsOverflowFlag)) {
4267  // Scan forward from the use until we hit the use we're looking for or the
4268  // compare instruction.
4269  for (MachineBasicBlock::iterator J = MI;; ++J) {
4270  // Do we have a convertible instruction?
4271  NewCC = isUseDefConvertible(*J);
4272  if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
4273  J->getOperand(1).getReg() == SrcReg) {
4274  assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
4275  ShouldUpdateCC = true; // Update CC later on.
4276  // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
4277  // with the new def.
4278  Def = J;
4279  MI = &*Def;
4280  break;
4281  }
4282 
4283  if (J == I)
4284  return false;
4285  }
4286  }
4287 
4288  // We are searching for an earlier instruction that can make CmpInstr
4289  // redundant and that instruction will be saved in Sub.
4290  MachineInstr *Sub = nullptr;
4292 
4293  // We iterate backward, starting from the instruction before CmpInstr, and
4294  // stop when we reach the definition of a source register or are done with the BB.
4295  // RI points to the instruction before CmpInstr.
4296  // If the definition is in this basic block, RE points to the definition;
4297  // otherwise, RE is the rend of the basic block.
4299  RI = ++I.getReverse(),
4300  RE = CmpInstr.getParent() == MI->getParent()
4301  ? Def.getReverse() /* points to MI */
4302  : CmpInstr.getParent()->rend();
4303  MachineInstr *Movr0Inst = nullptr;
4304  for (; RI != RE; ++RI) {
4305  MachineInstr &Instr = *RI;
4306  // Check whether CmpInstr can be made redundant by the current instruction.
4307  if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
4308  CmpValue, Instr)) {
4309  Sub = &Instr;
4310  break;
4311  }
4312 
4313  if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
4314  Instr.readsRegister(X86::EFLAGS, TRI)) {
4315  // This instruction modifies or uses EFLAGS.
4316 
4317  // MOV32r0 etc. are implemented with xor, which clobbers the condition code.
4318  // They are safe to move up if the definition of EFLAGS is dead and
4319  // earlier instructions do not read or write EFLAGS.
4320  if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
4321  Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
4322  Movr0Inst = &Instr;
4323  continue;
4324  }
4325 
4326  // We can't remove CmpInstr.
4327  return false;
4328  }
4329  }
4330 
4331  // Return false if no candidates exist.
4332  if (!IsCmpZero && !Sub)
4333  return false;
4334 
4335  bool IsSwapped =
4336  (SrcReg2 != 0 && Sub && Sub->getOperand(1).getReg() == SrcReg2 &&
4337  Sub->getOperand(2).getReg() == SrcReg);
4338 
4339  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
4340  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
4341  // If we are done with the basic block, we need to check whether EFLAGS is
4342  // live-out.
4343  bool IsSafe = false;
4345  MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
4346  for (++I; I != E; ++I) {
4347  const MachineInstr &Instr = *I;
4348  bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
4349  bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
4350  // We should check the usage if this instruction uses and updates EFLAGS.
4351  if (!UseEFLAGS && ModifyEFLAGS) {
4352  // It is safe to remove CmpInstr if EFLAGS is updated again.
4353  IsSafe = true;
4354  break;
4355  }
4356  if (!UseEFLAGS && !ModifyEFLAGS)
4357  continue;
4358 
4359  // EFLAGS is used by this instruction.
4361  if (IsCmpZero || IsSwapped) {
4362  // We decode the condition code from the opcode.
4363  if (Instr.isBranch())
4364  OldCC = X86::getCondFromBranch(Instr);
4365  else {
4366  OldCC = X86::getCondFromSETCC(Instr);
4367  if (OldCC == X86::COND_INVALID)
4368  OldCC = X86::getCondFromCMov(Instr);
4369  }
4370  if (OldCC == X86::COND_INVALID) return false;
4371  }
4372  X86::CondCode ReplacementCC = X86::COND_INVALID;
4373  if (IsCmpZero) {
4374  switch (OldCC) {
4375  default: break;
4376  case X86::COND_A: case X86::COND_AE:
4377  case X86::COND_B: case X86::COND_BE:
4378  // CF is used, we can't perform this optimization.
4379  return false;
4380  case X86::COND_G: case X86::COND_GE:
4381  case X86::COND_L: case X86::COND_LE:
4382  case X86::COND_O: case X86::COND_NO:
4383  // If OF is used, the instruction needs to clear it like CmpZero does.
4384  if (!ClearsOverflowFlag)
4385  return false;
4386  break;
4387  case X86::COND_S: case X86::COND_NS:
4388  // If SF is used, but the instruction doesn't update the SF, then we
4389  // can't do the optimization.
4390  if (NoSignFlag)
4391  return false;
4392  break;
4393  }
4394 
4395  // If we're updating the condition code check if we have to reverse the
4396  // condition.
4397  if (ShouldUpdateCC)
4398  switch (OldCC) {
4399  default:
4400  return false;
4401  case X86::COND_E:
4402  ReplacementCC = NewCC;
4403  break;
4404  case X86::COND_NE:
4405  ReplacementCC = GetOppositeBranchCondition(NewCC);
4406  break;
4407  }
4408  } else if (IsSwapped) {
4409  // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
4410  // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
4411  // We swap the condition code and synthesize the new opcode.
4412  ReplacementCC = getSwappedCondition(OldCC);
4413  if (ReplacementCC == X86::COND_INVALID) return false;
4414  }
4415 
4416  if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
4417  // Push the MachineInstr to OpsToUpdate.
4418  // If it is safe to remove CmpInstr, the condition code of these
4419  // instructions will be modified.
4420  OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC));
4421  }
4422  if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
4423  // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
4424  IsSafe = true;
4425  break;
4426  }
4427  }
4428 
4429  // If EFLAGS is not killed nor re-defined, we should check whether it is
4430  // live-out. If it is live-out, do not optimize.
4431  if ((IsCmpZero || IsSwapped) && !IsSafe) {
4432  MachineBasicBlock *MBB = CmpInstr.getParent();
4434  if (Successor->isLiveIn(X86::EFLAGS))
4435  return false;
4436  }
4437 
4438  // The instruction to be updated is either Sub or MI.
4439  Sub = IsCmpZero ? MI : Sub;
4440  // Move Movr0Inst to the appropriate place before Sub.
4441  if (Movr0Inst) {
4442  // Look backwards until we find a def that doesn't use the current EFLAGS.
4443  Def = Sub;
4444  MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
4445  InsertE = Sub->getParent()->rend();
4446  for (; InsertI != InsertE; ++InsertI) {
4447  MachineInstr *Instr = &*InsertI;
4448  if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
4449  Instr->modifiesRegister(X86::EFLAGS, TRI)) {
4450  Sub->getParent()->remove(Movr0Inst);
4451  Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
4452  Movr0Inst);
4453  break;
4454  }
4455  }
4456  if (InsertI == InsertE)
4457  return false;
4458  }
4459 
4460  // Make sure the Sub instruction defines EFLAGS and mark the def as live.
4461  MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
4462  assert(FlagDef && "Unable to locate a def EFLAGS operand");
4463  FlagDef->setIsDead(false);
4464 
4465  CmpInstr.eraseFromParent();
4466 
4467  // Modify the condition code of instructions in OpsToUpdate.
4468  for (auto &Op : OpsToUpdate) {
4469  Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
4470  .setImm(Op.second);
4471  }
4472  return true;
4473 }
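
// Illustrative example (virtual registers and blocks are made up): in
//   %2 = SUB32rr %0, %1
//   CMP32rr %0, %1
//   JCC_1 %bb.taken, COND_L
// the CMP is removed because the SUB already defined EFLAGS for the same
// operand pair; had the compare been written as CMP32rr %1, %0, the branch
// would instead be rewritten to use the swapped condition COND_G.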
4474 
4475 /// Try to remove the load by folding it to a register
4476 /// operand at the use. We fold the load if it defines a virtual
4477 /// register, the virtual register is used once in the same BB, and the
4478 /// instructions in between do not load or store and have no side effects.
4480  const MachineRegisterInfo *MRI,
4481  Register &FoldAsLoadDefReg,
4482  MachineInstr *&DefMI) const {
4483  // Check whether we can move DefMI here.
4484  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
4485  assert(DefMI);
4486  bool SawStore = false;
4487  if (!DefMI->isSafeToMove(nullptr, SawStore))
4488  return nullptr;
4489 
4490  // Collect information about virtual register operands of MI.
4491  SmallVector<unsigned, 1> SrcOperandIds;
4492  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4493  MachineOperand &MO = MI.getOperand(i);
4494  if (!MO.isReg())
4495  continue;
4496  Register Reg = MO.getReg();
4497  if (Reg != FoldAsLoadDefReg)
4498  continue;
4499  // Do not fold if we have a subreg use or a def.
4500  if (MO.getSubReg() || MO.isDef())
4501  return nullptr;
4502  SrcOperandIds.push_back(i);
4503  }
4504  if (SrcOperandIds.empty())
4505  return nullptr;
4506 
4507  // Check whether we can fold the def into the operands in SrcOperandIds.
4508  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
4509  FoldAsLoadDefReg = 0;
4510  return FoldMI;
4511  }
4512 
4513  return nullptr;
4514 }
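
// Illustrative example (virtual registers are made up): given
//   %1 = MOV32rm %rdi, 1, $noreg, 0, $noreg
//   %2 = ADD32rr %0, %1
// where %1 has this single use, the load can be folded into its user to form
//   %2 = ADD32rm %0, %rdi, 1, $noreg, 0, $noreg
// via the foldMemoryOperand call above.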
4515 
4516 /// Expand a single-def pseudo instruction to a two-addr
4517 /// instruction with two undef reads of the register being defined.
4518 /// This is used for mapping:
4519 /// %xmm4 = V_SET0
4520 /// to:
4521 /// %xmm4 = PXORrr undef %xmm4, undef %xmm4
4522 ///
4524  const MCInstrDesc &Desc) {
4525  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4526  Register Reg = MIB.getReg(0);
4527  MIB->setDesc(Desc);
4528 
4529  // MachineInstr::addOperand() will insert explicit operands before any
4530  // implicit operands.
4532  // But we don't trust that.
4533  assert(MIB.getReg(1) == Reg &&
4534  MIB.getReg(2) == Reg && "Misplaced operand");
4535  return true;
4536 }
4537 
4538 /// Expand a single-def pseudo instruction to a two-addr
4539 /// instruction with two %k0 reads.
4540 /// This is used for mapping:
4541 /// %k4 = K_SET1
4542 /// to:
4543 /// %k4 = KXNORrr %k0, %k0
4544 static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
4545  Register Reg) {
4546  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4547  MIB->setDesc(Desc);
4549  return true;
4550 }
4551 
4553  bool MinusOne) {
4554  MachineBasicBlock &MBB = *MIB->getParent();
4555  const DebugLoc &DL = MIB->getDebugLoc();
4556  Register Reg = MIB.getReg(0);
4557 
4558  // Insert the XOR.
4559  BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
4562 
4563  // Turn the pseudo into an INC or DEC.
4564  MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
4565  MIB.addReg(Reg);
4566 
4567  return true;
4568 }
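
// Illustrative example: the MOV32r1 pseudo for %eax expands to an XOR of %eax
// with itself followed by
//   %eax = INC32r %eax
// while MOV32r_1 uses DEC32r instead, materializing -1 without an immediate.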
4569 
4571  const TargetInstrInfo &TII,
4572  const X86Subtarget &Subtarget) {
4573  MachineBasicBlock &MBB = *MIB->getParent();
4574  const DebugLoc &DL = MIB->getDebugLoc();
4575  int64_t Imm = MIB->getOperand(1).getImm();
4576  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
4578 
4579  int StackAdjustment;
4580 
4581  if (Subtarget.is64Bit()) {
4582  assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
4583  MIB->getOpcode() == X86::MOV32ImmSExti8);
4584 
4585  // Can't use push/pop lowering if the function might write to the red zone.
4586  X86MachineFunctionInfo *X86FI =
4588  if (X86FI->getUsesRedZone()) {
4589  MIB->setDesc(TII.get(MIB->getOpcode() ==
4590  X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
4591  return true;
4592  }
4593 
4594  // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
4595  // widen the register if necessary.
4596  StackAdjustment = 8;
4597  BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
4598  MIB->setDesc(TII.get(X86::POP64r));
4599  MIB->getOperand(0)
4600  .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
4601  } else {
4602  assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
4603  StackAdjustment = 4;
4604  BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
4605  MIB->setDesc(TII.get(X86::POP32r));
4606  }
4607  MIB->RemoveOperand(1);
4609 
4610  // Build CFI if necessary.
4611  MachineFunction &MF = *MBB.getParent();
4612  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
4613  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
4614  bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
4615  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
4616  if (EmitCFI) {
4617  TFL->BuildCFI(MBB, I, DL,
4619  TFL->BuildCFI(MBB, std::next(I), DL,
4621  }
4622 
4623  return true;
4624 }
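
// Illustrative example: on x86-64, when the function does not use the red
// zone, a MOV64ImmSExti8 of -1 into %rax is lowered to
//   PUSH64i8 -1
//   %rax = POP64r
// which encodes in fewer bytes than materializing the value with MOV64ri.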
4625 
4626 // LoadStackGuard has so far only been implemented for 64-bit MachO. A different
4627 // code sequence is needed for other targets.
4629  const TargetInstrInfo &TII) {
4630  MachineBasicBlock &MBB = *MIB->getParent();
4631  const DebugLoc &DL = MIB->getDebugLoc();
4632  Register Reg = MIB.getReg(0);
4633  const GlobalValue *GV =
4634  cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
4635  auto Flags = MachineMemOperand::MOLoad |
4639  MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
4641 
4642  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
4644  .addMemOperand(MMO);
4645  MIB->setDebugLoc(DL);
4646  MIB->setDesc(TII.get(X86::MOV64rm));
4647  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
4648 }
4649 
4651  MachineBasicBlock &MBB = *MIB->getParent();
4652  MachineFunction &MF = *MBB.getParent();
4653  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
4654  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4655  unsigned XorOp =
4656  MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
4657  MIB->setDesc(TII.get(XorOp));
4659  return true;
4660 }
4661 
4662 // This is used to handle spills for 128/256-bit registers when we have AVX512,
4663 // but not VLX. If it uses an extended register, we need to use an instruction
4664 // that loads the lower 128/256 bits but is available with only AVX512F.
4666  const TargetRegisterInfo *TRI,
4667  const MCInstrDesc &LoadDesc,
4668  const MCInstrDesc &BroadcastDesc,
4669  unsigned SubIdx) {
4670  Register DestReg = MIB.getReg(0);
4671  // Check if DestReg is XMM16-31 or YMM16-31.
4672  if (TRI->getEncodingValue(DestReg) < 16) {
4673  // We can use a normal VEX encoded load.
4674  MIB->setDesc(LoadDesc);
4675  } else {
4676  // Use a 128/256-bit VBROADCAST instruction.
4677  MIB->setDesc(BroadcastDesc);
4678  // Change the destination to a 512-bit register.
4679  DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
4680  MIB->getOperand(0).setReg(DestReg);
4681  }
4682  return true;
4683 }
4684 
4685 // This is used to handle spills for 128/256-bit registers when we have AVX512,
4686 // but not VLX. If it uses an extended register, we need to use an instruction
4687 // that stores the lower 128/256 bits but is available with only AVX512F.
4689  const TargetRegisterInfo *TRI,
4690  const MCInstrDesc &StoreDesc,
4691  const MCInstrDesc &ExtractDesc,
4692  unsigned SubIdx) {
4693  Register SrcReg = MIB.getReg(X86::AddrNumOperands);
4694  // Check if SrcReg is XMM16-31 or YMM16-31.
4695  if (TRI->getEncodingValue(SrcReg) < 16) {
4696  // We can use a normal VEX encoded store.
4697  MIB->setDesc(StoreDesc);
4698  } else {
4699  // Use a VEXTRACTF instruction.
4700  MIB->setDesc(ExtractDesc);
4701  // Change the source to a 512-bit register.
4702  SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
4703  MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
4704  MIB.addImm(0x0); // Append immediate to extract from the lower bits.
4705  }
4706 
4707  return true;
4708 }
4709 
4710 static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
4711  MIB->setDesc(Desc);
4712  int64_t ShiftAmt = MIB->getOperand(2).getImm();
4713  // Temporarily remove the immediate so we can add another source register.
4714  MIB->RemoveOperand(2);
4715  // Add the register. Don't copy the kill flag if there is one.
4716  MIB.addReg(MIB.getReg(1),
4717  getUndefRegState(MIB->getOperand(1).isUndef()));
4718  // Add back the immediate.
4719  MIB.addImm(ShiftAmt);
4720  return true;
4721 }
4722 
4724  bool HasAVX = Subtarget.hasAVX();
4725  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
4726  switch (MI.getOpcode()) {
4727  case X86::MOV32r0:
4728  return Expand2AddrUndef(MIB, get(X86::XOR32rr));
4729  case X86::MOV32r1:
4730  return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
4731  case X86::MOV32r_1:
4732  return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
4733  case X86::MOV32ImmSExti8:
4734  case X86::MOV64ImmSExti8:
4735  return ExpandMOVImmSExti8(MIB, *this, Subtarget);
4736  case X86::SETB_C32r:
4737  return Expand2AddrUndef(MIB, get(X86::SBB32rr));
4738  case X86::SETB_C64r:
4739  return Expand2AddrUndef(MIB, get(X86::SBB64rr));
4740  case X86::MMX_SET0:
4741  return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr));
4742  case X86::V_SET0:
4743  case X86::FsFLD0SS:
4744  case X86::FsFLD0SD:
4745  case X86::FsFLD0F128:
4746  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
4747  case X86::AVX_SET0: {
4748  assert(HasAVX && "AVX not supported");
4750  Register SrcReg = MIB.getReg(0);
4751  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4752  MIB->getOperand(0).setReg(XReg);
4753  Expand2AddrUndef(MIB, get(X86::VXORPSrr));
4754  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4755  return true;
4756  }
4757  case X86::AVX512_128_SET0:
4758  case X86::AVX512_FsFLD0SS:
4759  case X86::AVX512_FsFLD0SD:
4760  case X86::AVX512_FsFLD0F128: {
4761  bool HasVLX = Subtarget.hasVLX();
4762  Register SrcReg = MIB.getReg(0);
4764  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
4765  return Expand2AddrUndef(MIB,
4766  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4767  // Extended register without VLX. Use a larger XOR.
4768  SrcReg =
4769  TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4770  MIB->getOperand(0).setReg(SrcReg);
4771  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4772  }
4773  case X86::AVX512_256_SET0:
4774  case X86::AVX512_512_SET0: {
4775  bool HasVLX = Subtarget.hasVLX();
4776  Register SrcReg = MIB.getReg(0);
4778  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
4779  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4780  MIB->getOperand(0).setReg(XReg);
4781  Expand2AddrUndef(MIB,
4782  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4783  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4784  return true;
4785  }
4786  if (MI.getOpcode() == X86::AVX512_256_SET0) {
4787  // No VLX so we must reference a zmm.
4788  unsigned ZReg =
4789  TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4790  MIB->getOperand(0).setReg(ZReg);
4791  }
4792  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4793  }
4794  case X86::V_SETALLONES:
4795  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
4796  case X86::AVX2_SETALLONES:
4797  return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
4798  case X86::AVX1_SETALLONES: {
4799  Register Reg = MIB.getReg(0);
4800  // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
4801  MIB->setDesc(get(X86::VCMPPSYrri));
4803  return true;
4804  }
4805  case X86::AVX512_512_SETALLONES: {
4806  Register Reg = MIB.getReg(0);
4807  MIB->setDesc(get(X86::VPTERNLOGDZrri));
4808  // VPTERNLOGD needs 3 register inputs and an immediate.
4809  // 0xff will return 1s for any input.
4811  .addReg(Reg, RegState::Undef).addImm(0xff);
4812  return true;
4813  }
4814  case X86::AVX512_512_SEXT_MASK_32:
4815  case X86::AVX512_512_SEXT_MASK_64: {
4816  Register Reg = MIB.getReg(0);
4817  Register MaskReg = MIB.getReg(1);
4818  unsigned MaskState = getRegState(MIB->getOperand(1));
4819  unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
4820  X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
4821  MI.RemoveOperand(1);
4822  MIB->setDesc(get(Opc));
4823  // VPTERNLOG needs 3 register inputs and an immediate.
4824  // 0xff will return 1s for any input.
4825  MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
4827  return true;
4828  }
4829  case X86::VMOVAPSZ128rm_NOVLX:
4830  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
4831  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4832  case X86::VMOVUPSZ128rm_NOVLX:
4833  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
4834  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4835  case X86::VMOVAPSZ256rm_NOVLX:
4836  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
4837  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4838  case X86::VMOVUPSZ256rm_NOVLX:
4839  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
4840  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4841  case X86::VMOVAPSZ128mr_NOVLX:
4842  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
4843  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4844  case X86::VMOVUPSZ128mr_NOVLX:
4845  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
4846  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4847  case X86::VMOVAPSZ256mr_NOVLX:
4848  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
4849  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4850  case X86::VMOVUPSZ256mr_NOVLX:
4851  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
4852  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4853  case X86::MOV32ri64: {
4854  Register Reg = MIB.getReg(0);
4855  Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
4856  MI.setDesc(get(X86::MOV32ri));
4857  MIB->getOperand(0).setReg(Reg32);
4858  MIB.addReg(Reg, RegState::ImplicitDefine);
4859  return true;
4860  }
4861 
4862  // KNL does not recognize dependency-breaking idioms for mask registers,
4863  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
4864  // Using %k0 as the undef input register is a performance heuristic based
4865  // on the assumption that %k0 is used less frequently than the other mask
4866  // registers, since it is not usable as a write mask.
4867  // FIXME: A more advanced approach would be to choose the best input mask
4868  // register based on context.
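 // Illustrative sketch (editor's note, not in the original source): with this
 // heuristic, a pseudo such as KSET0W that defines %k2 is rewritten by
 // Expand2AddrKreg below into
 //   %k2 = KXORWrr undef %k0, undef %k0    (i.e. kxorw %k0, %k0, %k2)
 // so the only remaining (false) dependency is on %k0.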
4869  case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
4870  case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
4871  case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
4872  case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
4873  case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
4874  case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
4875  case TargetOpcode::LOAD_STACK_GUARD:
4876  expandLoadStackGuard(MIB, *this);
4877  return true;
4878  case X86::XOR64_FP:
4879  case X86::XOR32_FP:
4880  return expandXorFP(MIB, *this);
4881  case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
4882  case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
4883  case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
4884  case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
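 // Editor's note (illustrative, not in the original source): the ADD*_DB
 // pseudos below are understood to be selected for add/or operations whose
 // operands have no common set bits ("disjoint bits"), which lets earlier
 // passes treat them as ADD (e.g. fold them into LEA); lowering any survivors
 // to a plain OR here is value-preserving precisely because no bit positions
 // overlap.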
4885  case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break;
4886  case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break;
4887  case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break;
4888  case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break;
4889  case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break;
4890  case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break;
4891  case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break;
4892  case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
4893  case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break;
4894  case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break;
4895  case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break;
4896  }
4897  return false;
4898 }
4899 
4900 /// Return true for all instructions that only update
4901 /// the first 32 or 64 bits of the destination register and leave the rest
4902 /// unmodified. This can be used to avoid folding loads if the instructions
4903 /// only update part of the destination register, and the non-updated part is
4904 /// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
4905 /// instructions breaks the partial register dependency and can improve
4906 /// performance. e.g.:
4907 ///
4908 /// movss (%rdi), %xmm0
4909 /// cvtss2sd %xmm0, %xmm0
4910 ///
4911 /// Instead of
4912 /// cvtss2sd (%rdi), %xmm0
4913 ///
4914 /// FIXME: This should be turned into a TSFlags.
4915 ///
4916 static bool hasPartialRegUpdate(unsigned Opcode,
4917  const X86Subtarget &Subtarget,
4918  bool ForLoadFold = false) {
4919  switch (Opcode) {
4920  case X86::CVTSI2SSrr:
4921  case X86::CVTSI2SSrm:
4922  case X86::CVTSI642SSrr:
4923  case X86::CVTSI642SSrm:
4924  case X86::CVTSI2SDrr:
4925  case X86::CVTSI2SDrm:
4926  case X86::CVTSI642SDrr:
4927  case X86::CVTSI642SDrm:
4928  // Load folding won't affect the undef register update since the input is
4929  // a GPR.
4930  return !ForLoadFold;
4931  case X86::CVTSD2SSrr:
4932  case X86::CVTSD2SSrm:
4933  case X86::CVTSS2SDrr:
4934  case X86::CVTSS2SDrm:
4935  case X86::MOVHPDrm:
4936  case X86::MOVHPSrm:
4937  case X86::MOVLPDrm:
4938  case X86::MOVLPSrm:
4939  case X86::RCPSSr:
4940  case X86::RCPSSm:
4941  case X86::RCPSSr_Int:
4942  case X86::RCPSSm_Int:
4943  case X86::ROUNDSDr:
4944  case X86::ROUNDSDm:
4945  case X86::ROUNDSSr:
4946  case X86::ROUNDSSm:
4947  case X86::RSQRTSSr:
4948  case X86::RSQRTSSm:
4949  case X86::RSQRTSSr_Int:
4950  case X86::RSQRTSSm_Int:
4951  case X86::SQRTSSr:
4952  case X86::SQRTSSm:
4953  case X86::SQRTSSr_Int:
4954  case X86::SQRTSSm_Int:
4955  case X86::SQRTSDr:
4956  case X86::SQRTSDm:
4957  case X86::SQRTSDr_Int:
4958  case X86::SQRTSDm_Int:
4959  return true;
4960  // GPR
4961  case X86::POPCNT32rm:
4962  case X86::POPCNT32rr:
4963  case X86::POPCNT64rm:
4964  case X86::POPCNT64rr:
4965  return Subtarget.hasPOPCNTFalseDeps();
4966  case X86::LZCNT32rm:
4967  case X86::LZCNT32rr:
4968  case X86::LZCNT64rm:
4969  case X86::LZCNT64rr:
4970  case X86::TZCNT32rm:
4971  case X86::TZCNT32rr:
4972  case X86::TZCNT64rm:
4973  case X86::TZCNT64rr:
4974  return Subtarget.hasLZCNTFalseDeps();
4975  }
4976 
4977  return false;
4978 }
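 // Illustrative example (editor's note, not in the original source): for an
 // instruction listed above such as
 //   sqrtss %xmm1, %xmm0
 // only the low 32 bits of %xmm0 are written, so the instruction also waits on
 // the previous producer of %xmm0. BreakFalseDeps consults
 // getPartialRegUpdateClearance() below and, when that producer is too recent,
 // breaks the dependency (for XMM destinations this is done with an xorps of
 // the destination register).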
4979 
4980 /// Inform the BreakFalseDeps pass how many idle
4981 /// instructions we would like before a partial register update.
4982 unsigned X86InstrInfo::getPartialRegUpdateClearance(
4983  const MachineInstr &MI, unsigned OpNum,
4984  const TargetRegisterInfo *TRI) const {
4985  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
4986  return 0;
4987 
4988  // If MI is marked as reading Reg, the partial register update is wanted.
4989  const MachineOperand &MO = MI.getOperand(0);
4990  Register Reg = MO.getReg();
4991  if (Reg.isVirtual()) {
4992  if (MO.readsReg() || MI.readsVirtualRegister(Reg))
4993  return 0;
4994  } else {
4995  if (MI.readsRegister(Reg, TRI))
4996  return 0;
4997  }
4998 
4999  // If any instructions in the clearance range are reading Reg, insert a
5000  // dependency breaking instruction, which is inexpensive and is likely to
5001  // be hidden in other instructions' cycles.
5002  return PartialRegUpdateClearance;
5003 }
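 // Editor's note (illustrative, not in the original source): returning 0 tells
 // BreakFalseDeps to leave this instruction alone; otherwise the pass breaks
 // the false dependency whenever the destination's most recent definition is
 // closer than the returned number of instructions.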
5004 
5005 // Return true for any instruction that copies the high bits of the first source
5006 // operand into the unused high bits of the destination operand.
5007 // Also returns true for instructions that have two inputs where one may
5008 // be undef and we want it to use the same register as the other input.
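 // Illustrative example (editor's note, not in the original source): a scalar
 // AVX conversion such as
 //   vcvtsd2ss %xmm1, %xmm2, %xmm0
 // passes the upper bits of %xmm2 through to %xmm0, so even when %xmm2 is
 // undef the instruction depends on whatever last wrote %xmm2; reporting that
 // operand here lets BreakFalseDeps pick a less recently written register for
 // it.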
5009 static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
5010  bool ForLoadFold = false) {
5011  // Set the OpNum parameter to the first source operand.
5012  switch (Opcode) {
5013  case X86::MMX_PUNPCKHBWirr:
5014  case X86::MMX_PUNPCKHWDirr:
5015  case X86::MMX_PUNPCKHDQirr:
5016  case X86::MMX_PUNPCKLBWirr:
5017  case X86::MMX_PUNPCKLWDirr:
5018  case X86::MMX_PUNPCKLDQirr:
5019  case X86::MOVHLPSrr:
5020  case X86::PACKSSWBrr:
5021  case X86::PACKUSWBrr:
5022  case X86::PACKSSDWrr:
5023  case X86::PACKUSDWrr:
5024  case X86::PUNPCKHBWrr:
5025  case X86::PUNPCKLBWrr:
5026  case X86::PUNPCKHWDrr:
5027  case X86::PUNPCKLWDrr:
5028  case X86::PUNPCKHDQrr:
5029  case X86::PUNPCKLDQrr:
5030  case X86::PUNPCKHQDQrr:
5031  case X86::PUNPCKLQDQrr:
5032  case X86::SHUFPDrri:
5033  case X86::SHUFPSrri:
5034  // These instructions are sometimes used with an undef first or second
5035  // source. Return true here so BreakFalseDeps will assign this source to the
5036  // same register as the first source to avoid a false dependency.
5037  // Operand 1 of these instructions is tied, so they're separate from their
5038  // VEX counterparts.
5039  return OpNum == 2 && !ForLoadFold;
5040 
5041  case X86::VMOVLHPSrr:
5042  case X86::VMOVLHPSZrr:
5043  case X86::VPACKSSWBrr:
5044  case X86::VPACKUSWBrr:
5045  case X86::VPACKSSDWrr:
5046  case X86::VPACKUSDWrr:
5047  case X86::VPACKSSWBZ128rr:
5048  case X86::VPACKUSWBZ128rr:
5049  case X86::VPACKSSDWZ128rr:
5050  case X86::VPACKUSDWZ128rr:
5051  case X86::VPERM2F128rr:
5052  case X86::VPERM2I128rr:
5053  case X86::VSHUFF32X4Z256rri:
5054  case X86::VSHUFF32X4Zrri:
5055  case X86::VSHUFF64X2Z256rri:
5056  case X86::VSHUFF64X2Zrri:
5057  case X86::VSHUFI32X4Z256rri:
5058  case X86::VSHUFI32X4Zrri:
5059  case X86::VSHUFI64X2Z256rri:
5060  case X86::VSHUFI64X2Zrri:
5061  case X86::VPUNPCKHBWrr:
5062  case X86::VPUNPCKLBWrr:
5063  case X86::VPUNPCKHBWYrr:
5064  case X86::VPUNPCKLBWYrr:
5065  case X86::VPUNPCKHBWZ128rr:
5066  case X86::VPUNPCKLBWZ128rr:
5067  case X86::VPUNPCKHBWZ256rr:
5068  case X86::VPUNPCKLBWZ256rr:
5069  case X86::VPUNPCKHBWZrr:
5070  case X86::VPUNPCKLBWZrr:
5071  case X86::VPUNPCKHWDrr:
5072  case X86::VPUNPCKLWDrr:
5073  case X86::VPUNPCKHWDYrr:
5074  case X86::VPUNPCKLWDYrr:
5075  case X86::VPUNPCKHWDZ128rr:
5076  case X86::VPUNPCKLWDZ128rr:
5077  case X86::VPUNPCKHWDZ256rr:
5078  case X86::VPUNPCKLWDZ256rr:
5079  case X86::VPUNPCKHWDZrr:
5080  case X86::VPUNPCKLWDZrr:
5081  case X86::VPUNPCKHDQrr:
5082  case X86::VPUNPCKLDQrr:
5083  case X86::VPUNPCKHDQYrr:
5084  case X86::VPUNPCKLDQYrr:
5085  case X86::VPUNPCKHDQZ128rr:
5086  case X86::VPUNPCKLDQZ128rr:
5087  case X86::VPUNPCKHDQZ256rr:
5088  case X86::VPUNPCKLDQZ256rr:
5089  case X86::VPUNPCKHDQZrr:
5090  case X86::VPUNPCKLDQZrr:
5091  case X86::VPUNPCKHQDQrr:
5092  case X86::VPUNPCKLQDQrr:
5093  case X86::VPUNPCKHQDQYrr:
5094  case X86::VPUNPCKLQDQYrr:
5095  case X86::VPUNPCKHQDQZ128rr:
5096  case X86::VPUNPCKLQDQZ128rr:
5097  case X86::VPUNPCKHQDQZ256rr:
5098  case X86::VPUNPCKLQDQZ256rr:
5099  case X86::VPUNPCKHQDQZrr:
5100  case X86::VPUNPCKLQDQZrr:
5101  // These instructions are sometimes used with an undef first or second
5102  // source. Return true here so BreakFalseDeps will assign this source to the
5103  // same register as the first source to avoid a false dependency.
5104  return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
5105 
5106  case X86::VCVTSI2SSrr:
5107  case X86::VCVTSI2SSrm:
5108  case X86::VCVTSI2SSrr_Int:
5109  case X86::VCVTSI2SSrm_Int:
5110  case X86::VCVTSI642SSrr:
5111  case X86::VCVTSI642SSrm:
5112  case X86::VCVTSI642SSrr_Int:
5113  case X86::VCVTSI642SSrm_Int:
5114  case X86::VCVTSI2SDrr:
5115  case X86::VCVTSI2SDrm:
5116  case X86::VCVTSI2SDrr_Int:
5117  case X86::VCVTSI2SDrm_Int:
5118  case X86::VCVTSI642SDrr:
5119  case X86::VCVTSI642SDrm:
5120  case X86::VCVTSI642SDrr_Int:
5121  case X86::VCVTSI642SDrm_Int:
5122  // AVX-512
5123  case X86::VCVTSI2SSZrr:
5124  case X86::VCVTSI2SSZrm:
5125  case X86::VCVTSI2SSZrr_Int:
5126  case X86::VCVTSI2SSZrrb_Int:
5127  case X86::VCVTSI2SSZrm_Int:
5128  case X86::VCVTSI642SSZrr:
5129  case X86::VCVTSI642SSZrm:
5130  case X86::VCVTSI642SSZrr_Int:
5131  case X86::VCVTSI642SSZrrb_Int:
5132  case X86::VCVTSI642SSZrm_Int:
5133  case X86::VCVTSI2SDZrr:
5134  case X86::VCVTSI2SDZrm:
5135  case X86::VCVTSI2SDZrr_Int:
5136  case X86::VCVTSI2SDZrm_Int:
5137  case X86::VCVTSI642SDZrr:
5138  case X86::VCVTSI642SDZrm:
5139  case X86::VCVTSI642SDZrr_Int:
5140  case X86::VCVTSI642SDZrrb_Int:
5141  case X86::VCVTSI642SDZrm_Int:
5142  case X86::VCVTUSI2SSZrr:
5143  case X86::VCVTUSI2SSZrm:
5144  case X86::VCVTUSI2SSZrr_Int:
5145  case X86::VCVTUSI2SSZrrb_Int:
5146  case X86::VCVTUSI2SSZrm_Int:
5147  case X86::VCVTUSI642SSZrr:
5148  case X86::VCVTUSI642SSZrm:
5149  case X86::VCVTUSI642SSZrr_Int:
5150  case X86::VCVTUSI642SSZrrb_Int:
5151  case X86::VCVTUSI642SSZrm_Int:
5152  case X86::VCVTUSI2SDZrr:
5153  case X86::VCVTUSI2SDZrm:
5154  case X86::VCVTUSI2SDZrr_Int:
5155  case X86::VCVTUSI2SDZrm_Int:
5156  case X86::VCVTUSI642SDZrr:
5157  case X86::VCVTUSI642SDZrm:
5158  case X86::VCVTUSI642SDZrr_Int:
5159  case X86::VCVTUSI642SDZrrb_Int:
5160  case X86::VCVTUSI642SDZrm_Int:
5161  // Load folding won't affect the undef register update since the input is
5162  // a GPR.
5163  return OpNum == 1 && !ForLoadFold;
5164  case X86::VCVTSD2SSrr:
5165  case X86::VCVTSD2SSrm:
5166  case X86::VCVTSD2SSrr_Int:
5167  case X86::VCVTSD2SSrm_Int:
5168  case X86::VCVTSS2SDrr:
5169  case X86::VCVTSS2SDrm:
5170  case X86::VCVTSS2SDrr_Int:
5171  case X86::VCVTSS2SDrm_Int:
5172  case X86::VRCPSSr:
5173  case X86::VRCPSSr_Int:
5174  case X86::VRCPSSm:
5175  case X86::VRCPSSm_Int:
5176  case X86::VROUNDSDr:
5177  case X86::VROUNDSDm:
5178  case X86::VROUNDSDr_Int:
5179  case X86::VROUNDSDm_Int:
5180  case X86::VROUNDSSr:
5181  case X86::VROUNDSSm:
5182  case X86::VROUNDSSr_Int:
5183  case X86::VROUNDSSm_Int:
5184  case X86::VRSQRTSSr:
5185  case X86::VRSQRTSSr_Int:
5186  case X86::VRSQRTSSm:
5187  case X86::VRSQRTSSm_Int:
5188  case X86::VSQRTSSr:
5189  case X86::VSQRTSSr_Int:
5190  case X86::VSQRTSSm:
5191  case X86::VSQRTSSm_Int:
5192  case X86::VSQRTSDr:
5193  case X86::VSQRTSDr_Int:
5194  case X86::VSQRTSDm:
5195  case X86::VSQRTSDm_Int:
5196  // AVX-512
5197  case X86::VCVTSD2SSZrr:
5198  case X86::VCVTSD2SSZrr_Int:
5199  case X86::VCVTSD2SSZrrb_Int:
5200  case X86::VCVTSD2SSZrm:
5201  case X86::VCVTSD2SSZrm_Int:
5202  case X86::VCVTSS2SDZrr:
5203  case X86::VCVTSS2SDZrr_Int:
5204  case X86::VCVTSS2SDZrrb_Int:
5205  case X86::VCVTSS2SDZrm:
5206  case X86::VCVTSS2SDZrm_Int:
5207  case X86::VGETEXPSDZr:
5208  case X86::VGETEXPSDZrb:
5209  case X86::VGETEXPSDZm:
5210  case X86::VGETEXPSSZr:
5211  case X86::VGETEXPSSZrb:
5212  case X86::VGETEXPSSZm:
5213  case X86::VGETMANTSDZrri:
5214  case X86::VGETMANTSDZrrib:
5215  case X86::VGETMANTSDZrmi:
5216  case X86::VGETMANTSSZrri:
5217  case X86::VGETMANTSSZrrib:
5218  case X86::VGETMANTSSZrmi:
5219  case X86::VRNDSCALESDZr:
5220  case X86::VRNDSCALESDZr_Int:
5221  case X86::VRNDSCALESDZrb_Int:
5222  case X86::VRNDSCALESDZm:
5223  case X86::VRNDSCALESDZm_Int:
5224  case X86::VRNDSCALESSZr:
5225  case X86::VRNDSCALESSZr_Int:
5226  case X86::VRNDSCALESSZrb_Int:
5227  case X86::VRNDSCALESSZm:
5228  case X86::VRNDSCALESSZm_Int:
5229  case X86::VRCP14SDZrr:
5230  case X86::VRCP14SDZrm:
5231  case X86::VRCP14SSZrr:
5232  case X86::VRCP14SSZrm:
5233  case X86::VRCP28SDZr:
5234  case X86::VRCP28SDZrb:
5235  case X86::VRCP28SDZm:
5236  case X86::VRCP28SSZr:
5237  case X86::VRCP28SSZrb:
5238  case X86::VRCP28SSZm:
5239  case X86::VREDUCESSZrmi:
5240  case X86::VREDUCESSZrri:
5241  case X86::VREDUCESSZrrib:
5242  case X86::VRSQRT14SDZrr:
5243  case X86::VRSQRT14SDZrm:
5244  case X86::VRSQRT14SSZrr:
5245  case X86::VRSQRT14SSZrm:
5246  case X86::VRSQRT28SDZr:
5247  case X86::VRSQRT28SDZrb:
5248  case X86::VRSQRT28SDZm:
5249  case X86::VRSQRT28SSZr:
5250  case X86::VRSQRT28SSZrb:
5251  case X86::VRSQRT28SSZm:
5252  case X86::VSQRTSSZr:
5253  case X86::VSQRTSSZr_Int:
5254  case X86::VSQRTSSZrb_Int:
5255  case X86::VSQRTSSZm:
5256  case X86::VSQRTSSZm_Int:
5257  case X86::VSQRTSDZr:
5258  case X86::VSQRTSDZr_Int:
5259  case X86::VSQRTSDZrb_Int:
5260  case X86::VSQRTSDZm:
5261  case X86::VSQRTSDZm_Int:
5262  return OpNum == 1;
5263  case X86::VMOVSSZrrk:
5264  case X86::VMOVSDZrrk:
5265  return OpNum == 3 && !ForLoadFold;
5266  case X86::VMOVSSZrrkz:
5267  case X86::VMOVSDZrrkz:
5268  return OpNum == 2 && !ForLoadFold;
5269  }
5270 
5271  return false;
5272 }
5273 
5274 /// Inform the BreakFalseDeps pass how many idle instructions we would like
5275 /// before certain undef register reads.
5276 ///
5277 /// This catches the VCVTSI2SD family of instructions:
5278 ///
5279 /// vcvtsi2sdq %rax, undef %xmm0, %xmm14
5280 ///
5281 /// We should be careful *not* to catch VXOR idioms which are presumably
5282 /// handled specially in the pipeline:
5283 ///
5284 /// vxorps undef %xmm1, undef %xmm1, %xmm1
5285 ///
5286 /// Like getPartialRegUpdateClearance, this makes a strong assumption that the
5287 /// high bits that are passed-through are not live.
5288 unsigned
5289 X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
5290  const TargetRegisterInfo *TRI) const {
5291  const MachineOperand &MO = MI.getOperand(OpNum);
5292  if (Register::isPhysicalRegister(MO.getReg()) &&
5293      hasUndefRegUpdate(MI.getOpcode(), OpNum))
5294  return UndefRegClearance;
5295 
5296  return 0;
5297 }
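 // Editor's sketch of the interaction (not in the original source): for an
 // operand reported by hasUndefRegUpdate(), the BreakFalseDeps pass either
 // renames the undef read to a register with more clearance or, failing that,
 // inserts a dependency-breaking instruction via breakPartialRegDependency(),
 // so e.g.
 //   vcvtsi2sdq %rax, undef %xmm0, %xmm14
 // no longer stalls on an older, still in-flight write of %xmm0.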
5298 
5300