1 //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the X86 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "X86InstrInfo.h"
14 #include "X86.h"
15 #include "X86InstrBuilder.h"
16 #include "X86InstrFoldTables.h"
17 #include "X86MachineFunctionInfo.h"
18 #include "X86Subtarget.h"
19 #include "X86TargetMachine.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/Sequence.h"
33 #include "llvm/CodeGen/StackMaps.h"
35 #include "llvm/IR/DerivedTypes.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/MC/MCAsmInfo.h"
39 #include "llvm/MC/MCExpr.h"
40 #include "llvm/MC/MCInst.h"
42 #include "llvm/Support/Debug.h"
46 
47 using namespace llvm;
48 
49 #define DEBUG_TYPE "x86-instr-info"
50 
51 #define GET_INSTRINFO_CTOR_DTOR
52 #include "X86GenInstrInfo.inc"
53 
54 static cl::opt<bool>
55  NoFusing("disable-spill-fusing",
56  cl::desc("Disable fusing of spill code into instructions"),
57  cl::Hidden);
58 static cl::opt<bool>
59 PrintFailedFusing("print-failed-fuse-candidates",
60  cl::desc("Print instructions that the allocator wants to"
61  " fuse, but the X86 backend currently can't"),
62  cl::Hidden);
63 static cl::opt<bool>
64 ReMatPICStubLoad("remat-pic-stub-load",
65  cl::desc("Re-materialize load from stub in PIC mode"),
66  cl::init(false), cl::Hidden);
67 static cl::opt<unsigned>
68 PartialRegUpdateClearance("partial-reg-update-clearance",
69  cl::desc("Clearance between two register writes "
70  "for inserting XOR to avoid partial "
71  "register update"),
72  cl::init(64), cl::Hidden);
73 static cl::opt<unsigned>
74 UndefRegClearance("undef-reg-clearance",
75  cl::desc("How many idle instructions we would like before "
76  "certain undef register reads"),
77  cl::init(128), cl::Hidden);
78 
79 
80 // Pin the vtable to this file.
81 void X86InstrInfo::anchor() {}
82 
83 X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
84  : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
85  : X86::ADJCALLSTACKDOWN32),
86  (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
87  : X86::ADJCALLSTACKUP32),
88  X86::CATCHRET,
89  (STI.is64Bit() ? X86::RET64 : X86::RET32)),
90  Subtarget(STI), RI(STI.getTargetTriple()) {
91 }
92 
93 bool
94 X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
95  Register &SrcReg, Register &DstReg,
96  unsigned &SubIdx) const {
97  switch (MI.getOpcode()) {
98  default: break;
99  case X86::MOVSX16rr8:
100  case X86::MOVZX16rr8:
101  case X86::MOVSX32rr8:
102  case X86::MOVZX32rr8:
103  case X86::MOVSX64rr8:
104  if (!Subtarget.is64Bit())
105  // It's not always legal to reference the low 8-bit of the larger
106  // register in 32-bit mode.
107  return false;
109  case X86::MOVSX32rr16:
110  case X86::MOVZX32rr16:
111  case X86::MOVSX64rr16:
112  case X86::MOVSX64rr32: {
113  if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
114  // Be conservative.
115  return false;
116  SrcReg = MI.getOperand(1).getReg();
117  DstReg = MI.getOperand(0).getReg();
118  switch (MI.getOpcode()) {
119  default: llvm_unreachable("Unreachable!");
120  case X86::MOVSX16rr8:
121  case X86::MOVZX16rr8:
122  case X86::MOVSX32rr8:
123  case X86::MOVZX32rr8:
124  case X86::MOVSX64rr8:
125  SubIdx = X86::sub_8bit;
126  break;
127  case X86::MOVSX32rr16:
128  case X86::MOVZX32rr16:
129  case X86::MOVSX64rr16:
130  SubIdx = X86::sub_16bit;
131  break;
132  case X86::MOVSX64rr32:
133  SubIdx = X86::sub_32bit;
134  break;
135  }
136  return true;
137  }
138  }
139  return false;
140 }
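// For example, given "%dst:gr64 = MOVSX64rr32 %src:gr32", the hook above
// reports SrcReg = %src, DstReg = %dst and SubIdx = X86::sub_32bit, i.e. the
// source value lives in the low 32 bits of the destination, which lets the
// register coalescer treat the extension much like a sub-register copy.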
141 
142 bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
143  if (MI.mayLoad() || MI.mayStore())
144  return false;
145 
146  // Some target-independent operations that trivially lower to data-invariant
147  // instructions.
148  if (MI.isCopyLike() || MI.isInsertSubreg())
149  return true;
150 
151  unsigned Opcode = MI.getOpcode();
152  using namespace X86;
153  // On x86 it is believed that imul is constant time w.r.t. its input data.
154  // However, it sets flags and is perhaps the most surprisingly constant-
155  // time operation, so we call it out here separately.
156  if (isIMUL(Opcode))
157  return true;
158  // Bit scanning and counting instructions that are somewhat surprisingly
159  // constant time as they scan across bits and do other fairly complex
160  // operations like popcnt, but are believed to be constant time on x86.
161  // However, these set flags.
162  if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
163  isTZCNT(Opcode))
164  return true;
165  // Bit manipulation instructions are effectively combinations of basic
166  // arithmetic ops, and should still execute in constant time. These also
167  // set flags.
168  if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
169  isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
170  isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
171  isTZMSK(Opcode))
172  return true;
173  // Bit extracting and clearing instructions should execute in constant time,
174  // and set flags.
175  if (isBEXTR(Opcode) || isBZHI(Opcode))
176  return true;
177  // Shift and rotate.
178  if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
179  isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
180  return true;
181  // Basic arithmetic is constant time on the input but does set flags.
182  if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
183  isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
184  return true;
185  // Arithmetic with just 32-bit and 64-bit variants and no immediates.
186  if (isADCX(Opcode) || isADOX(Opcode) || isANDN(Opcode))
187  return true;
188  // Unary arithmetic operations.
189  if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
190  return true;
191  // Unlike other arithmetic, NOT doesn't set EFLAGS.
192  if (isNOT(Opcode))
193  return true;
194  // Various move instructions used to zero or sign extend things. Note that we
195  // intentionally don't support the _NOREX variants as we can't handle that
196  // register constraint anyways.
197  if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
198  return true;
199  // Arithmetic instructions that are both constant time and don't set flags.
200  if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
201  return true;
202  // LEA doesn't actually access memory, and its arithmetic is constant time.
203  if (isLEA(Opcode))
204  return true;
205  // By default, assume that the instruction is not data invariant.
206  return false;
207 }
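// As a concrete illustration: IMUL64rr is accepted by the isIMUL() check
// above and treated as data invariant, whereas DIV/IDIV opcodes (whose
// latency can depend on the operand values) match none of the checks and
// therefore hit the conservative "return false".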
208 
209 bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
210  switch (MI.getOpcode()) {
211  default:
212  // By default, assume that the load will immediately leak.
213  return false;
214 
215  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
216  // However, they set flags and are perhaps the most surprisingly constant
217  // time operations so we call them out here separately.
218  case X86::IMUL16rm:
219  case X86::IMUL16rmi8:
220  case X86::IMUL16rmi:
221  case X86::IMUL32rm:
222  case X86::IMUL32rmi8:
223  case X86::IMUL32rmi:
224  case X86::IMUL64rm:
225  case X86::IMUL64rmi32:
226  case X86::IMUL64rmi8:
227 
228  // Bit scanning and counting instructions that are somewhat surprisingly
229  // constant time as they scan across bits and do other fairly complex
230  // operations like popcnt, but are believed to be constant time on x86.
231  // However, these set flags.
232  case X86::BSF16rm:
233  case X86::BSF32rm:
234  case X86::BSF64rm:
235  case X86::BSR16rm:
236  case X86::BSR32rm:
237  case X86::BSR64rm:
238  case X86::LZCNT16rm:
239  case X86::LZCNT32rm:
240  case X86::LZCNT64rm:
241  case X86::POPCNT16rm:
242  case X86::POPCNT32rm:
243  case X86::POPCNT64rm:
244  case X86::TZCNT16rm:
245  case X86::TZCNT32rm:
246  case X86::TZCNT64rm:
247 
248  // Bit manipulation instructions are effectively combinations of basic
249  // arithmetic ops, and should still execute in constant time. These also
250  // set flags.
251  case X86::BLCFILL32rm:
252  case X86::BLCFILL64rm:
253  case X86::BLCI32rm:
254  case X86::BLCI64rm:
255  case X86::BLCIC32rm:
256  case X86::BLCIC64rm:
257  case X86::BLCMSK32rm:
258  case X86::BLCMSK64rm:
259  case X86::BLCS32rm:
260  case X86::BLCS64rm:
261  case X86::BLSFILL32rm:
262  case X86::BLSFILL64rm:
263  case X86::BLSI32rm:
264  case X86::BLSI64rm:
265  case X86::BLSIC32rm:
266  case X86::BLSIC64rm:
267  case X86::BLSMSK32rm:
268  case X86::BLSMSK64rm:
269  case X86::BLSR32rm:
270  case X86::BLSR64rm:
271  case X86::TZMSK32rm:
272  case X86::TZMSK64rm:
273 
274  // Bit extracting and clearing instructions should execute in constant time,
275  // and set flags.
276  case X86::BEXTR32rm:
277  case X86::BEXTR64rm:
278  case X86::BEXTRI32mi:
279  case X86::BEXTRI64mi:
280  case X86::BZHI32rm:
281  case X86::BZHI64rm:
282 
283  // Basic arithmetic is constant time on the input but does set flags.
284  case X86::ADC8rm:
285  case X86::ADC16rm:
286  case X86::ADC32rm:
287  case X86::ADC64rm:
288  case X86::ADCX32rm:
289  case X86::ADCX64rm:
290  case X86::ADD8rm:
291  case X86::ADD16rm:
292  case X86::ADD32rm:
293  case X86::ADD64rm:
294  case X86::ADOX32rm:
295  case X86::ADOX64rm:
296  case X86::AND8rm:
297  case X86::AND16rm:
298  case X86::AND32rm:
299  case X86::AND64rm:
300  case X86::ANDN32rm:
301  case X86::ANDN64rm:
302  case X86::OR8rm:
303  case X86::OR16rm:
304  case X86::OR32rm:
305  case X86::OR64rm:
306  case X86::SBB8rm:
307  case X86::SBB16rm:
308  case X86::SBB32rm:
309  case X86::SBB64rm:
310  case X86::SUB8rm:
311  case X86::SUB16rm:
312  case X86::SUB32rm:
313  case X86::SUB64rm:
314  case X86::XOR8rm:
315  case X86::XOR16rm:
316  case X86::XOR32rm:
317  case X86::XOR64rm:
318 
319  // Integer multiply w/o affecting flags is still believed to be constant
320  // time on x86. Called out separately as this is among the most surprising
321  // instructions to exhibit that behavior.
322  case X86::MULX32rm:
323  case X86::MULX64rm:
324 
325  // Arithmetic instructions that are both constant time and don't set flags.
326  case X86::RORX32mi:
327  case X86::RORX64mi:
328  case X86::SARX32rm:
329  case X86::SARX64rm:
330  case X86::SHLX32rm:
331  case X86::SHLX64rm:
332  case X86::SHRX32rm:
333  case X86::SHRX64rm:
334 
335  // Conversions are believed to be constant time and don't set flags.
336  case X86::CVTTSD2SI64rm:
337  case X86::VCVTTSD2SI64rm:
338  case X86::VCVTTSD2SI64Zrm:
339  case X86::CVTTSD2SIrm:
340  case X86::VCVTTSD2SIrm:
341  case X86::VCVTTSD2SIZrm:
342  case X86::CVTTSS2SI64rm:
343  case X86::VCVTTSS2SI64rm:
344  case X86::VCVTTSS2SI64Zrm:
345  case X86::CVTTSS2SIrm:
346  case X86::VCVTTSS2SIrm:
347  case X86::VCVTTSS2SIZrm:
348  case X86::CVTSI2SDrm:
349  case X86::VCVTSI2SDrm:
350  case X86::VCVTSI2SDZrm:
351  case X86::CVTSI2SSrm:
352  case X86::VCVTSI2SSrm:
353  case X86::VCVTSI2SSZrm:
354  case X86::CVTSI642SDrm:
355  case X86::VCVTSI642SDrm:
356  case X86::VCVTSI642SDZrm:
357  case X86::CVTSI642SSrm:
358  case X86::VCVTSI642SSrm:
359  case X86::VCVTSI642SSZrm:
360  case X86::CVTSS2SDrm:
361  case X86::VCVTSS2SDrm:
362  case X86::VCVTSS2SDZrm:
363  case X86::CVTSD2SSrm:
364  case X86::VCVTSD2SSrm:
365  case X86::VCVTSD2SSZrm:
366  // AVX512 added unsigned integer conversions.
367  case X86::VCVTTSD2USI64Zrm:
368  case X86::VCVTTSD2USIZrm:
369  case X86::VCVTTSS2USI64Zrm:
370  case X86::VCVTTSS2USIZrm:
371  case X86::VCVTUSI2SDZrm:
372  case X86::VCVTUSI642SDZrm:
373  case X86::VCVTUSI2SSZrm:
374  case X86::VCVTUSI642SSZrm:
375 
376  // Loads to register don't set flags.
377  case X86::MOV8rm:
378  case X86::MOV8rm_NOREX:
379  case X86::MOV16rm:
380  case X86::MOV32rm:
381  case X86::MOV64rm:
382  case X86::MOVSX16rm8:
383  case X86::MOVSX32rm16:
384  case X86::MOVSX32rm8:
385  case X86::MOVSX32rm8_NOREX:
386  case X86::MOVSX64rm16:
387  case X86::MOVSX64rm32:
388  case X86::MOVSX64rm8:
389  case X86::MOVZX16rm8:
390  case X86::MOVZX32rm16:
391  case X86::MOVZX32rm8:
392  case X86::MOVZX32rm8_NOREX:
393  case X86::MOVZX64rm16:
394  case X86::MOVZX64rm8:
395  return true;
396  }
397 }
398 
399 int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
400  const MachineFunction *MF = MI.getParent()->getParent();
401  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
402 
403  if (isFrameInstr(MI)) {
404  int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
405  SPAdj -= getFrameAdjustment(MI);
406  if (!isFrameSetup(MI))
407  SPAdj = -SPAdj;
408  return SPAdj;
409  }
410 
411  // To know whether a call adjusts the stack, we need information
412  // that is bound to the following ADJCALLSTACKUP pseudo.
413  // Look for the next ADJCALLSTACKUP that follows the call.
414  if (MI.isCall()) {
415  const MachineBasicBlock *MBB = MI.getParent();
416  auto I = ++MachineBasicBlock::const_iterator(MI);
417  for (auto E = MBB->end(); I != E; ++I) {
418  if (I->getOpcode() == getCallFrameDestroyOpcode() ||
419  I->isCall())
420  break;
421  }
422 
423  // If we could not find a frame destroy opcode, then it has already
424  // been simplified, so we don't care.
425  if (I->getOpcode() != getCallFrameDestroyOpcode())
426  return 0;
427 
428  return -(I->getOperand(1).getImm());
429  }
430 
431  // Currently handle only PUSHes we can reasonably expect to see
432  // in call sequences
433  switch (MI.getOpcode()) {
434  default:
435  return 0;
436  case X86::PUSH32i8:
437  case X86::PUSH32r:
438  case X86::PUSH32rmm:
439  case X86::PUSH32rmr:
440  case X86::PUSHi32:
441  return 4;
442  case X86::PUSH64i8:
443  case X86::PUSH64r:
444  case X86::PUSH64rmm:
445  case X86::PUSH64rmr:
446  case X86::PUSH64i32:
447  return 8;
448  }
449 }
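// Rough example (illustrative only, assuming a 16-byte-aligned call frame and
// the operand layout of the ADJCALLSTACK pseudos): for the sequence
//   ADJCALLSTACKDOWN64 16, 0, 0
//   PUSH64r $rdi
//   CALL64pcrel32 @callee
//   ADJCALLSTACKUP64 16, 0
// getSPAdjust returns 16 for the frame-setup pseudo, 8 for the PUSH64r,
// -16 for the frame-destroy pseudo, and, for the call itself, minus the second
// operand of the following ADJCALLSTACKUP64 (0 here).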
450 
451 /// Return true and the FrameIndex if the specified
452 /// operand and the following operands form a reference to the stack frame.
453 bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
454  int &FrameIndex) const {
455  if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
456  MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
457  MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
458  MI.getOperand(Op + X86::AddrDisp).isImm() &&
459  MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
460  MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
461  MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
462  FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
463  return true;
464  }
465  return false;
466 }
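// For reference, an X86 memory reference is always five consecutive operands
// starting at index Op: base, scale, index, displacement and segment
// (X86::AddrBaseReg .. X86::AddrSegmentReg). A plain spill-slot access is
// typically printed in MIR as "%stack.0, 1, $noreg, 0, $noreg", which is
// exactly the pattern accepted above: frame-index base, scale 1, no index
// register and zero displacement.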
467 
468 static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
469  switch (Opcode) {
470  default:
471  return false;
472  case X86::MOV8rm:
473  case X86::KMOVBkm:
474  MemBytes = 1;
475  return true;
476  case X86::MOV16rm:
477  case X86::KMOVWkm:
478  case X86::VMOVSHZrm:
479  case X86::VMOVSHZrm_alt:
480  MemBytes = 2;
481  return true;
482  case X86::MOV32rm:
483  case X86::MOVSSrm:
484  case X86::MOVSSrm_alt:
485  case X86::VMOVSSrm:
486  case X86::VMOVSSrm_alt:
487  case X86::VMOVSSZrm:
488  case X86::VMOVSSZrm_alt:
489  case X86::KMOVDkm:
490  MemBytes = 4;
491  return true;
492  case X86::MOV64rm:
493  case X86::LD_Fp64m:
494  case X86::MOVSDrm:
495  case X86::MOVSDrm_alt:
496  case X86::VMOVSDrm:
497  case X86::VMOVSDrm_alt:
498  case X86::VMOVSDZrm:
499  case X86::VMOVSDZrm_alt:
500  case X86::MMX_MOVD64rm:
501  case X86::MMX_MOVQ64rm:
502  case X86::KMOVQkm:
503  MemBytes = 8;
504  return true;
505  case X86::MOVAPSrm:
506  case X86::MOVUPSrm:
507  case X86::MOVAPDrm:
508  case X86::MOVUPDrm:
509  case X86::MOVDQArm:
510  case X86::MOVDQUrm:
511  case X86::VMOVAPSrm:
512  case X86::VMOVUPSrm:
513  case X86::VMOVAPDrm:
514  case X86::VMOVUPDrm:
515  case X86::VMOVDQArm:
516  case X86::VMOVDQUrm:
517  case X86::VMOVAPSZ128rm:
518  case X86::VMOVUPSZ128rm:
519  case X86::VMOVAPSZ128rm_NOVLX:
520  case X86::VMOVUPSZ128rm_NOVLX:
521  case X86::VMOVAPDZ128rm:
522  case X86::VMOVUPDZ128rm:
523  case X86::VMOVDQU8Z128rm:
524  case X86::VMOVDQU16Z128rm:
525  case X86::VMOVDQA32Z128rm:
526  case X86::VMOVDQU32Z128rm:
527  case X86::VMOVDQA64Z128rm:
528  case X86::VMOVDQU64Z128rm:
529  MemBytes = 16;
530  return true;
531  case X86::VMOVAPSYrm:
532  case X86::VMOVUPSYrm:
533  case X86::VMOVAPDYrm:
534  case X86::VMOVUPDYrm:
535  case X86::VMOVDQAYrm:
536  case X86::VMOVDQUYrm:
537  case X86::VMOVAPSZ256rm:
538  case X86::VMOVUPSZ256rm:
539  case X86::VMOVAPSZ256rm_NOVLX:
540  case X86::VMOVUPSZ256rm_NOVLX:
541  case X86::VMOVAPDZ256rm:
542  case X86::VMOVUPDZ256rm:
543  case X86::VMOVDQU8Z256rm:
544  case X86::VMOVDQU16Z256rm:
545  case X86::VMOVDQA32Z256rm:
546  case X86::VMOVDQU32Z256rm:
547  case X86::VMOVDQA64Z256rm:
548  case X86::VMOVDQU64Z256rm:
549  MemBytes = 32;
550  return true;
551  case X86::VMOVAPSZrm:
552  case X86::VMOVUPSZrm:
553  case X86::VMOVAPDZrm:
554  case X86::VMOVUPDZrm:
555  case X86::VMOVDQU8Zrm:
556  case X86::VMOVDQU16Zrm:
557  case X86::VMOVDQA32Zrm:
558  case X86::VMOVDQU32Zrm:
559  case X86::VMOVDQA64Zrm:
560  case X86::VMOVDQU64Zrm:
561  MemBytes = 64;
562  return true;
563  }
564 }
565 
566 static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
567  switch (Opcode) {
568  default:
569  return false;
570  case X86::MOV8mr:
571  case X86::KMOVBmk:
572  MemBytes = 1;
573  return true;
574  case X86::MOV16mr:
575  case X86::KMOVWmk:
576  case X86::VMOVSHZmr:
577  MemBytes = 2;
578  return true;
579  case X86::MOV32mr:
580  case X86::MOVSSmr:
581  case X86::VMOVSSmr:
582  case X86::VMOVSSZmr:
583  case X86::KMOVDmk:
584  MemBytes = 4;
585  return true;
586  case X86::MOV64mr:
587  case X86::ST_FpP64m:
588  case X86::MOVSDmr:
589  case X86::VMOVSDmr:
590  case X86::VMOVSDZmr:
591  case X86::MMX_MOVD64mr:
592  case X86::MMX_MOVQ64mr:
593  case X86::MMX_MOVNTQmr:
594  case X86::KMOVQmk:
595  MemBytes = 8;
596  return true;
597  case X86::MOVAPSmr:
598  case X86::MOVUPSmr:
599  case X86::MOVAPDmr:
600  case X86::MOVUPDmr:
601  case X86::MOVDQAmr:
602  case X86::MOVDQUmr:
603  case X86::VMOVAPSmr:
604  case X86::VMOVUPSmr:
605  case X86::VMOVAPDmr:
606  case X86::VMOVUPDmr:
607  case X86::VMOVDQAmr:
608  case X86::VMOVDQUmr:
609  case X86::VMOVUPSZ128mr:
610  case X86::VMOVAPSZ128mr:
611  case X86::VMOVUPSZ128mr_NOVLX:
612  case X86::VMOVAPSZ128mr_NOVLX:
613  case X86::VMOVUPDZ128mr:
614  case X86::VMOVAPDZ128mr:
615  case X86::VMOVDQA32Z128mr:
616  case X86::VMOVDQU32Z128mr:
617  case X86::VMOVDQA64Z128mr:
618  case X86::VMOVDQU64Z128mr:
619  case X86::VMOVDQU8Z128mr:
620  case X86::VMOVDQU16Z128mr:
621  MemBytes = 16;
622  return true;
623  case X86::VMOVUPSYmr:
624  case X86::VMOVAPSYmr:
625  case X86::VMOVUPDYmr:
626  case X86::VMOVAPDYmr:
627  case X86::VMOVDQUYmr:
628  case X86::VMOVDQAYmr:
629  case X86::VMOVUPSZ256mr:
630  case X86::VMOVAPSZ256mr:
631  case X86::VMOVUPSZ256mr_NOVLX:
632  case X86::VMOVAPSZ256mr_NOVLX:
633  case X86::VMOVUPDZ256mr:
634  case X86::VMOVAPDZ256mr:
635  case X86::VMOVDQU8Z256mr:
636  case X86::VMOVDQU16Z256mr:
637  case X86::VMOVDQA32Z256mr:
638  case X86::VMOVDQU32Z256mr:
639  case X86::VMOVDQA64Z256mr:
640  case X86::VMOVDQU64Z256mr:
641  MemBytes = 32;
642  return true;
643  case X86::VMOVUPSZmr:
644  case X86::VMOVAPSZmr:
645  case X86::VMOVUPDZmr:
646  case X86::VMOVAPDZmr:
647  case X86::VMOVDQU8Zmr:
648  case X86::VMOVDQU16Zmr:
649  case X86::VMOVDQA32Zmr:
650  case X86::VMOVDQU32Zmr:
651  case X86::VMOVDQA64Zmr:
652  case X86::VMOVDQU64Zmr:
653  MemBytes = 64;
654  return true;
655  }
656  return false;
657 }
658 
659 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
660  int &FrameIndex) const {
661  unsigned Dummy;
662  return isLoadFromStackSlot(MI, FrameIndex, Dummy);
663 }
664 
665 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
666  int &FrameIndex,
667  unsigned &MemBytes) const {
668  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
669  if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
670  return MI.getOperand(0).getReg();
671  return 0;
672 }
673 
674 unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
675  int &FrameIndex) const {
676  unsigned Dummy;
677  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
678  unsigned Reg;
679  if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
680  return Reg;
681  // Check for post-frame index elimination operations
682  SmallVector<const MachineMemOperand *, 1> Accesses;
683  if (hasLoadFromStackSlot(MI, Accesses)) {
684  FrameIndex =
685  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
686  ->getFrameIndex();
687  return MI.getOperand(0).getReg();
688  }
689  }
690  return 0;
691 }
692 
693 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
694  int &FrameIndex) const {
695  unsigned Dummy;
696  return isStoreToStackSlot(MI, FrameIndex, Dummy);
697 }
698 
699 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
700  int &FrameIndex,
701  unsigned &MemBytes) const {
702  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
703  if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
704  isFrameOperand(MI, 0, FrameIndex))
705  return MI.getOperand(X86::AddrNumOperands).getReg();
706  return 0;
707 }
708 
709 unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
710  int &FrameIndex) const {
711  unsigned Dummy;
712  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
713  unsigned Reg;
714  if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
715  return Reg;
716  // Check for post-frame index elimination operations
717  SmallVector<const MachineMemOperand *, 1> Accesses;
718  if (hasStoreToStackSlot(MI, Accesses)) {
719  FrameIndex =
720  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
721  ->getFrameIndex();
722  return MI.getOperand(X86::AddrNumOperands).getReg();
723  }
724  }
725  return 0;
726 }
727 
728 /// Return true if the register is a PIC base, i.e. defined by X86::MOVPC32r.
729 static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
730  // Don't waste compile time scanning use-def chains of physregs.
731  if (!BaseReg.isVirtual())
732  return false;
733  bool isPICBase = false;
734  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
735  E = MRI.def_instr_end(); I != E; ++I) {
736  MachineInstr *DefMI = &*I;
737  if (DefMI->getOpcode() != X86::MOVPC32r)
738  return false;
739  assert(!isPICBase && "More than one PIC base?");
740  isPICBase = true;
741  }
742  return isPICBase;
743 }
744 
745 bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
746  AAResults *AA) const {
747  switch (MI.getOpcode()) {
748  default:
749  // This function should only be called for opcodes with the ReMaterializable
750  // flag set.
751  llvm_unreachable("Unknown rematerializable operation!");
752  break;
753 
754  case X86::LOAD_STACK_GUARD:
755  case X86::AVX1_SETALLONES:
756  case X86::AVX2_SETALLONES:
757  case X86::AVX512_128_SET0:
758  case X86::AVX512_256_SET0:
759  case X86::AVX512_512_SET0:
760  case X86::AVX512_512_SETALLONES:
761  case X86::AVX512_FsFLD0SD:
762  case X86::AVX512_FsFLD0SH:
763  case X86::AVX512_FsFLD0SS:
764  case X86::AVX512_FsFLD0F128:
765  case X86::AVX_SET0:
766  case X86::FsFLD0SD:
767  case X86::FsFLD0SS:
768  case X86::FsFLD0F128:
769  case X86::KSET0D:
770  case X86::KSET0Q:
771  case X86::KSET0W:
772  case X86::KSET1D:
773  case X86::KSET1Q:
774  case X86::KSET1W:
775  case X86::MMX_SET0:
776  case X86::MOV32ImmSExti8:
777  case X86::MOV32r0:
778  case X86::MOV32r1:
779  case X86::MOV32r_1:
780  case X86::MOV32ri64:
781  case X86::MOV64ImmSExti8:
782  case X86::V_SET0:
783  case X86::V_SETALLONES:
784  case X86::MOV16ri:
785  case X86::MOV32ri:
786  case X86::MOV64ri:
787  case X86::MOV64ri32:
788  case X86::MOV8ri:
789  case X86::PTILEZEROV:
790  return true;
791 
792  case X86::MOV8rm:
793  case X86::MOV8rm_NOREX:
794  case X86::MOV16rm:
795  case X86::MOV32rm:
796  case X86::MOV64rm:
797  case X86::MOVSSrm:
798  case X86::MOVSSrm_alt:
799  case X86::MOVSDrm:
800  case X86::MOVSDrm_alt:
801  case X86::MOVAPSrm:
802  case X86::MOVUPSrm:
803  case X86::MOVAPDrm:
804  case X86::MOVUPDrm:
805  case X86::MOVDQArm:
806  case X86::MOVDQUrm:
807  case X86::VMOVSSrm:
808  case X86::VMOVSSrm_alt:
809  case X86::VMOVSDrm:
810  case X86::VMOVSDrm_alt:
811  case X86::VMOVAPSrm:
812  case X86::VMOVUPSrm:
813  case X86::VMOVAPDrm:
814  case X86::VMOVUPDrm:
815  case X86::VMOVDQArm:
816  case X86::VMOVDQUrm:
817  case X86::VMOVAPSYrm:
818  case X86::VMOVUPSYrm:
819  case X86::VMOVAPDYrm:
820  case X86::VMOVUPDYrm:
821  case X86::VMOVDQAYrm:
822  case X86::VMOVDQUYrm:
823  case X86::MMX_MOVD64rm:
824  case X86::MMX_MOVQ64rm:
825  // AVX-512
826  case X86::VMOVSSZrm:
827  case X86::VMOVSSZrm_alt:
828  case X86::VMOVSDZrm:
829  case X86::VMOVSDZrm_alt:
830  case X86::VMOVSHZrm:
831  case X86::VMOVSHZrm_alt:
832  case X86::VMOVAPDZ128rm:
833  case X86::VMOVAPDZ256rm:
834  case X86::VMOVAPDZrm:
835  case X86::VMOVAPSZ128rm:
836  case X86::VMOVAPSZ256rm:
837  case X86::VMOVAPSZ128rm_NOVLX:
838  case X86::VMOVAPSZ256rm_NOVLX:
839  case X86::VMOVAPSZrm:
840  case X86::VMOVDQA32Z128rm:
841  case X86::VMOVDQA32Z256rm:
842  case X86::VMOVDQA32Zrm:
843  case X86::VMOVDQA64Z128rm:
844  case X86::VMOVDQA64Z256rm:
845  case X86::VMOVDQA64Zrm:
846  case X86::VMOVDQU16Z128rm:
847  case X86::VMOVDQU16Z256rm:
848  case X86::VMOVDQU16Zrm:
849  case X86::VMOVDQU32Z128rm:
850  case X86::VMOVDQU32Z256rm:
851  case X86::VMOVDQU32Zrm:
852  case X86::VMOVDQU64Z128rm:
853  case X86::VMOVDQU64Z256rm:
854  case X86::VMOVDQU64Zrm:
855  case X86::VMOVDQU8Z128rm:
856  case X86::VMOVDQU8Z256rm:
857  case X86::VMOVDQU8Zrm:
858  case X86::VMOVUPDZ128rm:
859  case X86::VMOVUPDZ256rm:
860  case X86::VMOVUPDZrm:
861  case X86::VMOVUPSZ128rm:
862  case X86::VMOVUPSZ256rm:
863  case X86::VMOVUPSZ128rm_NOVLX:
864  case X86::VMOVUPSZ256rm_NOVLX:
865  case X86::VMOVUPSZrm: {
866  // Loads from constant pools are trivially rematerializable.
867  if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
868  MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
869  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
870  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
871  MI.isDereferenceableInvariantLoad(AA)) {
872  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
873  if (BaseReg == 0 || BaseReg == X86::RIP)
874  return true;
875  // Allow re-materialization of PIC load.
876  if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
877  return false;
878  const MachineFunction &MF = *MI.getParent()->getParent();
879  const MachineRegisterInfo &MRI = MF.getRegInfo();
880  return regIsPICBase(BaseReg, MRI);
881  }
882  return false;
883  }
884 
885  case X86::LEA32r:
886  case X86::LEA64r: {
887  if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
888  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
889  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
890  !MI.getOperand(1 + X86::AddrDisp).isReg()) {
891  // lea fi#, lea GV, etc. are all rematerializable.
892  if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
893  return true;
894  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
895  if (BaseReg == 0)
896  return true;
897  // Allow re-materialization of lea PICBase + x.
898  const MachineFunction &MF = *MI.getParent()->getParent();
899  const MachineRegisterInfo &MRI = MF.getRegInfo();
900  return regIsPICBase(BaseReg, MRI);
901  }
902  return false;
903  }
904  }
905 }
906 
907 void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
908  MachineBasicBlock::iterator I,
909  Register DestReg, unsigned SubIdx,
910  const MachineInstr &Orig,
911  const TargetRegisterInfo &TRI) const {
912  bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
913  if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
914  MachineBasicBlock::LQR_Dead) {
915  // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
916  // effects.
917  int Value;
918  switch (Orig.getOpcode()) {
919  case X86::MOV32r0: Value = 0; break;
920  case X86::MOV32r1: Value = 1; break;
921  case X86::MOV32r_1: Value = -1; break;
922  default:
923  llvm_unreachable("Unexpected instruction!");
924  }
925 
926  const DebugLoc &DL = Orig.getDebugLoc();
927  BuildMI(MBB, I, DL, get(X86::MOV32ri))
928  .add(Orig.getOperand(0))
929  .addImm(Value);
930  } else {
931  MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
932  MBB.insert(I, MI);
933  }
934 
935  MachineInstr &NewMI = *std::prev(I);
936  NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
937 }
938 
939 /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
940 static bool hasLiveCondCodeDef(MachineInstr &MI) {
941  for (const MachineOperand &MO : MI.operands()) {
942  if (MO.isReg() && MO.isDef() &&
943  MO.getReg() == X86::EFLAGS && !MO.isDead()) {
944  return true;
945  }
946  }
947  return false;
948 }
949 
950 /// Get the shift count of a machine operand, truncated (masked) to the hardware shift-count width.
951 inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
952  unsigned ShiftAmtOperandIdx) {
953  // The shift count is six bits with the REX.W prefix and five bits without.
954  unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
955  unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
956  return Imm & ShiftCountMask;
957 }
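// For example, the hardware masks shift counts to the operand's count width:
// a count of 65 on a 64-bit (REX.W) shift acts like a shift by 1 (65 & 63),
// and a count of 33 on a 32-bit shift acts like a shift by 1 (33 & 31), which
// is exactly what the masking above models for immediate shift amounts.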
958 
959 /// Check whether the given shift count is appropriate, i.e. whether it
960 /// can be represented by a LEA instruction's scale factor.
961 inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
962  // Left shift instructions can be transformed into load-effective-address
963  // instructions if we can encode them appropriately.
964  // A LEA instruction utilizes a SIB byte to encode its scale factor.
965  // The SIB.scale field is two bits wide which means that we can encode any
966  // shift amount less than 4.
967  return ShAmt < 4 && ShAmt > 0;
968 }
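// For example, the SIB byte's 2-bit scale field encodes factors 1, 2, 4 and 8,
// so a left shift by 1, 2 or 3 can be rewritten as an LEA with scale 2, 4 or
// 8: "shlq $3, %rcx" corresponds to "leaq (,%rcx,8), %rcx" (note that LEA,
// unlike SHL, does not update EFLAGS).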
969 
970 static bool findRedundantFlagInstr(MachineInstr &CmpInstr,
971  MachineInstr &CmpValDefInstr,
972  const MachineRegisterInfo *MRI,
973  MachineInstr **AndInstr,
974  const TargetRegisterInfo *TRI,
975  bool &NoSignFlag, bool &ClearsOverflowFlag) {
976  if (CmpValDefInstr.getOpcode() != X86::SUBREG_TO_REG)
977  return false;
978 
979  if (CmpInstr.getOpcode() != X86::TEST64rr)
980  return false;
981 
982  // CmpInstr is a TEST64rr instruction, and `X86InstrInfo::analyzeCompare`
983  // guarantees that it's analyzable only if two registers are identical.
984  assert(
985  (CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
986  "CmpInstr is an analyzable TEST64rr, and `X86InstrInfo::analyzeCompare` "
987  "requires two reg operands are the same.");
988 
989  // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
990  // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
991  // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
992  // redundant.
993  assert(
994  (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
995  "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG.");
996 
997  // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is typically
998  // 0.
999  if (CmpValDefInstr.getOperand(1).getImm() != 0)
1000  return false;
1001 
1002  // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
1003  // sub_32bit or sub_xmm.
1004  if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
1005  return false;
1006 
1007  MachineInstr *VregDefInstr =
1008  MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
1009 
1010  assert(VregDefInstr && "Must have a definition (SSA)");
1011 
1012  // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
1013  // to simplify the subsequent analysis.
1014  //
1015  // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
1016  // `CmpValDefInstr.getParent()`, this could be handled.
1017  if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
1018  return false;
1019 
1020  if (X86::isAND(VregDefInstr->getOpcode())) {
1021  // Get a sequence of instructions like
1022  // %reg = and* ... // Set EFLAGS
1023  // ... // EFLAGS not changed
1024  // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
1025  // test64rr %extended_reg, %extended_reg, implicit-def $eflags
1026  //
1027  // If subsequent readers use a subset of bits that don't change
1028  // after `and*` instructions, it's likely that the test64rr could
1029  // be optimized away.
1030  for (const MachineInstr &Instr :
1031  make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
1032  MachineBasicBlock::iterator(CmpValDefInstr))) {
1033  // There are instructions between 'VregDefInstr' and
1034  // 'CmpValDefInstr' that modifies EFLAGS.
1035  if (Instr.modifiesRegister(X86::EFLAGS, TRI))
1036  return false;
1037  }
1038 
1039  *AndInstr = VregDefInstr;
1040 
1041  // AND instruction will essentially update SF and clear OF, so
1042  // NoSignFlag should be false in the sense that SF is modified by `AND`.
1043  //
1044 // However, the implementation artificially sets `NoSignFlag` to true
1045  // to poison the SF bit; that is to say, if SF is looked at later, the
1046  // optimization (to erase TEST64rr) will be disabled.
1047  //
1048  // The reason to poison SF bit is that SF bit value could be different
1049 // in the `AND` and `TEST` operations; the sign bit is not known for `AND`,
1050  // and is known to be 0 as a result of `TEST64rr`.
1051  //
1052  // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
1053  // the AND instruction and using the static information to guide peephole
1054  // optimization if possible. For example, it's possible to fold a
1055  // conditional move into a copy if the relevant EFLAG bits could be deduced
1056 // from an immediate operand of the AND operation.
1057  //
1058  NoSignFlag = true;
1059  // ClearsOverflowFlag is true for AND operation (no surprise).
1060  ClearsOverflowFlag = true;
1061  return true;
1062  }
1063  return false;
1064 }
1065 
1066 bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
1067  unsigned Opc, bool AllowSP, Register &NewSrc,
1068  bool &isKill, MachineOperand &ImplicitOp,
1069  LiveVariables *LV, LiveIntervals *LIS) const {
1070  MachineFunction &MF = *MI.getParent()->getParent();
1071  const TargetRegisterClass *RC;
1072  if (AllowSP) {
1073  RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1074  } else {
1075  RC = Opc != X86::LEA32r ?
1076  &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1077  }
1078  Register SrcReg = Src.getReg();
1079  isKill = MI.killsRegister(SrcReg);
1080 
1081  // For both LEA64 and LEA32 the register already has essentially the right
1082 // type (32-bit or 64-bit); we may just need to forbid SP.
1083  if (Opc != X86::LEA64_32r) {
1084  NewSrc = SrcReg;
1085  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1086 
1087  if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1088  return false;
1089 
1090  return true;
1091  }
1092 
1093  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1094  // another we need to add 64-bit registers to the final MI.
1095  if (SrcReg.isPhysical()) {
1096  ImplicitOp = Src;
1097  ImplicitOp.setImplicit();
1098 
1099  NewSrc = getX86SubSuperRegister(SrcReg, 64);
1100  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1101  } else {
1102 // Virtual register of the wrong class; we have to create a temporary 64-bit
1103  // vreg to feed into the LEA.
1104  NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1105  MachineInstr *Copy =
1106  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1107  .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1108  .addReg(SrcReg, getKillRegState(isKill));
1109 
1110  // Which is obviously going to be dead after we're done with it.
1111  isKill = true;
1112 
1113  if (LV)
1114  LV->replaceKillInstruction(SrcReg, MI, *Copy);
1115 
1116  if (LIS) {
1117  SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1118  SlotIndex Idx = LIS->getInstructionIndex(MI);
1119  LiveInterval &LI = LIS->getInterval(SrcReg);
1120  LiveRange::Segment *S = LI.getSegmentContaining(Idx);
1121  if (S->end.getBaseIndex() == Idx)
1122  S->end = CopyIdx.getRegSlot();
1123  }
1124  }
1125 
1126  // We've set all the parameters without issue.
1127  return true;
1128 }
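// Note: LEA64_32r computes its address with 64-bit base/index registers and
// writes only the low 32 bits of the result, which is why a 32-bit virtual
// source above is first copied into the sub_32bit lane of a fresh 64-bit
// virtual register before being fed to the LEA.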
1129 
1130 MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1131  MachineInstr &MI,
1132  LiveVariables *LV,
1133  LiveIntervals *LIS,
1134  bool Is8BitOp) const {
1135  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1136  MachineBasicBlock &MBB = *MI.getParent();
1137  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
1138  assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1139  *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1140  "Unexpected type for LEA transform");
1141 
1142  // TODO: For a 32-bit target, we need to adjust the LEA variables with
1143  // something like this:
1144  // Opcode = X86::LEA32r;
1145  // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1146  // OutRegLEA =
1147  // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1148  // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1149  if (!Subtarget.is64Bit())
1150  return nullptr;
1151 
1152  unsigned Opcode = X86::LEA64_32r;
1153  Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1154  Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1155  Register InRegLEA2;
1156 
1157  // Build and insert into an implicit UNDEF value. This is OK because
1158  // we will be shifting and then extracting the lower 8/16-bits.
1159 // This has the potential to cause a partial register stall, e.g.:
1160  // movw (%rbp,%rcx,2), %dx
1161  // leal -65(%rdx), %esi
1162  // But testing has shown this *does* help performance in 64-bit mode (at
1163  // least on modern x86 machines).
1164  MachineBasicBlock::iterator MBBI = MI.getIterator();
1165  Register Dest = MI.getOperand(0).getReg();
1166  Register Src = MI.getOperand(1).getReg();
1167  Register Src2;
1168  bool IsDead = MI.getOperand(0).isDead();
1169  bool IsKill = MI.getOperand(1).isKill();
1170  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1171  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1172  MachineInstr *ImpDef =
1173  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1174  MachineInstr *InsMI =
1175  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1176  .addReg(InRegLEA, RegState::Define, SubReg)
1177  .addReg(Src, getKillRegState(IsKill));
1178  MachineInstr *ImpDef2 = nullptr;
1179  MachineInstr *InsMI2 = nullptr;
1180 
1181  MachineInstrBuilder MIB =
1182  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1183  switch (MIOpc) {
1184  default: llvm_unreachable("Unreachable!");
1185  case X86::SHL8ri:
1186  case X86::SHL16ri: {
1187  unsigned ShAmt = MI.getOperand(2).getImm();
1188  MIB.addReg(0).addImm(1ULL << ShAmt)
1189  .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
1190  break;
1191  }
1192  case X86::INC8r:
1193  case X86::INC16r:
1194  addRegOffset(MIB, InRegLEA, true, 1);
1195  break;
1196  case X86::DEC8r:
1197  case X86::DEC16r:
1198  addRegOffset(MIB, InRegLEA, true, -1);
1199  break;
1200  case X86::ADD8ri:
1201  case X86::ADD8ri_DB:
1202  case X86::ADD16ri:
1203  case X86::ADD16ri8:
1204  case X86::ADD16ri_DB:
1205  case X86::ADD16ri8_DB:
1206  addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1207  break;
1208  case X86::ADD8rr:
1209  case X86::ADD8rr_DB:
1210  case X86::ADD16rr:
1211  case X86::ADD16rr_DB: {
1212  Src2 = MI.getOperand(2).getReg();
1213  bool IsKill2 = MI.getOperand(2).isKill();
1214  assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1215  if (Src == Src2) {
1216  // ADD8rr/ADD16rr killed %reg1028, %reg1028
1217  // just a single insert_subreg.
1218  addRegReg(MIB, InRegLEA, true, InRegLEA, false);
1219  } else {
1220  if (Subtarget.is64Bit())
1221  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1222  else
1223  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1224  // Build and insert into an implicit UNDEF value. This is OK because
1225  // we will be shifting and then extracting the lower 8/16-bits.
1226  ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
1227  InRegLEA2);
1228  InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1229  .addReg(InRegLEA2, RegState::Define, SubReg)
1230  .addReg(Src2, getKillRegState(IsKill2));
1231  addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
1232  }
1233  if (LV && IsKill2 && InsMI2)
1234  LV->replaceKillInstruction(Src2, MI, *InsMI2);
1235  break;
1236  }
1237  }
1238 
1239  MachineInstr *NewMI = MIB;
1240  MachineInstr *ExtMI =
1241  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1242  .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
1243  .addReg(OutRegLEA, RegState::Kill, SubReg);
1244 
1245  if (LV) {
1246  // Update live variables.
1247  LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1248  LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1249  if (IsKill)
1250  LV->replaceKillInstruction(Src, MI, *InsMI);
1251  if (IsDead)
1252  LV->replaceKillInstruction(Dest, MI, *ExtMI);
1253  }
1254 
1255  if (LIS) {
1256  LIS->InsertMachineInstrInMaps(*ImpDef);
1257  SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
1258  if (ImpDef2)
1259  LIS->InsertMachineInstrInMaps(*ImpDef2);
1260  SlotIndex Ins2Idx;
1261  if (InsMI2)
1262  Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
1263  SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1264  SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
1265  LIS->getInterval(InRegLEA);
1266  LIS->getInterval(OutRegLEA);
1267  if (InRegLEA2)
1268  LIS->getInterval(InRegLEA2);
1269 
1270  // Move the use of Src up to InsMI.
1271  LiveInterval &SrcLI = LIS->getInterval(Src);
1272  LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
1273  if (SrcSeg->end == NewIdx.getRegSlot())
1274  SrcSeg->end = InsIdx.getRegSlot();
1275 
1276  if (InsMI2) {
1277  // Move the use of Src2 up to InsMI2.
1278  LiveInterval &Src2LI = LIS->getInterval(Src2);
1279  LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
1280  if (Src2Seg->end == NewIdx.getRegSlot())
1281  Src2Seg->end = Ins2Idx.getRegSlot();
1282  }
1283 
1284  // Move the definition of Dest down to ExtMI.
1285  LiveInterval &DestLI = LIS->getInterval(Dest);
1286  LiveRange::Segment *DestSeg =
1287  DestLI.getSegmentContaining(NewIdx.getRegSlot());
1288  assert(DestSeg->start == NewIdx.getRegSlot() &&
1289  DestSeg->valno->def == NewIdx.getRegSlot());
1290  DestSeg->start = ExtIdx.getRegSlot();
1291  DestSeg->valno->def = ExtIdx.getRegSlot();
1292  }
1293 
1294  return ExtMI;
1295 }
1296 
1297 /// This method must be implemented by targets that
1298 /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1299 /// may be able to convert a two-address instruction into a true
1300 /// three-address instruction on demand. This allows the X86 target (for
1301 /// example) to convert ADD and SHL instructions into LEA instructions if they
1302 /// would require register copies due to two-addressness.
1303 ///
1304 /// This method returns a null pointer if the transformation cannot be
1305 /// performed, otherwise it returns the new instruction.
1306 ///
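/// For example, the two-address "$eax = ADD32rr $eax(tied-def 0), $ecx" can be
/// rewritten as "$eax = LEA64_32r $rax, 1, $rcx, 0, $noreg" on a 64-bit target
/// (LEA32r on 32-bit targets), which removes the tie between the destination
/// and the first source and leaves EFLAGS untouched; the MIR shown here is
/// illustrative only.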
1307 MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
1308  LiveVariables *LV,
1309  LiveIntervals *LIS) const {
1310  // The following opcodes also set the condition code register(s). Only
1311  // convert them to the equivalent LEA if the condition code register defs
1312  // are dead!
1313  if (hasLiveCondCodeDef(MI))
1314  return nullptr;
1315 
1316  MachineFunction &MF = *MI.getParent()->getParent();
1317  // All input instructions are two-address instructions. Get the known operands.
1318  const MachineOperand &Dest = MI.getOperand(0);
1319  const MachineOperand &Src = MI.getOperand(1);
1320 
1321  // Ideally, operations with undef should be folded before we get here, but we
1322  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1323  // Without this, we have to forward undef state to new register operands to
1324  // avoid machine verifier errors.
1325  if (Src.isUndef())
1326  return nullptr;
1327  if (MI.getNumOperands() > 2)
1328  if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1329  return nullptr;
1330 
1331  MachineInstr *NewMI = nullptr;
1332  Register SrcReg, SrcReg2;
1333  bool Is64Bit = Subtarget.is64Bit();
1334 
1335  bool Is8BitOp = false;
1336  unsigned MIOpc = MI.getOpcode();
1337  switch (MIOpc) {
1338  default: llvm_unreachable("Unreachable!");
1339  case X86::SHL64ri: {
1340  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1341  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1342  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1343 
1344  // LEA can't handle RSP.
1345  if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1346  Src.getReg(), &X86::GR64_NOSPRegClass))
1347  return nullptr;
1348 
1349  NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1350  .add(Dest)
1351  .addReg(0)
1352  .addImm(1ULL << ShAmt)
1353  .add(Src)
1354  .addImm(0)
1355  .addReg(0);
1356  break;
1357  }
1358  case X86::SHL32ri: {
1359  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1360  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1361  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1362 
1363  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1364 
1365  // LEA can't handle ESP.
1366  bool isKill;
1367  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1368  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1369  ImplicitOp, LV, LIS))
1370  return nullptr;
1371 
1372  MachineInstrBuilder MIB =
1373  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1374  .add(Dest)
1375  .addReg(0)
1376  .addImm(1ULL << ShAmt)
1377  .addReg(SrcReg, getKillRegState(isKill))
1378  .addImm(0)
1379  .addReg(0);
1380  if (ImplicitOp.getReg() != 0)
1381  MIB.add(ImplicitOp);
1382  NewMI = MIB;
1383 
1384  break;
1385  }
1386  case X86::SHL8ri:
1387  Is8BitOp = true;
1389  case X86::SHL16ri: {
1390  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1391  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1392  if (!isTruncatedShiftCountForLEA(ShAmt))
1393  return nullptr;
1394  return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1395  }
1396  case X86::INC64r:
1397  case X86::INC32r: {
1398  assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1399  unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
1400  (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1401  bool isKill;
1402  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1403  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1404  ImplicitOp, LV, LIS))
1405  return nullptr;
1406 
1407  MachineInstrBuilder MIB =
1408  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1409  .add(Dest)
1410  .addReg(SrcReg, getKillRegState(isKill));
1411  if (ImplicitOp.getReg() != 0)
1412  MIB.add(ImplicitOp);
1413 
1414  NewMI = addOffset(MIB, 1);
1415  break;
1416  }
1417  case X86::DEC64r:
1418  case X86::DEC32r: {
1419  assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1420  unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
1421  : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1422 
1423  bool isKill;
1424  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1425  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1426  ImplicitOp, LV, LIS))
1427  return nullptr;
1428 
1429  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1430  .add(Dest)
1431  .addReg(SrcReg, getKillRegState(isKill));
1432  if (ImplicitOp.getReg() != 0)
1433  MIB.add(ImplicitOp);
1434 
1435  NewMI = addOffset(MIB, -1);
1436 
1437  break;
1438  }
1439  case X86::DEC8r:
1440  case X86::INC8r:
1441  Is8BitOp = true;
1443  case X86::DEC16r:
1444  case X86::INC16r:
1445  return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1446  case X86::ADD64rr:
1447  case X86::ADD64rr_DB:
1448  case X86::ADD32rr:
1449  case X86::ADD32rr_DB: {
1450  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1451  unsigned Opc;
1452  if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1453  Opc = X86::LEA64r;
1454  else
1455  Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1456 
1457  const MachineOperand &Src2 = MI.getOperand(2);
1458  bool isKill2;
1459  MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1460  if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2,
1461  ImplicitOp2, LV, LIS))
1462  return nullptr;
1463 
1464  bool isKill;
1465  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1466  if (Src.getReg() == Src2.getReg()) {
1467  // Don't call classifyLEAReg a second time on the same register, in case
1468  // the first call inserted a COPY from Src2 and marked it as killed.
1469  isKill = isKill2;
1470  SrcReg = SrcReg2;
1471  } else {
1472  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1473  ImplicitOp, LV, LIS))
1474  return nullptr;
1475  }
1476 
1477  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1478  if (ImplicitOp.getReg() != 0)
1479  MIB.add(ImplicitOp);
1480  if (ImplicitOp2.getReg() != 0)
1481  MIB.add(ImplicitOp2);
1482 
1483  NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1484  if (LV && Src2.isKill())
1485  LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
1486  break;
1487  }
1488  case X86::ADD8rr:
1489  case X86::ADD8rr_DB:
1490  Is8BitOp = true;
1492  case X86::ADD16rr:
1493  case X86::ADD16rr_DB:
1494  return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1495  case X86::ADD64ri32:
1496  case X86::ADD64ri8:
1497  case X86::ADD64ri32_DB:
1498  case X86::ADD64ri8_DB:
1499  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1500  NewMI = addOffset(
1501  BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1502  MI.getOperand(2));
1503  break;
1504  case X86::ADD32ri:
1505  case X86::ADD32ri8:
1506  case X86::ADD32ri_DB:
1507  case X86::ADD32ri8_DB: {
1508  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1509  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1510 
1511  bool isKill;
1512  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1513  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1514  ImplicitOp, LV, LIS))
1515  return nullptr;
1516 
1517  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1518  .add(Dest)
1519  .addReg(SrcReg, getKillRegState(isKill));
1520  if (ImplicitOp.getReg() != 0)
1521  MIB.add(ImplicitOp);
1522 
1523  NewMI = addOffset(MIB, MI.getOperand(2));
1524  break;
1525  }
1526  case X86::ADD8ri:
1527  case X86::ADD8ri_DB:
1528  Is8BitOp = true;
1530  case X86::ADD16ri:
1531  case X86::ADD16ri8:
1532  case X86::ADD16ri_DB:
1533  case X86::ADD16ri8_DB:
1534  return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1535  case X86::SUB8ri:
1536  case X86::SUB16ri8:
1537  case X86::SUB16ri:
1538  /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1539  return nullptr;
1540  case X86::SUB32ri8:
1541  case X86::SUB32ri: {
1542  if (!MI.getOperand(2).isImm())
1543  return nullptr;
1544  int64_t Imm = MI.getOperand(2).getImm();
1545  if (!isInt<32>(-Imm))
1546  return nullptr;
1547 
1548  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1549  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1550 
1551  bool isKill;
1552  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1553  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1554  ImplicitOp, LV, LIS))
1555  return nullptr;
1556 
1557  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1558  .add(Dest)
1559  .addReg(SrcReg, getKillRegState(isKill));
1560  if (ImplicitOp.getReg() != 0)
1561  MIB.add(ImplicitOp);
1562 
1563  NewMI = addOffset(MIB, -Imm);
1564  break;
1565  }
1566 
1567  case X86::SUB64ri8:
1568  case X86::SUB64ri32: {
1569  if (!MI.getOperand(2).isImm())
1570  return nullptr;
1571  int64_t Imm = MI.getOperand(2).getImm();
1572  if (!isInt<32>(-Imm))
1573  return nullptr;
1574 
1575  assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1576 
1577  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
1578  get(X86::LEA64r)).add(Dest).add(Src);
1579  NewMI = addOffset(MIB, -Imm);
1580  break;
1581  }
1582 
1583  case X86::VMOVDQU8Z128rmk:
1584  case X86::VMOVDQU8Z256rmk:
1585  case X86::VMOVDQU8Zrmk:
1586  case X86::VMOVDQU16Z128rmk:
1587  case X86::VMOVDQU16Z256rmk:
1588  case X86::VMOVDQU16Zrmk:
1589  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
1590  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
1591  case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
1592  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
1593  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
1594  case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
1595  case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
1596  case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
1597  case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
1598  case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
1599  case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
1600  case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
1601  case X86::VBROADCASTSDZ256rmk:
1602  case X86::VBROADCASTSDZrmk:
1603  case X86::VBROADCASTSSZ128rmk:
1604  case X86::VBROADCASTSSZ256rmk:
1605  case X86::VBROADCASTSSZrmk:
1606  case X86::VPBROADCASTDZ128rmk:
1607  case X86::VPBROADCASTDZ256rmk:
1608  case X86::VPBROADCASTDZrmk:
1609  case X86::VPBROADCASTQZ128rmk:
1610  case X86::VPBROADCASTQZ256rmk:
1611  case X86::VPBROADCASTQZrmk: {
1612  unsigned Opc;
1613  switch (MIOpc) {
1614  default: llvm_unreachable("Unreachable!");
1615  case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
1616  case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
1617  case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
1618  case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
1619  case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
1620  case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
1621  case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1622  case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1623  case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1624  case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1625  case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1626  case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1627  case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1628  case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1629  case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1630  case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1631  case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1632  case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1633  case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1634  case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1635  case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1636  case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1637  case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1638  case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1639  case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1640  case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1641  case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1642  case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1643  case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1644  case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1645  case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
1646  case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break;
1647  case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
1648  case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
1649  case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break;
1650  case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
1651  case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
1652  case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break;
1653  case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
1654  case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
1655  case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break;
1656  }
1657 
1658  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1659  .add(Dest)
1660  .add(MI.getOperand(2))
1661  .add(Src)
1662  .add(MI.getOperand(3))
1663  .add(MI.getOperand(4))
1664  .add(MI.getOperand(5))
1665  .add(MI.getOperand(6))
1666  .add(MI.getOperand(7));
1667  break;
1668  }
1669 
1670  case X86::VMOVDQU8Z128rrk:
1671  case X86::VMOVDQU8Z256rrk:
1672  case X86::VMOVDQU8Zrrk:
1673  case X86::VMOVDQU16Z128rrk:
1674  case X86::VMOVDQU16Z256rrk:
1675  case X86::VMOVDQU16Zrrk:
1676  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
1677  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
1678  case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
1679  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
1680  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
1681  case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
1682  case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
1683  case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
1684  case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
1685  case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
1686  case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
1687  case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
1688  unsigned Opc;
1689  switch (MIOpc) {
1690  default: llvm_unreachable("Unreachable!");
1691  case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
1692  case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
1693  case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
1694  case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
1695  case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
1696  case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
1697  case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1698  case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1699  case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1700  case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1701  case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1702  case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1703  case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1704  case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1705  case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1706  case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1707  case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1708  case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1709  case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1710  case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1711  case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1712  case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1713  case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1714  case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1715  case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1716  case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1717  case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1718  case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1719  case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1720  case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1721  }
1722 
1723  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1724  .add(Dest)
1725  .add(MI.getOperand(2))
1726  .add(Src)
1727  .add(MI.getOperand(3));
1728  break;
1729  }
1730  }
1731 
1732  if (!NewMI) return nullptr;
1733 
1734  if (LV) { // Update live variables
1735  if (Src.isKill())
1736  LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
1737  if (Dest.isDead())
1738  LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
1739  }
1740 
1741  MachineBasicBlock &MBB = *MI.getParent();
1742  MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
1743 
1744  if (LIS) {
1745  LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1746  if (SrcReg)
1747  LIS->getInterval(SrcReg);
1748  if (SrcReg2)
1749  LIS->getInterval(SrcReg2);
1750  }
1751 
1752  return NewMI;
1753 }
1754 
1755 /// This determines which of the three possible cases of a three-source
1756 /// commute the source indexes correspond to, taking into account any mask
1757 /// operands. Commuting a passthru operand is never allowed; a request that
1758 /// does not match one of the cases below hits llvm_unreachable.
1759 /// Case 0 - Possible to commute the first and second operands.
1760 /// Case 1 - Possible to commute the first and third operands.
1761 /// Case 2 - Possible to commute the second and third operands.
1762 static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
1763  unsigned SrcOpIdx2) {
1764  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
1765  if (SrcOpIdx1 > SrcOpIdx2)
1766  std::swap(SrcOpIdx1, SrcOpIdx2);
1767 
1768  unsigned Op1 = 1, Op2 = 2, Op3 = 3;
1769  if (X86II::isKMasked(TSFlags)) {
1770  Op2++;
1771  Op3++;
1772  }
1773 
1774  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
1775  return 0;
1776  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
1777  return 1;
1778  if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
1779  return 2;
1780  llvm_unreachable("Unknown three src commute case.");
1781 }
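 // Worked example (illustrative): for an unmasked three-source instruction the
 // vector sources sit at operand indices 1, 2 and 3, so commuting operands
 // (1,2) is case 0, (1,3) is case 1 and (2,3) is case 2. With a k-mask the
 // mask is operand 2 and the sources shift to 1, 3 and 4, so e.g. commuting
 // operands (3,4) still maps to case 2.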
1782 
1783 unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
1784  const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
1785  const X86InstrFMA3Group &FMA3Group) const {
1786 
1787  unsigned Opc = MI.getOpcode();
1788 
1789  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
1790  // analysis. The commute optimization is legal only if all users of FMA*_Int
1791  // use only the lowest element of the FMA*_Int instruction. Such analysis is
1792  // not implemented yet, so commuting operand 1 is simply rejected for now.
1793  // When such analysis becomes available, this will be the right place to
1794  // call it.
1795  assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
1796  "Intrinsic instructions can't commute operand 1");
1797 
1798  // Determine which case this commute is or if it can't be done.
1799  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1800  SrcOpIdx2);
1801  assert(Case < 3 && "Unexpected case number!");
1802 
1803  // Define the FMA forms mapping array that helps to map input FMA form
1804  // to output FMA form to preserve the operation semantics after
1805  // commuting the operands.
1806  const unsigned Form132Index = 0;
1807  const unsigned Form213Index = 1;
1808  const unsigned Form231Index = 2;
1809  static const unsigned FormMapping[][3] = {
1810  // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
1811  // FMA132 A, C, b; ==> FMA231 C, A, b;
1812  // FMA213 B, A, c; ==> FMA213 A, B, c;
1813  // FMA231 C, A, b; ==> FMA132 A, C, b;
1814  { Form231Index, Form213Index, Form132Index },
1815  // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
1816  // FMA132 A, c, B; ==> FMA132 B, c, A;
1817  // FMA213 B, a, C; ==> FMA231 C, a, B;
1818  // FMA231 C, a, B; ==> FMA213 B, a, C;
1819  { Form132Index, Form231Index, Form213Index },
1820  // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
1821  // FMA132 a, C, B; ==> FMA213 a, B, C;
1822  // FMA213 b, A, C; ==> FMA132 b, C, A;
1823  // FMA231 c, A, B; ==> FMA231 c, B, A;
1824  { Form213Index, Form132Index, Form231Index }
1825  };
1826 
1827  unsigned FMAForms[3];
1828  FMAForms[0] = FMA3Group.get132Opcode();
1829  FMAForms[1] = FMA3Group.get213Opcode();
1830  FMAForms[2] = FMA3Group.get231Opcode();
1831 
1832  // Everything is ready, just adjust the FMA opcode and return it.
1833  for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
1834  if (Opc == FMAForms[FormIndex])
1835  return FMAForms[FormMapping[Case][FormIndex]];
1836 
1837  llvm_unreachable("Illegal FMA3 format");
1838 }
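 // Worked example (illustrative, assuming the usual VFMADD opcode naming): for
 // a 213-form opcode and a request to commute src2 and src3 (case 2 above),
 // FormMapping[2][Form213Index] == Form132Index, so something like
 // VFMADD213PSr would be rewritten to the matching VFMADD132PSr opcode while
 // preserving the computed value.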
1839 
1840 static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
1841  unsigned SrcOpIdx2) {
1842  // Determine which case this commute is or if it can't be done.
1843  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1844  SrcOpIdx2);
1845  assert(Case < 3 && "Unexpected case value!");
1846 
1847  // For each case we need to swap two pairs of bits in the final immediate.
1848  static const uint8_t SwapMasks[3][4] = {
1849  { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
1850  { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
1851  { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
1852  };
1853 
1854  uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
1855  // Clear out the bits we are swapping.
1856  uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
1857  SwapMasks[Case][2] | SwapMasks[Case][3]);
1858  // If the immediate had a bit of the pair set, then set the opposite bit.
1859  if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
1860  if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
1861  if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
1862  if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
1863  MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
1864 }
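 // Worked example (illustrative): the VPTERNLOG immediate is a truth table
 // indexed by (src1, src2, src3). Swapping src1 and src2 (case 0) exchanges
 // table rows 010<->100 and 011<->101, i.e. bits 2/4 and 3/5. For the common
 // "select" table 0xCA (src1 ? src2 : src3) this produces 0xE2 (src2 ? src1 :
 // src3), which yields the same result once the two operands are actually
 // swapped.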
1865 
1866 // Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
1867 // commuted.
1868 static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
1869 #define VPERM_CASES(Suffix) \
1870  case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
1871  case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
1872  case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
1873  case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
1874  case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
1875  case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
1876  case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
1877  case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
1878  case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
1879  case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
1880  case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
1881  case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
1882 
1883 #define VPERM_CASES_BROADCAST(Suffix) \
1884  VPERM_CASES(Suffix) \
1885  case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
1886  case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
1887  case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
1888  case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
1889  case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
1890  case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
1891 
1892  switch (Opcode) {
1893  default: return false;
1894  VPERM_CASES(B)
1895  VPERM_CASES_BROADCAST(D)
1896  VPERM_CASES_BROADCAST(PD)
1897  VPERM_CASES_BROADCAST(PS)
1898  VPERM_CASES_BROADCAST(Q)
1899  VPERM_CASES(W)
1900  return true;
1901  }
1902 #undef VPERM_CASES_BROADCAST
1903 #undef VPERM_CASES
1904 }
1905 
1906 // Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
1907 // from the I opcode to the T opcode and vice versa.
1908 static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
1909 #define VPERM_CASES(Orig, New) \
1910  case X86::Orig##128rr: return X86::New##128rr; \
1911  case X86::Orig##128rrkz: return X86::New##128rrkz; \
1912  case X86::Orig##128rm: return X86::New##128rm; \
1913  case X86::Orig##128rmkz: return X86::New##128rmkz; \
1914  case X86::Orig##256rr: return X86::New##256rr; \
1915  case X86::Orig##256rrkz: return X86::New##256rrkz; \
1916  case X86::Orig##256rm: return X86::New##256rm; \
1917  case X86::Orig##256rmkz: return X86::New##256rmkz; \
1918  case X86::Orig##rr: return X86::New##rr; \
1919  case X86::Orig##rrkz: return X86::New##rrkz; \
1920  case X86::Orig##rm: return X86::New##rm; \
1921  case X86::Orig##rmkz: return X86::New##rmkz;
1922 
1923 #define VPERM_CASES_BROADCAST(Orig, New) \
1924  VPERM_CASES(Orig, New) \
1925  case X86::Orig##128rmb: return X86::New##128rmb; \
1926  case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
1927  case X86::Orig##256rmb: return X86::New##256rmb; \
1928  case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
1929  case X86::Orig##rmb: return X86::New##rmb; \
1930  case X86::Orig##rmbkz: return X86::New##rmbkz;
1931 
1932  switch (Opcode) {
1933  VPERM_CASES(VPERMI2B, VPERMT2B)
1934  VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
1935  VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
1936  VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
1937  VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
1938  VPERM_CASES(VPERMI2W, VPERMT2W)
1939  VPERM_CASES(VPERMT2B, VPERMI2B)
1940  VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
1941  VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
1942  VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
1943  VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
1944  VPERM_CASES(VPERMT2W, VPERMI2W)
1945  }
1946 
1947  llvm_unreachable("Unreachable!");
1948 #undef VPERM_CASES_BROADCAST
1949 #undef VPERM_CASES
1950 }
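 // Illustrative note: commuting the two table operands of a VPERMI2*
 // instruction is modeled by switching to the corresponding VPERMT2* opcode
 // (and vice versa), e.g. VPERMI2D128rr <-> VPERMT2D128rr, since the tied
 // destination operand then changes between the index vector and the first
 // table vector.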
1951 
1952 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1953  unsigned OpIdx1,
1954  unsigned OpIdx2) const {
1955  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
1956  if (NewMI)
1957  return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
1958  return MI;
1959  };
1960 
1961  switch (MI.getOpcode()) {
1962  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
1963  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
1964  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
1965  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
1966  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
1967  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
1968  unsigned Opc;
1969  unsigned Size;
1970  switch (MI.getOpcode()) {
1971  default: llvm_unreachable("Unreachable!");
1972  case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
1973  case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
1974  case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
1975  case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
1976  case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
1977  case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
1978  }
1979  unsigned Amt = MI.getOperand(3).getImm();
1980  auto &WorkingMI = cloneIfNew(MI);
1981  WorkingMI.setDesc(get(Opc));
1982  WorkingMI.getOperand(3).setImm(Size - Amt);
1983  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1984  OpIdx1, OpIdx2);
1985  }
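 // Worked example (illustrative): "eax = SHLD32rri8 eax, ebx, 5" computes
 // (eax << 5) | (ebx >> 27); after commuting the sources it is rewritten as
 // "eax = SHRD32rri8 ebx, eax, 27", which computes the same value.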
1986  case X86::PFSUBrr:
1987  case X86::PFSUBRrr: {
1988  // PFSUB x, y: x = x - y
1989  // PFSUBR x, y: x = y - x
1990  unsigned Opc =
1991  (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
1992  auto &WorkingMI = cloneIfNew(MI);
1993  WorkingMI.setDesc(get(Opc));
1994  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1995  OpIdx1, OpIdx2);
1996  }
1997  case X86::BLENDPDrri:
1998  case X86::BLENDPSrri:
1999  case X86::VBLENDPDrri:
2000  case X86::VBLENDPSrri:
2001  // If we're optimizing for size, try to use MOVSD/MOVSS.
2002  if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
2003  unsigned Mask, Opc;
2004  switch (MI.getOpcode()) {
2005  default: llvm_unreachable("Unreachable!");
2006  case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
2007  case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
2008  case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
2009  case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
2010  }
2011  if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
2012  auto &WorkingMI = cloneIfNew(MI);
2013  WorkingMI.setDesc(get(Opc));
2014  WorkingMI.removeOperand(3);
2015  return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
2016  /*NewMI=*/false,
2017  OpIdx1, OpIdx2);
2018  }
2019  }
2020  LLVM_FALLTHROUGH;
2021  case X86::PBLENDWrri:
2022  case X86::VBLENDPDYrri:
2023  case X86::VBLENDPSYrri:
2024  case X86::VPBLENDDrri:
2025  case X86::VPBLENDWrri:
2026  case X86::VPBLENDDYrri:
2027  case X86::VPBLENDWYrri:{
2028  int8_t Mask;
2029  switch (MI.getOpcode()) {
2030  default: llvm_unreachable("Unreachable!");
2031  case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
2032  case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
2033  case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
2034  case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
2035  case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
2036  case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
2037  case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
2038  case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
2039  case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
2040  case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
2041  case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
2042  }
2043  // Only the least significant bits of Imm are used.
2044  // Using int8_t to ensure it will be sign extended to the int64_t that
2045  // setImm takes in order to match isel behavior.
2046  int8_t Imm = MI.getOperand(3).getImm() & Mask;
2047  auto &WorkingMI = cloneIfNew(MI);
2048  WorkingMI.getOperand(3).setImm(Mask ^ Imm);
2049  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2050  OpIdx1, OpIdx2);
2051  }
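 // Worked example (illustrative): for BLENDPSrri the mask of meaningful
 // immediate bits is 0x0F, so commuting a blend with immediate 0x05 rewrites
 // it to 0x0A; every lane that previously came from the first source now
 // comes from the second source and vice versa.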
2052  case X86::INSERTPSrr:
2053  case X86::VINSERTPSrr:
2054  case X86::VINSERTPSZrr: {
2055  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2056  unsigned ZMask = Imm & 15;
2057  unsigned DstIdx = (Imm >> 4) & 3;
2058  unsigned SrcIdx = (Imm >> 6) & 3;
2059 
2060  // We can commute insertps if we zero 2 of the elements, the insertion is
2061  // "inline" and we don't override the insertion with a zero.
2062  if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2063  countPopulation(ZMask) == 2) {
2064  unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
2065  assert(AltIdx < 4 && "Illegal insertion index");
2066  unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2067  auto &WorkingMI = cloneIfNew(MI);
2068  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2069  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2070  OpIdx1, OpIdx2);
2071  }
2072  return nullptr;
2073  }
2074  case X86::MOVSDrr:
2075  case X86::MOVSSrr:
2076  case X86::VMOVSDrr:
2077  case X86::VMOVSSrr:{
2078  // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2079  if (Subtarget.hasSSE41()) {
2080  unsigned Mask, Opc;
2081  switch (MI.getOpcode()) {
2082  default: llvm_unreachable("Unreachable!");
2083  case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
2084  case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
2085  case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
2086  case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
2087  }
2088 
2089  auto &WorkingMI = cloneIfNew(MI);
2090  WorkingMI.setDesc(get(Opc));
2091  WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
2092  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2093  OpIdx1, OpIdx2);
2094  }
2095 
2096  // Convert to SHUFPD.
2097  assert(MI.getOpcode() == X86::MOVSDrr &&
2098  "Can only commute MOVSDrr without SSE4.1");
2099 
2100  auto &WorkingMI = cloneIfNew(MI);
2101  WorkingMI.setDesc(get(X86::SHUFPDrri));
2102  WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
2103  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2104  OpIdx1, OpIdx2);
2105  }
2106  case X86::SHUFPDrri: {
2107  // Commute to MOVSD.
2108  assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2109  auto &WorkingMI = cloneIfNew(MI);
2110  WorkingMI.setDesc(get(X86::MOVSDrr));
2111  WorkingMI.removeOperand(3);
2112  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2113  OpIdx1, OpIdx2);
2114  }
2115  case X86::PCLMULQDQrr:
2116  case X86::VPCLMULQDQrr:
2117  case X86::VPCLMULQDQYrr:
2118  case X86::VPCLMULQDQZrr:
2119  case X86::VPCLMULQDQZ128rr:
2120  case X86::VPCLMULQDQZ256rr: {
2121  // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2122  // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2123  unsigned Imm = MI.getOperand(3).getImm();
2124  unsigned Src1Hi = Imm & 0x01;
2125  unsigned Src2Hi = Imm & 0x10;
2126  auto &WorkingMI = cloneIfNew(MI);
2127  WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2128  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2129  OpIdx1, OpIdx2);
2130  }
2131  case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
2132  case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
2133  case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
2134  case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
2135  case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
2136  case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
2137  case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
2138  case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
2139  case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
2140  case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
2141  case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
2142  case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
2143  case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
2144  case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
2145  case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
2146  case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
2147  case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
2148  case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
2149  case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
2150  case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
2151  case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
2152  case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
2153  case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
2154  case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
2155  // Flip comparison mode immediate (if necessary).
2156  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
2157  Imm = X86::getSwappedVPCMPImm(Imm);
2158  auto &WorkingMI = cloneIfNew(MI);
2159  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
2160  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2161  OpIdx1, OpIdx2);
2162  }
2163  case X86::VPCOMBri: case X86::VPCOMUBri:
2164  case X86::VPCOMDri: case X86::VPCOMUDri:
2165  case X86::VPCOMQri: case X86::VPCOMUQri:
2166  case X86::VPCOMWri: case X86::VPCOMUWri: {
2167  // Flip comparison mode immediate (if necessary).
2168  unsigned Imm = MI.getOperand(3).getImm() & 0x7;
2169  Imm = X86::getSwappedVPCOMImm(Imm);
2170  auto &WorkingMI = cloneIfNew(MI);
2171  WorkingMI.getOperand(3).setImm(Imm);
2172  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2173  OpIdx1, OpIdx2);
2174  }
2175  case X86::VCMPSDZrr:
2176  case X86::VCMPSSZrr:
2177  case X86::VCMPPDZrri:
2178  case X86::VCMPPSZrri:
2179  case X86::VCMPSHZrr:
2180  case X86::VCMPPHZrri:
2181  case X86::VCMPPHZ128rri:
2182  case X86::VCMPPHZ256rri:
2183  case X86::VCMPPDZ128rri:
2184  case X86::VCMPPSZ128rri:
2185  case X86::VCMPPDZ256rri:
2186  case X86::VCMPPSZ256rri:
2187  case X86::VCMPPDZrrik:
2188  case X86::VCMPPSZrrik:
2189  case X86::VCMPPDZ128rrik:
2190  case X86::VCMPPSZ128rrik:
2191  case X86::VCMPPDZ256rrik:
2192  case X86::VCMPPSZ256rrik: {
2193  unsigned Imm =
2194  MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
2195  Imm = X86::getSwappedVCMPImm(Imm);
2196  auto &WorkingMI = cloneIfNew(MI);
2197  WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
2198  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2199  OpIdx1, OpIdx2);
2200  }
2201  case X86::VPERM2F128rr:
2202  case X86::VPERM2I128rr: {
2203  // Flip permute source immediate.
2204  // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2205  // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2206  int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
2207  auto &WorkingMI = cloneIfNew(MI);
2208  WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
2209  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2210  OpIdx1, OpIdx2);
2211  }
2212  case X86::MOVHLPSrr:
2213  case X86::UNPCKHPDrr:
2214  case X86::VMOVHLPSrr:
2215  case X86::VUNPCKHPDrr:
2216  case X86::VMOVHLPSZrr:
2217  case X86::VUNPCKHPDZ128rr: {
2218  assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2219 
2220  unsigned Opc = MI.getOpcode();
2221  switch (Opc) {
2222  default: llvm_unreachable("Unreachable!");
2223  case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
2224  case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
2225  case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
2226  case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
2227  case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
2228  case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
2229  }
2230  auto &WorkingMI = cloneIfNew(MI);
2231  WorkingMI.setDesc(get(Opc));
2232  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2233  OpIdx1, OpIdx2);
2234  }
2235  case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
2236  auto &WorkingMI = cloneIfNew(MI);
2237  unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2238  X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2239  WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2240  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2241  OpIdx1, OpIdx2);
2242  }
2243  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2244  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2245  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2246  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2247  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2248  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2249  case X86::VPTERNLOGDZrrik:
2250  case X86::VPTERNLOGDZ128rrik:
2251  case X86::VPTERNLOGDZ256rrik:
2252  case X86::VPTERNLOGQZrrik:
2253  case X86::VPTERNLOGQZ128rrik:
2254  case X86::VPTERNLOGQZ256rrik:
2255  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2256  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2257  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2258  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2259  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2260  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2261  case X86::VPTERNLOGDZ128rmbi:
2262  case X86::VPTERNLOGDZ256rmbi:
2263  case X86::VPTERNLOGDZrmbi:
2264  case X86::VPTERNLOGQZ128rmbi:
2265  case X86::VPTERNLOGQZ256rmbi:
2266  case X86::VPTERNLOGQZrmbi:
2267  case X86::VPTERNLOGDZ128rmbikz:
2268  case X86::VPTERNLOGDZ256rmbikz:
2269  case X86::VPTERNLOGDZrmbikz:
2270  case X86::VPTERNLOGQZ128rmbikz:
2271  case X86::VPTERNLOGQZ256rmbikz:
2272  case X86::VPTERNLOGQZrmbikz: {
2273  auto &WorkingMI = cloneIfNew(MI);
2274  commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
2275  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2276  OpIdx1, OpIdx2);
2277  }
2278  default: {
2279  if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
2280  unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
2281  auto &WorkingMI = cloneIfNew(MI);
2282  WorkingMI.setDesc(get(Opc));
2283  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2284  OpIdx1, OpIdx2);
2285  }
2286 
2287  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2288  MI.getDesc().TSFlags);
2289  if (FMA3Group) {
2290  unsigned Opc =
2291  getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
2292  auto &WorkingMI = cloneIfNew(MI);
2293  WorkingMI.setDesc(get(Opc));
2294  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2295  OpIdx1, OpIdx2);
2296  }
2297 
2298  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2299  }
2300  }
2301 }
2302 
2303 bool
2304 X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2305  unsigned &SrcOpIdx1,
2306  unsigned &SrcOpIdx2,
2307  bool IsIntrinsic) const {
2308  uint64_t TSFlags = MI.getDesc().TSFlags;
2309 
2310  unsigned FirstCommutableVecOp = 1;
2311  unsigned LastCommutableVecOp = 3;
2312  unsigned KMaskOp = -1U;
2313  if (X86II::isKMasked(TSFlags)) {
2314  // For k-zero-masked operations it is Ok to commute the first vector
2315  // operand, unless this is an intrinsic instruction.
2316  // For regular k-masked operations a conservative choice is made, as the
2317  // elements of the first vector operand, for which the corresponding bit
2318  // in the k-mask operand is set to 0, are copied to the result of the
2319  // instruction.
2320  // TODO/FIXME: The commute still may be legal if it is known that the
2321  // k-mask operand is set to either all ones or all zeroes.
2322  // It is also Ok to commute the 1st operand if all users of MI use only
2323  // the elements enabled by the k-mask operand. For example,
2324  // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2325  // : v1[i];
2326  // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2327  // // Ok, to commute v1 in FMADD213PSZrk.
2328 
2329  // The k-mask operand has index = 2 for masked and zero-masked operations.
2330  KMaskOp = 2;
2331 
2332  // The operand with index = 1 is used as a source for those elements for
2333  // which the corresponding bit in the k-mask is set to 0.
2334  if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2335  FirstCommutableVecOp = 3;
2336 
2337  LastCommutableVecOp++;
2338  } else if (IsIntrinsic) {
2339  // Commuting the first operand of an intrinsic instruction isn't possible
2340  // unless we can prove that only the lowest element of the result is used.
2341  FirstCommutableVecOp = 2;
2342  }
2343 
2344  if (isMem(MI, LastCommutableVecOp))
2345  LastCommutableVecOp--;
2346 
2347  // Only the first RegOpsNum operands are commutable.
2348  // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2349  // that the operand is not specified/fixed.
2350  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2351  (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2352  SrcOpIdx1 == KMaskOp))
2353  return false;
2354  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2355  (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2356  SrcOpIdx2 == KMaskOp))
2357  return false;
2358 
2359  // Look for two different register operands assumed to be commutable
2360  // regardless of the FMA opcode. The FMA opcode is adjusted later.
2361  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2362  SrcOpIdx2 == CommuteAnyOperandIndex) {
2363  unsigned CommutableOpIdx2 = SrcOpIdx2;
2364 
2365  // At least one of the operands to be commuted is not specified and
2366  // this method is free to choose appropriate commutable operands.
2367  if (SrcOpIdx1 == SrcOpIdx2)
2368  // Neither operand is fixed. By default, set one of the commutable
2369  // operands to the last register operand of the instruction.
2370  CommutableOpIdx2 = LastCommutableVecOp;
2371  else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2372  // Only one of operands is not fixed.
2373  CommutableOpIdx2 = SrcOpIdx1;
2374 
2375  // CommutableOpIdx2 is well defined now. Let's choose another commutable
2376  // operand and assign its index to CommutableOpIdx1.
2377  Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2378 
2379  unsigned CommutableOpIdx1;
2380  for (CommutableOpIdx1 = LastCommutableVecOp;
2381  CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2382  // Just ignore and skip the k-mask operand.
2383  if (CommutableOpIdx1 == KMaskOp)
2384  continue;
2385 
2386  // The commuted operands must have different registers.
2387  // Otherwise, the commute transformation does not change anything and
2388  // is useless then.
2389  if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2390  break;
2391  }
2392 
2393  // No appropriate commutable operands were found.
2394  if (CommutableOpIdx1 < FirstCommutableVecOp)
2395  return false;
2396 
2397  // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
2398  // to return those values.
2399  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2400  CommutableOpIdx1, CommutableOpIdx2))
2401  return false;
2402  }
2403 
2404  return true;
2405 }
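 // Illustrative note: for a merge-masked three-source instruction (e.g. an
 // FMA of the form dst, src1, kmask, src2, src3 with src1 tied to dst), only
 // operands 3 and 4 are considered commutable here, because operand 1 also
 // supplies the pass-through lanes selected by a zero mask bit. For the
 // zero-masked variant, operand 1 becomes commutable as well.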
2406 
2407 bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2408  unsigned &SrcOpIdx1,
2409  unsigned &SrcOpIdx2) const {
2410  const MCInstrDesc &Desc = MI.getDesc();
2411  if (!Desc.isCommutable())
2412  return false;
2413 
2414  switch (MI.getOpcode()) {
2415  case X86::CMPSDrr:
2416  case X86::CMPSSrr:
2417  case X86::CMPPDrri:
2418  case X86::CMPPSrri:
2419  case X86::VCMPSDrr:
2420  case X86::VCMPSSrr:
2421  case X86::VCMPPDrri:
2422  case X86::VCMPPSrri:
2423  case X86::VCMPPDYrri:
2424  case X86::VCMPPSYrri:
2425  case X86::VCMPSDZrr:
2426  case X86::VCMPSSZrr:
2427  case X86::VCMPPDZrri:
2428  case X86::VCMPPSZrri:
2429  case X86::VCMPSHZrr:
2430  case X86::VCMPPHZrri:
2431  case X86::VCMPPHZ128rri:
2432  case X86::VCMPPHZ256rri:
2433  case X86::VCMPPDZ128rri:
2434  case X86::VCMPPSZ128rri:
2435  case X86::VCMPPDZ256rri:
2436  case X86::VCMPPSZ256rri:
2437  case X86::VCMPPDZrrik:
2438  case X86::VCMPPSZrrik:
2439  case X86::VCMPPDZ128rrik:
2440  case X86::VCMPPSZ128rrik:
2441  case X86::VCMPPDZ256rrik:
2442  case X86::VCMPPSZ256rrik: {
2443  unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2444 
2445  // Float comparison can be safely commuted for
2446  // Ordered/Unordered/Equal/NotEqual tests
2447  unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2448  switch (Imm) {
2449  default:
2450  // EVEX versions can be commuted.
2451  if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2452  break;
2453  return false;
2454  case 0x00: // EQUAL
2455  case 0x03: // UNORDERED
2456  case 0x04: // NOT EQUAL
2457  case 0x07: // ORDERED
2458  break;
2459  }
2460 
2461  // The indices of the commutable operands are 1 and 2 (or 2 and 3
2462  // when masked).
2463  // Assign them to the returned operand indices here.
2464  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2465  2 + OpOffset);
2466  }
2467  case X86::MOVSSrr:
2468  // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2469  // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2470  // AVX implies sse4.1.
2471  if (Subtarget.hasSSE41())
2472  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2473  return false;
2474  case X86::SHUFPDrri:
2475  // We can commute this to MOVSD.
2476  if (MI.getOperand(3).getImm() == 0x02)
2477  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2478  return false;
2479  case X86::MOVHLPSrr:
2480  case X86::UNPCKHPDrr:
2481  case X86::VMOVHLPSrr:
2482  case X86::VUNPCKHPDrr:
2483  case X86::VMOVHLPSZrr:
2484  case X86::VUNPCKHPDZ128rr:
2485  if (Subtarget.hasSSE2())
2486  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2487  return false;
2488  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2489  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2490  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2491  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2492  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2493  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2494  case X86::VPTERNLOGDZrrik:
2495  case X86::VPTERNLOGDZ128rrik:
2496  case X86::VPTERNLOGDZ256rrik:
2497  case X86::VPTERNLOGQZrrik:
2498  case X86::VPTERNLOGQZ128rrik:
2499  case X86::VPTERNLOGQZ256rrik:
2500  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2501  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2502  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2503  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2504  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2505  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2506  case X86::VPTERNLOGDZ128rmbi:
2507  case X86::VPTERNLOGDZ256rmbi:
2508  case X86::VPTERNLOGDZrmbi:
2509  case X86::VPTERNLOGQZ128rmbi:
2510  case X86::VPTERNLOGQZ256rmbi:
2511  case X86::VPTERNLOGQZrmbi:
2512  case X86::VPTERNLOGDZ128rmbikz:
2513  case X86::VPTERNLOGDZ256rmbikz:
2514  case X86::VPTERNLOGDZrmbikz:
2515  case X86::VPTERNLOGQZ128rmbikz:
2516  case X86::VPTERNLOGQZ256rmbikz:
2517  case X86::VPTERNLOGQZrmbikz:
2518  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2519  case X86::VPDPWSSDYrr:
2520  case X86::VPDPWSSDrr:
2521  case X86::VPDPWSSDSYrr:
2522  case X86::VPDPWSSDSrr:
2523  case X86::VPDPWSSDZ128r:
2524  case X86::VPDPWSSDZ128rk:
2525  case X86::VPDPWSSDZ128rkz:
2526  case X86::VPDPWSSDZ256r:
2527  case X86::VPDPWSSDZ256rk:
2528  case X86::VPDPWSSDZ256rkz:
2529  case X86::VPDPWSSDZr:
2530  case X86::VPDPWSSDZrk:
2531  case X86::VPDPWSSDZrkz:
2532  case X86::VPDPWSSDSZ128r:
2533  case X86::VPDPWSSDSZ128rk:
2534  case X86::VPDPWSSDSZ128rkz:
2535  case X86::VPDPWSSDSZ256r:
2536  case X86::VPDPWSSDSZ256rk:
2537  case X86::VPDPWSSDSZ256rkz:
2538  case X86::VPDPWSSDSZr:
2539  case X86::VPDPWSSDSZrk:
2540  case X86::VPDPWSSDSZrkz:
2541  case X86::VPMADD52HUQZ128r:
2542  case X86::VPMADD52HUQZ128rk:
2543  case X86::VPMADD52HUQZ128rkz:
2544  case X86::VPMADD52HUQZ256r:
2545  case X86::VPMADD52HUQZ256rk:
2546  case X86::VPMADD52HUQZ256rkz:
2547  case X86::VPMADD52HUQZr:
2548  case X86::VPMADD52HUQZrk:
2549  case X86::VPMADD52HUQZrkz:
2550  case X86::VPMADD52LUQZ128r:
2551  case X86::VPMADD52LUQZ128rk:
2552  case X86::VPMADD52LUQZ128rkz:
2553  case X86::VPMADD52LUQZ256r:
2554  case X86::VPMADD52LUQZ256rk:
2555  case X86::VPMADD52LUQZ256rkz:
2556  case X86::VPMADD52LUQZr:
2557  case X86::VPMADD52LUQZrk:
2558  case X86::VPMADD52LUQZrkz:
2559  case X86::VFMADDCPHZr:
2560  case X86::VFMADDCPHZrk:
2561  case X86::VFMADDCPHZrkz:
2562  case X86::VFMADDCPHZ128r:
2563  case X86::VFMADDCPHZ128rk:
2564  case X86::VFMADDCPHZ128rkz:
2565  case X86::VFMADDCPHZ256r:
2566  case X86::VFMADDCPHZ256rk:
2567  case X86::VFMADDCPHZ256rkz:
2568  case X86::VFMADDCSHZr:
2569  case X86::VFMADDCSHZrk:
2570  case X86::VFMADDCSHZrkz: {
2571  unsigned CommutableOpIdx1 = 2;
2572  unsigned CommutableOpIdx2 = 3;
2573  if (X86II::isKMasked(Desc.TSFlags)) {
2574  // Skip the mask register.
2575  ++CommutableOpIdx1;
2576  ++CommutableOpIdx2;
2577  }
2578  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2579  CommutableOpIdx1, CommutableOpIdx2))
2580  return false;
2581  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2582  !MI.getOperand(SrcOpIdx2).isReg())
2583  // No idea.
2584  return false;
2585  return true;
2586  }
2587 
2588  default:
2589  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2590  MI.getDesc().TSFlags);
2591  if (FMA3Group)
2592  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
2593  FMA3Group->isIntrinsic());
2594 
2595  // Handle masked instructions since we need to skip over the mask input
2596  // and the preserved input.
2597  if (X86II::isKMasked(Desc.TSFlags)) {
2598  // First assume that the first input is the mask operand and skip past it.
2599  unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
2600  unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
2601  // Check if the first input is tied. If there isn't one then we only
2602  // need to skip the mask operand which we did above.
2603  if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
2604  MCOI::TIED_TO) != -1)) {
2605  // If this is zero masking instruction with a tied operand, we need to
2606  // move the first index back to the first input since this must
2607  // be a 3 input instruction and we want the first two non-mask inputs.
2608  // Otherwise this is a 2 input instruction with a preserved input and
2609  // mask, so we need to move the indices to skip one more input.
2610  if (X86II::isKMergeMasked(Desc.TSFlags)) {
2611  ++CommutableOpIdx1;
2612  ++CommutableOpIdx2;
2613  } else {
2614  --CommutableOpIdx1;
2615  }
2616  }
2617 
2618  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2619  CommutableOpIdx1, CommutableOpIdx2))
2620  return false;
2621 
2622  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2623  !MI.getOperand(SrcOpIdx2).isReg())
2624  // No idea.
2625  return false;
2626  return true;
2627  }
2628 
2629  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2630  }
2631  return false;
2632 }
2633 
2634 static bool isConvertibleLEA(MachineInstr *MI) {
2635  unsigned Opcode = MI->getOpcode();
2636  if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
2637  Opcode != X86::LEA64_32r)
2638  return false;
2639 
2640  const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
2641  const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
2642  const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
2643 
2644  if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
2645  Scale.getImm() > 1)
2646  return false;
2647 
2648  return true;
2649 }
2650 
2651 bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
2652  // Currently we're only interested in the following sequence:
2653  // r3 = lea r1, r2
2654  // r5 = add r3, r4
2655  // Both r3 and r4 are killed in the add; we would prefer the add to have the
2656  // operand order
2657  // r5 = add r4, r3
2658  // So later in X86FixupLEAs the lea instruction can be rewritten as add.
2659  unsigned Opcode = MI.getOpcode();
2660  if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
2661  return false;
2662 
2663  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2664  Register Reg1 = MI.getOperand(1).getReg();
2665  Register Reg2 = MI.getOperand(2).getReg();
2666 
2667  // Check if Reg1 comes from LEA in the same MBB.
2668  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
2669  if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2670  Commute = true;
2671  return true;
2672  }
2673  }
2674 
2675  // Check if Reg2 comes from LEA in the same MBB.
2676  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
2677  if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2678  Commute = false;
2679  return true;
2680  }
2681  }
2682 
2683  return false;
2684 }
2685 
2686 int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
2687  unsigned Opcode = MCID.getOpcode();
2688  if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode)))
2689  return -1;
2690  // Assume that condition code is always the last use operand.
2691  unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
2692  return NumUses - 1;
2693 }
2694 
2695 X86::CondCode X86::getCondFromMI(const MachineInstr &MI) {
2696  const MCInstrDesc &MCID = MI.getDesc();
2697  int CondNo = getCondSrcNoFromDesc(MCID);
2698  if (CondNo < 0)
2699  return X86::COND_INVALID;
2700  CondNo += MCID.getNumDefs();
2701  return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
2702 }
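 // Worked example (illustrative): a JCC_1 has no defs and two uses (target,
 // condition), so getCondSrcNoFromDesc returns 1 and the condition is read
 // from operand 1; a CMOV*rr has one def and three uses, so the condition
 // immediate is found at operand index 3.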
2703 
2704 X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
2705  return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
2706  : X86::COND_INVALID;
2707 }
2708 
2709 X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
2710  return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
2711  : X86::COND_INVALID;
2712 }
2713 
2714 X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
2715  return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
2716  : X86::COND_INVALID;
2717 }
2718 
2719 /// Return the inverse of the specified condition,
2720 /// e.g. turning COND_E to COND_NE.
2721 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
2722  switch (CC) {
2723  default: llvm_unreachable("Illegal condition code!");
2724  case X86::COND_E: return X86::COND_NE;
2725  case X86::COND_NE: return X86::COND_E;
2726  case X86::COND_L: return X86::COND_GE;
2727  case X86::COND_LE: return X86::COND_G;
2728  case X86::COND_G: return X86::COND_LE;
2729  case X86::COND_GE: return X86::COND_L;
2730  case X86::COND_B: return X86::COND_AE;
2731  case X86::COND_BE: return X86::COND_A;
2732  case X86::COND_A: return X86::COND_BE;
2733  case X86::COND_AE: return X86::COND_B;
2734  case X86::COND_S: return X86::COND_NS;
2735  case X86::COND_NS: return X86::COND_S;
2736  case X86::COND_P: return X86::COND_NP;
2737  case X86::COND_NP: return X86::COND_P;
2738  case X86::COND_O: return X86::COND_NO;
2739  case X86::COND_NO: return X86::COND_O;
2740  case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
2741  case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
2742  }
2743 }
2744 
2745 /// Assuming the flags are set by MI(a,b), return the condition code if we
2746 /// modify the instructions such that flags are set by MI(b,a).
2747 X86::CondCode X86::getSwappedCondition(X86::CondCode CC) {
2748  switch (CC) {
2749  default: return X86::COND_INVALID;
2750  case X86::COND_E: return X86::COND_E;
2751  case X86::COND_NE: return X86::COND_NE;
2752  case X86::COND_L: return X86::COND_G;
2753  case X86::COND_LE: return X86::COND_GE;
2754  case X86::COND_G: return X86::COND_L;
2755  case X86::COND_GE: return X86::COND_LE;
2756  case X86::COND_B: return X86::COND_A;
2757  case X86::COND_BE: return X86::COND_AE;
2758  case X86::COND_A: return X86::COND_B;
2759  case X86::COND_AE: return X86::COND_BE;
2760  }
2761 }
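 // Worked example (illustrative): if EFLAGS were set by "CMP a, b" and the
 // user tests COND_L (a < b), then after rewriting the compare as "CMP b, a"
 // the equivalent test is COND_G, as returned above. Unsigned and equality
 // conditions map analogously; conditions that depend on a single operand's
 // sign (e.g. COND_S) cannot be remapped and yield COND_INVALID.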
2762 
2763 std::pair<X86::CondCode, bool>
2764 X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
2765  X86::CondCode CC = X86::COND_INVALID;
2766  bool NeedSwap = false;
2767  switch (Predicate) {
2768  default: break;
2769  // Floating-point Predicates
2770  case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
2771  case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
2772  case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
2773  case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
2774  case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
2775  case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
2776  case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
2777  case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
2778  case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
2779  case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
2780  case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
2781  case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
2782  case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
2783  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
2784 
2785  // Integer Predicates
2786  case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
2787  case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
2788  case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
2789  case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
2790  case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
2791  case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
2792  case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
2793  case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
2794  case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
2795  case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
2796  }
2797 
2798  return std::make_pair(CC, NeedSwap);
2799 }
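 // Worked example (illustrative): FCMP_OLT returns {COND_A, NeedSwap=true},
 // i.e. the caller is expected to swap the compare operands and then test
 // "above", which is the usual way to test ordered less-than with the CF/ZF
 // layout produced by UCOMISS/UCOMISD. FCMP_OEQ and FCMP_UNE each need two
 // flag tests and therefore report COND_INVALID.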
2800 
2801 /// Return a cmov opcode for the given register size in bytes, and operand type.
2802 unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
2803  switch(RegBytes) {
2804  default: llvm_unreachable("Illegal register size!");
2805  case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
2806  case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
2807  case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
2808  }
2809 }
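 // Usage sketch (illustrative): getCMovOpcode(4, /*HasMemoryOperand=*/false)
 // yields X86::CMOV32rr, while getCMovOpcode(8, true) yields X86::CMOV64rm.
 // Any other register size hits the "Illegal register size!" unreachable.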
2810 
2811 /// Get the VPCMP immediate for the given condition.
2812 unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
2813  switch (CC) {
2814  default: llvm_unreachable("Unexpected SETCC condition");
2815  case ISD::SETNE: return 4;
2816  case ISD::SETEQ: return 0;
2817  case ISD::SETULT:
2818  case ISD::SETLT: return 1;
2819  case ISD::SETUGT:
2820  case ISD::SETGT: return 6;
2821  case ISD::SETUGE:
2822  case ISD::SETGE: return 5;
2823  case ISD::SETULE:
2824  case ISD::SETLE: return 2;
2825  }
2826 }
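 // Worked example (illustrative): ISD::SETGT maps to immediate 6, the AVX-512
 // "not less-or-equal" encoding. Signed and unsigned forms share an immediate
 // here because the signedness is carried by the opcode (VPCMP vs VPCMPU),
 // not by the immediate.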
2827 
2828 /// Get the VPCMP immediate if the operands are swapped.
2829 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
2830  switch (Imm) {
2831  default: llvm_unreachable("Unreachable!");
2832  case 0x01: Imm = 0x06; break; // LT -> NLE
2833  case 0x02: Imm = 0x05; break; // LE -> NLT
2834  case 0x05: Imm = 0x02; break; // NLT -> LE
2835  case 0x06: Imm = 0x01; break; // NLE -> LT
2836  case 0x00: // EQ
2837  case 0x03: // FALSE
2838  case 0x04: // NE
2839  case 0x07: // TRUE
2840  break;
2841  }
2842 
2843  return Imm;
2844 }
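 // Worked example (illustrative): a VPCMP with immediate 1 (LT) compares
 // "a < b"; with the operands swapped the same result is "b NLE a", so the
 // immediate becomes 6. EQ/NE/FALSE/TRUE are symmetric and stay unchanged.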
2845 
2846 /// Get the VPCOM immediate if the operands are swapped.
2847 unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
2848  switch (Imm) {
2849  default: llvm_unreachable("Unreachable!");
2850  case 0x00: Imm = 0x02; break; // LT -> GT
2851  case 0x01: Imm = 0x03; break; // LE -> GE
2852  case 0x02: Imm = 0x00; break; // GT -> LT
2853  case 0x03: Imm = 0x01; break; // GE -> LE
2854  case 0x04: // EQ
2855  case 0x05: // NE
2856  case 0x06: // FALSE
2857  case 0x07: // TRUE
2858  break;
2859  }
2860 
2861  return Imm;
2862 }
2863 
2864 /// Get the VCMP immediate if the operands are swapped.
2865 unsigned X86::getSwappedVCMPImm(unsigned Imm) {
2866  // Only need the lower 2 bits to distinguish.
2867  switch (Imm & 0x3) {
2868  default: llvm_unreachable("Unreachable!");
2869  case 0x00: case 0x03:
2870  // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
2871  break;
2872  case 0x01: case 0x02:
2873  // Need to toggle bits 3:0. Bit 4 stays the same.
2874  Imm ^= 0xf;
2875  break;
2876  }
2877 
2878  return Imm;
2879 }
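 // Worked example (illustrative): VCMP immediate 0x01 (LT_OS) becomes 0x0E
 // (GT_OS) when the operands are swapped, and 0x11 (LT_OQ) becomes 0x1E
 // (GT_OQ); bit 4 is left untouched, as the code above only toggles the low
 // four bits.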
2880 
2881 /// Return true if Reg is an X87 register.
2882 static bool isX87Reg(unsigned Reg) {
2883  return (Reg == X86::FPCW || Reg == X86::FPSW ||
2884  (Reg >= X86::ST0 && Reg <= X86::ST7));
2885 }
2886 
2887 /// Check whether the instruction is an X87 instruction.
2888 bool X86::isX87Instruction(MachineInstr &MI) {
2889  for (const MachineOperand &MO : MI.operands()) {
2890  if (!MO.isReg())
2891  continue;
2892  if (isX87Reg(MO.getReg()))
2893  return true;
2894  }
2895  return false;
2896 }
2897 
2898 bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
2899  switch (MI.getOpcode()) {
2900  case X86::TCRETURNdi:
2901  case X86::TCRETURNri:
2902  case X86::TCRETURNmi:
2903  case X86::TCRETURNdi64:
2904  case X86::TCRETURNri64:
2905  case X86::TCRETURNmi64:
2906  return true;
2907  default:
2908  return false;
2909  }
2910 }
2911 
2912 bool X86InstrInfo::canMakeTailCallConditional(
2913  SmallVectorImpl<MachineOperand> &BranchCond,
2914  const MachineInstr &TailCall) const {
2915  if (TailCall.getOpcode() != X86::TCRETURNdi &&
2916  TailCall.getOpcode() != X86::TCRETURNdi64) {
2917  // Only direct calls can be done with a conditional branch.
2918  return false;
2919  }
2920 
2921  const MachineFunction *MF = TailCall.getParent()->getParent();
2922  if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
2923  // Conditional tail calls confuse the Win64 unwinder.
2924  return false;
2925  }
2926 
2927  assert(BranchCond.size() == 1);
2928  if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
2929  // Can't make a conditional tail call with this condition.
2930  return false;
2931  }
2932 
2933  const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
2934  if (X86FI->getTCReturnAddrDelta() != 0 ||
2935  TailCall.getOperand(1).getImm() != 0) {
2936  // A conditional tail call cannot do any stack adjustment.
2937  return false;
2938  }
2939 
2940  return true;
2941 }
2942 
2943 void X86InstrInfo::replaceBranchWithTailCall(
2944  MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
2945  const MachineInstr &TailCall) const {
2946  assert(canMakeTailCallConditional(BranchCond, TailCall));
2947 
2948  MachineBasicBlock::iterator I = MBB.end();
2949  while (I != MBB.begin()) {
2950  --I;
2951  if (I->isDebugInstr())
2952  continue;
2953  if (!I->isBranch())
2954  assert(0 && "Can't find the branch to replace!");
2955 
2956  X86::CondCode CC = X86::getCondFromBranch(*I);
2957  assert(BranchCond.size() == 1);
2958  if (CC != BranchCond[0].getImm())
2959  continue;
2960 
2961  break;
2962  }
2963 
2964  unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
2965  : X86::TCRETURNdi64cc;
2966 
2967  auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
2968  MIB->addOperand(TailCall.getOperand(0)); // Destination.
2969  MIB.addImm(0); // Stack offset (not used).
2970  MIB->addOperand(BranchCond[0]); // Condition.
2971  MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
2972 
2973  // Add implicit uses and defs of all live regs potentially clobbered by the
2974  // call. This way they still appear live across the call.
2975  LivePhysRegs LiveRegs(getRegisterInfo());
2976  LiveRegs.addLiveOuts(MBB);
2977  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
2978  LiveRegs.stepForward(*MIB, Clobbers);
2979  for (const auto &C : Clobbers) {
2980  MIB.addReg(C.first, RegState::Implicit);
2981  MIB.addReg(C.first, RegState::Implicit | RegState::Define);
2982  }
2983 
2984  I->eraseFromParent();
2985 }
2986 
2987 // Given an MBB and its TBB, find the FBB which was a fallthrough MBB (it may
2988 // not be a fallthrough MBB now due to layout changes). Return nullptr if the
2989 // fallthrough MBB cannot be identified.
2990 static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
2991  MachineBasicBlock *TBB) {
2992  // Look for non-EHPad successors other than TBB. If we find exactly one, it
2993  // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
2994  // and fallthrough MBB. If we find more than one, we cannot identify the
2995  // fallthrough MBB and should return nullptr.
2996  MachineBasicBlock *FallthroughBB = nullptr;
2997  for (MachineBasicBlock *Succ : MBB->successors()) {
2998  if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
2999  continue;
3000  // Return a nullptr if we found more than one fallthrough successor.
3001  if (FallthroughBB && FallthroughBB != TBB)
3002  return nullptr;
3003  FallthroughBB = Succ;
3004  }
3005  return FallthroughBB;
3006 }
3007 
3008 bool X86InstrInfo::AnalyzeBranchImpl(
3009  MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
3010  SmallVectorImpl<MachineOperand> &Cond,
3011  SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3012 
3013  // Start from the bottom of the block and work up, examining the
3014  // terminator instructions.
3015  MachineBasicBlock::iterator I = MBB.end();
3016  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3017  while (I != MBB.begin()) {
3018  --I;
3019  if (I->isDebugInstr())
3020  continue;
3021 
3022  // Working from the bottom, when we see a non-terminator instruction, we're
3023  // done.
3024  if (!isUnpredicatedTerminator(*I))
3025  break;
3026 
3027  // A terminator that isn't a branch can't easily be handled by this
3028  // analysis.
3029  if (!I->isBranch())
3030  return true;
3031 
3032  // Handle unconditional branches.
3033  if (I->getOpcode() == X86::JMP_1) {
3034  UnCondBrIter = I;
3035 
3036  if (!AllowModify) {
3037  TBB = I->getOperand(0).getMBB();
3038  continue;
3039  }
3040 
3041  // If the block has any instructions after a JMP, delete them.
3042  while (std::next(I) != MBB.end())
3043  std::next(I)->eraseFromParent();
3044 
3045  Cond.clear();
3046  FBB = nullptr;
3047 
3048  // Delete the JMP if it's equivalent to a fall-through.
3049  if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3050  TBB = nullptr;
3051  I->eraseFromParent();
3052  I = MBB.end();
3053  UnCondBrIter = MBB.end();
3054  continue;
3055  }
3056 
3057  // TBB is used to indicate the unconditional destination.
3058  TBB = I->getOperand(0).getMBB();
3059  continue;
3060  }
3061 
3062  // Handle conditional branches.
3063  X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3064  if (BranchCode == X86::COND_INVALID)
3065  return true; // Can't handle indirect branch.
3066 
3067  // In practice we should never have an undef eflags operand, if we do
3068  // abort here as we are not prepared to preserve the flag.
3069  if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
3070  return true;
3071 
3072  // Working from the bottom, handle the first conditional branch.
3073  if (Cond.empty()) {
3074  MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
3075  if (AllowModify && UnCondBrIter != MBB.end() &&
3076  MBB.isLayoutSuccessor(TargetBB)) {
3077  // If we can modify the code and it ends in something like:
3078  //
3079  // jCC L1
3080  // jmp L2
3081  // L1:
3082  // ...
3083  // L2:
3084  //
3085  // Then we can change this to:
3086  //
3087  // jnCC L2
3088  // L1:
3089  // ...
3090  // L2:
3091  //
3092  // Which is a bit more efficient.
3093  // We conditionally jump to the fall-through block.
3094  BranchCode = GetOppositeBranchCondition(BranchCode);
3095  MachineBasicBlock::iterator OldInst = I;
3096 
3097  BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1))
3098  .addMBB(UnCondBrIter->getOperand(0).getMBB())
3099  .addImm(BranchCode);
3100  BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
3101  .addMBB(TargetBB);
3102 
3103  OldInst->eraseFromParent();
3104  UnCondBrIter->eraseFromParent();
3105 
3106  // Restart the analysis.
3107  UnCondBrIter = MBB.end();
3108  I = MBB.end();
3109  continue;
3110  }
3111 
3112  FBB = TBB;
3113  TBB = I->getOperand(0).getMBB();
3114  Cond.push_back(MachineOperand::CreateImm(BranchCode));
3115  CondBranches.push_back(&*I);
3116  continue;
3117  }
3118 
3119  // Handle subsequent conditional branches. Only handle the case where all
3120  // conditional branches branch to the same destination and their condition
3121  // opcodes fit one of the special multi-branch idioms.
3122  assert(Cond.size() == 1);
3123  assert(TBB);
3124 
3125  // If the conditions are the same, we can leave them alone.
3126  X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3127  auto NewTBB = I->getOperand(0).getMBB();
3128  if (OldBranchCode == BranchCode && TBB == NewTBB)
3129  continue;
3130 
3131  // If they differ, see if they fit one of the known patterns. Theoretically,
3132  // we could handle more patterns here, but we shouldn't expect to see them
3133  // if instruction selection has done a reasonable job.
3134  if (TBB == NewTBB &&
3135  ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3136  (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3137  BranchCode = X86::COND_NE_OR_P;
3138  } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3139  (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3140  if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3141  return true;
3142 
3143  // X86::COND_E_AND_NP usually has two different branch destinations.
3144  //
3145  // JP B1
3146  // JE B2
3147  // JMP B1
3148  // B1:
3149  // B2:
3150  //
3151  // Here this condition branches to B2 only if NP && E. It has another
3152  // equivalent form:
3153  //
3154  // JNE B1
3155  // JNP B2
3156  // JMP B1
3157  // B1:
3158  // B2:
3159  //
3160  // Similarly it branches to B2 only if E && NP. That is why this condition
3161  // is named with COND_E_AND_NP.
3162  BranchCode = X86::COND_E_AND_NP;
3163  } else
3164  return true;
3165 
3166  // Update the MachineOperand.
3167  Cond[0].setImm(BranchCode);
3168  CondBranches.push_back(&*I);
3169  }
3170 
3171  return false;
3172 }
3173 
3174 bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
3175  MachineBasicBlock *&TBB,
3176  MachineBasicBlock *&FBB,
3177  SmallVectorImpl<MachineOperand> &Cond,
3178  bool AllowModify) const {
3179  SmallVector<MachineInstr *, 4> CondBranches;
3180  return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3181 }
3182 
3183 bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
3184  MachineBranchPredicate &MBP,
3185  bool AllowModify) const {
3186  using namespace std::placeholders;
3187 
3188  SmallVector<MachineOperand, 4> Cond;
3189  SmallVector<MachineInstr *, 4> CondBranches;
3190  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
3191  AllowModify))
3192  return true;
3193 
3194  if (Cond.size() != 1)
3195  return true;
3196 
3197  assert(MBP.TrueDest && "expected!");
3198 
3199  if (!MBP.FalseDest)
3200  MBP.FalseDest = MBB.getNextNode();
3201 
3202  const TargetRegisterInfo *TRI = &getRegisterInfo();
3203 
3204  MachineInstr *ConditionDef = nullptr;
3205  bool SingleUseCondition = true;
3206 
3208  if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
3209  ConditionDef = &MI;
3210  break;
3211  }
3212 
3213  if (MI.readsRegister(X86::EFLAGS, TRI))
3214  SingleUseCondition = false;
3215  }
3216 
3217  if (!ConditionDef)
3218  return true;
3219 
3220  if (SingleUseCondition) {
3221  for (auto *Succ : MBB.successors())
3222  if (Succ->isLiveIn(X86::EFLAGS))
3223  SingleUseCondition = false;
3224  }
3225 
3226  MBP.ConditionDef = ConditionDef;
3227  MBP.SingleUseCondition = SingleUseCondition;
3228 
3229  // Currently we only recognize the simple pattern:
3230  //
3231  // test %reg, %reg
3232  // je %label
3233  //
3234  const unsigned TestOpcode =
3235  Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
3236 
3237  if (ConditionDef->getOpcode() == TestOpcode &&
3238  ConditionDef->getNumOperands() == 3 &&
3239  ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
3240  (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
3241  MBP.LHS = ConditionDef->getOperand(0);
3242  MBP.RHS = MachineOperand::CreateImm(0);
3243  MBP.Predicate = Cond[0].getImm() == X86::COND_NE
3244  ? MachineBranchPredicate::PRED_NE
3245  : MachineBranchPredicate::PRED_EQ;
3246  return false;
3247  }
3248 
3249  return true;
3250 }
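// Sketch of the one shape recognized above, assuming a block such as
// (register and block names illustrative):
//
//   TEST32rr %reg, %reg, implicit-def $eflags
//   JCC_1 %bb.2, X86::COND_E
//
// On success MBP.LHS is %reg, MBP.RHS is the immediate 0, MBP.Predicate is
// PRED_EQ (PRED_NE for COND_NE), and MBP.ConditionDef points at the TEST.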
3251 
3252 unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
3253  int *BytesRemoved) const {
3254  assert(!BytesRemoved && "code size not handled");
3255 
3256  MachineBasicBlock::iterator I = MBB.end();
3257  unsigned Count = 0;
3258 
3259  while (I != MBB.begin()) {
3260  --I;
3261  if (I->isDebugInstr())
3262  continue;
3263  if (I->getOpcode() != X86::JMP_1 &&
3264  X86::getCondFromBranch(*I) == X86::COND_INVALID)
3265  break;
3266  // Remove the branch.
3267  I->eraseFromParent();
3268  I = MBB.end();
3269  ++Count;
3270  }
3271 
3272  return Count;
3273 }
3274 
3275 unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
3276  MachineBasicBlock *TBB,
3277  MachineBasicBlock *FBB,
3278  ArrayRef<MachineOperand> Cond,
3279  const DebugLoc &DL,
3280  int *BytesAdded) const {
3281  // Shouldn't be a fall through.
3282  assert(TBB && "insertBranch must not be told to insert a fallthrough");
3283  assert((Cond.size() == 1 || Cond.size() == 0) &&
3284  "X86 branch conditions have one component!");
3285  assert(!BytesAdded && "code size not handled");
3286 
3287  if (Cond.empty()) {
3288  // Unconditional branch?
3289  assert(!FBB && "Unconditional branch with multiple successors!");
3290  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
3291  return 1;
3292  }
3293 
3294  // If FBB is null, it is implied to be a fall-through block.
3295  bool FallThru = FBB == nullptr;
3296 
3297  // Conditional branch.
3298  unsigned Count = 0;
3299  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
3300  switch (CC) {
3301  case X86::COND_NE_OR_P:
3302  // Synthesize NE_OR_P with two branches.
3303  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
3304  ++Count;
3305  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
3306  ++Count;
3307  break;
3308  case X86::COND_E_AND_NP:
3309  // Use the next block of MBB as FBB if it is null.
3310  if (FBB == nullptr) {
3311  FBB = getFallThroughMBB(&MBB, TBB);
3312  assert(FBB && "MBB cannot be the last block in function when the false "
3313  "body is a fall-through.");
3314  }
3315  // Synthesize COND_E_AND_NP with two branches.
3316  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
3317  ++Count;
3318  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
3319  ++Count;
3320  break;
3321  default: {
3322  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
3323  ++Count;
3324  }
3325  }
3326  if (!FallThru) {
3327  // Two-way Conditional branch. Insert the second branch.
3328  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
3329  ++Count;
3330  }
3331  return Count;
3332 }
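// Sketch of the compound-condition lowering above: a request to insert a
// COND_NE_OR_P branch to %bb.7 with an explicit false block %bb.3 (block
// names illustrative) becomes three instructions and a return value of 3:
//
//   JCC_1 %bb.7, X86::COND_NE
//   JCC_1 %bb.7, X86::COND_P
//   JMP_1 %bb.3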
3333 
3334 bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3335  ArrayRef<MachineOperand> Cond,
3336  Register DstReg, Register TrueReg,
3337  Register FalseReg, int &CondCycles,
3338  int &TrueCycles, int &FalseCycles) const {
3339  // Not all subtargets have cmov instructions.
3340  if (!Subtarget.canUseCMOV())
3341  return false;
3342  if (Cond.size() != 1)
3343  return false;
3344  // We cannot do the composite conditions, at least not in SSA form.
3345  if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
3346  return false;
3347 
3348  // Check register classes.
3349  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3350  const TargetRegisterClass *RC =
3351  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
3352  if (!RC)
3353  return false;
3354 
3355  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
3356  if (X86::GR16RegClass.hasSubClassEq(RC) ||
3357  X86::GR32RegClass.hasSubClassEq(RC) ||
3358  X86::GR64RegClass.hasSubClassEq(RC)) {
3359  // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
3360  // Bridge. Probably Ivy Bridge as well.
3361  CondCycles = 2;
3362  TrueCycles = 2;
3363  FalseCycles = 2;
3364  return true;
3365  }
3366 
3367  // Can't do vectors.
3368  return false;
3369 }
3370 
3371 void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
3372  MachineBasicBlock::iterator I,
3373  const DebugLoc &DL, Register DstReg,
3374  ArrayRef<MachineOperand> Cond, Register TrueReg,
3375  Register FalseReg) const {
3376  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3377  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
3378  const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
3379  assert(Cond.size() == 1 && "Invalid Cond array");
3380  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
3381  false /*HasMemoryOperand*/);
3382  BuildMI(MBB, I, DL, get(Opc), DstReg)
3383  .addReg(FalseReg)
3384  .addReg(TrueReg)
3385  .addImm(Cond[0].getImm());
3386 }
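// Sketch: for a 32-bit select on COND_B the call above produces a single
// conditional move (virtual register names illustrative):
//
//   %dst:gr32 = CMOV32rr %false, %true, X86::COND_B
//
// i.e. %dst receives %true when the condition holds and %false otherwise.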
3387 
3388 /// Test if the given register is a physical h register.
3389 static bool isHReg(unsigned Reg) {
3390  return X86::GR8_ABCD_HRegClass.contains(Reg);
3391 }
3392 
3393 // Try and copy between VR128/VR64 and GR64 registers.
3394 static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
3395  const X86Subtarget &Subtarget) {
3396  bool HasAVX = Subtarget.hasAVX();
3397  bool HasAVX512 = Subtarget.hasAVX512();
3398 
3399  // SrcReg(MaskReg) -> DestReg(GR64)
3400  // SrcReg(MaskReg) -> DestReg(GR32)
3401 
3402  // All KMASK RegClasses hold the same k registers, so we can test against any one of them.
3403  if (X86::VK16RegClass.contains(SrcReg)) {
3404  if (X86::GR64RegClass.contains(DestReg)) {
3405  assert(Subtarget.hasBWI());
3406  return X86::KMOVQrk;
3407  }
3408  if (X86::GR32RegClass.contains(DestReg))
3409  return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
3410  }
3411 
3412  // SrcReg(GR64) -> DestReg(MaskReg)
3413  // SrcReg(GR32) -> DestReg(MaskReg)
3414 
3415  // All KMASK RegClasses hold the same k registers, so we can test against any one of them.
3416  if (X86::VK16RegClass.contains(DestReg)) {
3417  if (X86::GR64RegClass.contains(SrcReg)) {
3418  assert(Subtarget.hasBWI());
3419  return X86::KMOVQkr;
3420  }
3421  if (X86::GR32RegClass.contains(SrcReg))
3422  return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
3423  }
3424 
3425 
3426  // SrcReg(VR128) -> DestReg(GR64)
3427  // SrcReg(VR64) -> DestReg(GR64)
3428  // SrcReg(GR64) -> DestReg(VR128)
3429  // SrcReg(GR64) -> DestReg(VR64)
3430 
3431  if (X86::GR64RegClass.contains(DestReg)) {
3432  if (X86::VR128XRegClass.contains(SrcReg))
3433  // Copy from a VR128 register to a GR64 register.
3434  return HasAVX512 ? X86::VMOVPQIto64Zrr :
3435  HasAVX ? X86::VMOVPQIto64rr :
3436  X86::MOVPQIto64rr;
3437  if (X86::VR64RegClass.contains(SrcReg))
3438  // Copy from a VR64 register to a GR64 register.
3439  return X86::MMX_MOVD64from64rr;
3440  } else if (X86::GR64RegClass.contains(SrcReg)) {
3441  // Copy from a GR64 register to a VR128 register.
3442  if (X86::VR128XRegClass.contains(DestReg))
3443  return HasAVX512 ? X86::VMOV64toPQIZrr :
3444  HasAVX ? X86::VMOV64toPQIrr :
3445  X86::MOV64toPQIrr;
3446  // Copy from a GR64 register to a VR64 register.
3447  if (X86::VR64RegClass.contains(DestReg))
3448  return X86::MMX_MOVD64to64rr;
3449  }
3450 
3451  // SrcReg(VR128) -> DestReg(GR32)
3452  // SrcReg(GR32) -> DestReg(VR128)
3453 
3454  if (X86::GR32RegClass.contains(DestReg) &&
3455  X86::VR128XRegClass.contains(SrcReg))
3456  // Copy from a VR128 register to a GR32 register.
3457  return HasAVX512 ? X86::VMOVPDI2DIZrr :
3458  HasAVX ? X86::VMOVPDI2DIrr :
3459  X86::MOVPDI2DIrr;
3460 
3461  if (X86::VR128XRegClass.contains(DestReg) &&
3462  X86::GR32RegClass.contains(SrcReg))
3463  // Copy from a GR32 register to a VR128 register.
3464  return HasAVX512 ? X86::VMOVDI2PDIZrr :
3465  HasAVX ? X86::VMOVDI2PDIrr :
3466  X86::MOVDI2PDIrr;
3467  return 0;
3468 }
3469 
3470 void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
3471  MachineBasicBlock::iterator MI,
3472  const DebugLoc &DL, MCRegister DestReg,
3473  MCRegister SrcReg, bool KillSrc) const {
3474  // First deal with the normal symmetric copies.
3475  bool HasAVX = Subtarget.hasAVX();
3476  bool HasVLX = Subtarget.hasVLX();
3477  unsigned Opc = 0;
3478  if (X86::GR64RegClass.contains(DestReg, SrcReg))
3479  Opc = X86::MOV64rr;
3480  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
3481  Opc = X86::MOV32rr;
3482  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
3483  Opc = X86::MOV16rr;
3484  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
3485  // Copying to or from a physical H register on x86-64 requires a NOREX
3486  // move. Otherwise use a normal move.
3487  if ((isHReg(DestReg) || isHReg(SrcReg)) &&
3488  Subtarget.is64Bit()) {
3489  Opc = X86::MOV8rr_NOREX;
3490  // Both operands must be encodable without an REX prefix.
3491  assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
3492  "8-bit H register can not be copied outside GR8_NOREX");
3493  } else
3494  Opc = X86::MOV8rr;
3495  }
3496  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
3497  Opc = X86::MMX_MOVQ64rr;
3498  else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
3499  if (HasVLX)
3500  Opc = X86::VMOVAPSZ128rr;
3501  else if (X86::VR128RegClass.contains(DestReg, SrcReg))
3502  Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
3503  else {
3504  // If this is an extended register and we don't have VLX, we need to use a
3505  // 512-bit move.
3506  Opc = X86::VMOVAPSZrr;
3507  const TargetRegisterInfo *TRI = &getRegisterInfo();
3508  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
3509  &X86::VR512RegClass);
3510  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
3511  &X86::VR512RegClass);
3512  }
3513  } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
3514  if (HasVLX)
3515  Opc = X86::VMOVAPSZ256rr;
3516  else if (X86::VR256RegClass.contains(DestReg, SrcReg))
3517  Opc = X86::VMOVAPSYrr;
3518  else {
3519  // If this is an extended register and we don't have VLX, we need to use a
3520  // 512-bit move.
3521  Opc = X86::VMOVAPSZrr;
3522  const TargetRegisterInfo *TRI = &getRegisterInfo();
3523  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
3524  &X86::VR512RegClass);
3525  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
3526  &X86::VR512RegClass);
3527  }
3528  } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
3529  Opc = X86::VMOVAPSZrr;
3530  // All KMASK RegClasses hold the same k registers, so we can test against any one of them.
3531  else if (X86::VK16RegClass.contains(DestReg, SrcReg))
3532  Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
3533  if (!Opc)
3534  Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
3535 
3536  if (Opc) {
3537  BuildMI(MBB, MI, DL, get(Opc), DestReg)
3538  .addReg(SrcReg, getKillRegState(KillSrc));
3539  return;
3540  }
3541 
3542  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
3543  // FIXME: We use a fatal error here because historically LLVM has tried
3544  // to lower some of these physreg copies and we want to ensure we get
3545  // reasonable bug reports if someone encounters a case no other testing
3546  // found. This path should be removed after the LLVM 7 release.
3547  report_fatal_error("Unable to copy EFLAGS physical register!");
3548  }
3549 
3550  LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
3551  << RI.getName(DestReg) << '\n');
3552  report_fatal_error("Cannot emit physreg copy instruction");
3553 }
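// Sketch of typical opcode choices made above (physical registers
// illustrative):
//
//   $eax  <- $ecx   : $eax = MOV32rr $ecx
//   $xmm1 <- $xmm2  : $xmm1 = MOVAPSrr $xmm2      (SSE only, no AVX/VLX)
//   $k1   <- $k2    : $k1 = KMOVWkk $k2           (AVX512 without BWI)
//
// Copies that would touch EFLAGS are rejected with the fatal error above.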
3554 
3555 Optional<DestSourcePair>
3556 X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
3557  if (MI.isMoveReg())
3558  return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
3559  return None;
3560 }
3561 
3562 static unsigned getLoadStoreRegOpcode(Register Reg,
3563  const TargetRegisterClass *RC,
3564  bool IsStackAligned,
3565  const X86Subtarget &STI, bool load) {
3566  bool HasAVX = STI.hasAVX();
3567  bool HasAVX512 = STI.hasAVX512();
3568  bool HasVLX = STI.hasVLX();
3569 
3570  switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
3571  default:
3572  llvm_unreachable("Unknown spill size");
3573  case 1:
3574  assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
3575  if (STI.is64Bit())
3576  // Copying to or from a physical H register on x86-64 requires a NOREX
3577  // move. Otherwise use a normal move.
3578  if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
3579  return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
3580  return load ? X86::MOV8rm : X86::MOV8mr;
3581  case 2:
3582  if (X86::VK16RegClass.hasSubClassEq(RC))
3583  return load ? X86::KMOVWkm : X86::KMOVWmk;
3584  if (X86::FR16XRegClass.hasSubClassEq(RC)) {
3585  assert(STI.hasFP16());
3586  return load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
3587  }
3588  assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
3589  return load ? X86::MOV16rm : X86::MOV16mr;
3590  case 4:
3591  if (X86::GR32RegClass.hasSubClassEq(RC))
3592  return load ? X86::MOV32rm : X86::MOV32mr;
3593  if (X86::FR32XRegClass.hasSubClassEq(RC))
3594  return load ?
3595  (HasAVX512 ? X86::VMOVSSZrm_alt :
3596  HasAVX ? X86::VMOVSSrm_alt :
3597  X86::MOVSSrm_alt) :
3598  (HasAVX512 ? X86::VMOVSSZmr :
3599  HasAVX ? X86::VMOVSSmr :
3600  X86::MOVSSmr);
3601  if (X86::RFP32RegClass.hasSubClassEq(RC))
3602  return load ? X86::LD_Fp32m : X86::ST_Fp32m;
3603  if (X86::VK32RegClass.hasSubClassEq(RC)) {
3604  assert(STI.hasBWI() && "KMOVD requires BWI");
3605  return load ? X86::KMOVDkm : X86::KMOVDmk;
3606  }
3607  // All of these mask pair classes have the same spill size, so the same kind
3608  // of kmov instructions can be used with all of them.
3609  if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
3610  X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
3611  X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
3612  X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
3613  X86::VK16PAIRRegClass.hasSubClassEq(RC))
3614  return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
3615  llvm_unreachable("Unknown 4-byte regclass");
3616  case 8:
3617  if (X86::GR64RegClass.hasSubClassEq(RC))
3618  return load ? X86::MOV64rm : X86::MOV64mr;
3619  if (X86::FR64XRegClass.hasSubClassEq(RC))
3620  return load ?
3621  (HasAVX512 ? X86::VMOVSDZrm_alt :
3622  HasAVX ? X86::VMOVSDrm_alt :
3623  X86::MOVSDrm_alt) :
3624  (HasAVX512 ? X86::VMOVSDZmr :
3625  HasAVX ? X86::VMOVSDmr :
3626  X86::MOVSDmr);
3627  if (X86::VR64RegClass.hasSubClassEq(RC))
3628  return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
3629  if (X86::RFP64RegClass.hasSubClassEq(RC))
3630  return load ? X86::LD_Fp64m : X86::ST_Fp64m;
3631  if (X86::VK64RegClass.hasSubClassEq(RC)) {
3632  assert(STI.hasBWI() && "KMOVQ requires BWI");
3633  return load ? X86::KMOVQkm : X86::KMOVQmk;
3634  }
3635  llvm_unreachable("Unknown 8-byte regclass");
3636  case 10:
3637  assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
3638  return load ? X86::LD_Fp80m : X86::ST_FpP80m;
3639  case 16: {
3640  if (X86::VR128XRegClass.hasSubClassEq(RC)) {
3641  // If stack is realigned we can use aligned stores.
3642  if (IsStackAligned)
3643  return load ?
3644  (HasVLX ? X86::VMOVAPSZ128rm :
3645  HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
3646  HasAVX ? X86::VMOVAPSrm :
3647  X86::MOVAPSrm):
3648  (HasVLX ? X86::VMOVAPSZ128mr :
3649  HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
3650  HasAVX ? X86::VMOVAPSmr :
3651  X86::MOVAPSmr);
3652  else
3653  return load ?
3654  (HasVLX ? X86::VMOVUPSZ128rm :
3655  HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
3656  HasAVX ? X86::VMOVUPSrm :
3657  X86::MOVUPSrm):
3658  (HasVLX ? X86::VMOVUPSZ128mr :
3659  HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
3660  HasAVX ? X86::VMOVUPSmr :
3661  X86::MOVUPSmr);
3662  }
3663  llvm_unreachable("Unknown 16-byte regclass");
3664  }
3665  case 32:
3666  assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
3667  // If stack is realigned we can use aligned stores.
3668  if (IsStackAligned)
3669  return load ?
3670  (HasVLX ? X86::VMOVAPSZ256rm :
3671  HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
3672  X86::VMOVAPSYrm) :
3673  (HasVLX ? X86::VMOVAPSZ256mr :
3674  HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
3675  X86::VMOVAPSYmr);
3676  else
3677  return load ?
3678  (HasVLX ? X86::VMOVUPSZ256rm :
3679  HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
3680  X86::VMOVUPSYrm) :
3681  (HasVLX ? X86::VMOVUPSZ256mr :
3682  HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
3683  X86::VMOVUPSYmr);
3684  case 64:
3685  assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
3686  assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
3687  if (IsStackAligned)
3688  return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
3689  else
3690  return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
3691  }
3692 }
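// Sketch of the 32-byte (YMM) case above, store/load opcode pairs by
// subtarget:
//
//   aligned slot,   VLX       : VMOVAPSZ256mr / VMOVAPSZ256rm
//   aligned slot,   AVX only  : VMOVAPSYmr    / VMOVAPSYrm
//   unaligned slot, AVX only  : VMOVUPSYmr    / VMOVUPSYrm
//
// AVX512 without VLX falls back to the *_NOVLX pseudo forms listed above.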
3693 
3694 Optional<ExtAddrMode>
3695 X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3696  const TargetRegisterInfo *TRI) const {
3697  const MCInstrDesc &Desc = MemI.getDesc();
3698  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3699  if (MemRefBegin < 0)
3700  return None;
3701 
3702  MemRefBegin += X86II::getOperandBias(Desc);
3703 
3704  auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
3705  if (!BaseOp.isReg()) // Can be an MO_FrameIndex
3706  return None;
3707 
3708  const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
3709  // Displacement can be symbolic
3710  if (!DispMO.isImm())
3711  return None;
3712 
3713  ExtAddrMode AM;
3714  AM.BaseReg = BaseOp.getReg();
3715  AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
3716  AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
3717  AM.Displacement = DispMO.getImm();
3718  return AM;
3719 }
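// Sketch: for a load such as (operands illustrative)
//
//   %val = MOV32rm %base, 4, %index, 20, $noreg
//
// the returned ExtAddrMode is { BaseReg = %base, ScaledReg = %index,
// Scale = 4, Displacement = 20 }, matching the x86 base + scale*index + disp
// addressing form.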
3720 
3721 bool X86InstrInfo::verifyInstruction(const MachineInstr &MI,
3722  StringRef &ErrInfo) const {
3723  Optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
3724  if (!AMOrNone)
3725  return true;
3726 
3727  ExtAddrMode AM = *AMOrNone;
3728 
3729  if (AM.ScaledReg != X86::NoRegister) {
3730  switch (AM.Scale) {
3731  case 1:
3732  case 2:
3733  case 4:
3734  case 8:
3735  break;
3736  default:
3737  ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
3738  return false;
3739  }
3740  }
3741  if (!isInt<32>(AM.Displacement)) {
3742  ErrInfo = "Displacement in address must fit into 32-bit signed "
3743  "integer";
3744  return false;
3745  }
3746 
3747  return true;
3748 }
3749 
3750 bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
3751  const Register Reg,
3752  int64_t &ImmVal) const {
3753  if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
3754  return false;
3755  // Mov Src can be a global address.
3756  if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
3757  return false;
3758  ImmVal = MI.getOperand(1).getImm();
3759  return true;
3760 }
3761 
3762 bool X86InstrInfo::preservesZeroValueInReg(
3763  const MachineInstr *MI, const Register NullValueReg,
3764  const TargetRegisterInfo *TRI) const {
3765  if (!MI->modifiesRegister(NullValueReg, TRI))
3766  return true;
3767  switch (MI->getOpcode()) {
3768  // A right or left shift of a null register into itself is still null,
3769  // i.e. rax = shl rax, X.
3770  case X86::SHR64ri:
3771  case X86::SHR32ri:
3772  case X86::SHL64ri:
3773  case X86::SHL32ri:
3774  assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
3775  "expected for shift opcode!");
3776  return MI->getOperand(0).getReg() == NullValueReg &&
3777  MI->getOperand(1).getReg() == NullValueReg;
3778  // Zero extend of a sub-reg of NullValueReg into itself does not change the
3779  // null value.
3780  case X86::MOV32rr:
3781  return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
3782  return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
3783  });
3784  default:
3785  return false;
3786  }
3787  llvm_unreachable("Should be handled above!");
3788 }
3789 
3790 bool X86InstrInfo::getMemOperandsWithOffsetWidth(
3791  const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
3792  int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
3793  const TargetRegisterInfo *TRI) const {
3794  const MCInstrDesc &Desc = MemOp.getDesc();
3795  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3796  if (MemRefBegin < 0)
3797  return false;
3798 
3799  MemRefBegin += X86II::getOperandBias(Desc);
3800 
3801  const MachineOperand *BaseOp =
3802  &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
3803  if (!BaseOp->isReg()) // Can be an MO_FrameIndex
3804  return false;
3805 
3806  if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
3807  return false;
3808 
3809  if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
3810  X86::NoRegister)
3811  return false;
3812 
3813  const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
3814 
3815  // Displacement can be symbolic
3816  if (!DispMO.isImm())
3817  return false;
3818 
3819  Offset = DispMO.getImm();
3820 
3821  if (!BaseOp->isReg())
3822  return false;
3823 
3824  OffsetIsScalable = false;
3825  // FIXME: Relying on memoperands() may not be the right thing to do here. Check
3826  // with X86 maintainers, and fix it accordingly. For now, it is ok, since
3827  // there is no use of `Width` for X86 back-end at the moment.
3828  Width =
3829  !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
3830  BaseOps.push_back(BaseOp);
3831  return true;
3832 }
3833 
3834 static unsigned getStoreRegOpcode(Register SrcReg,
3835  const TargetRegisterClass *RC,
3836  bool IsStackAligned,
3837  const X86Subtarget &STI) {
3838  return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
3839 }
3840 
3841 static unsigned getLoadRegOpcode(Register DestReg,
3842  const TargetRegisterClass *RC,
3843  bool IsStackAligned, const X86Subtarget &STI) {
3844  return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
3845 }
3846 
3847 void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
3848  MachineBasicBlock::iterator MI,
3849  Register SrcReg, bool isKill, int FrameIdx,
3850  const TargetRegisterClass *RC,
3851  const TargetRegisterInfo *TRI) const {
3852  const MachineFunction &MF = *MBB.getParent();
3853  const MachineFrameInfo &MFI = MF.getFrameInfo();
3854  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
3855  "Stack slot too small for store");
3856  if (RC->getID() == X86::TILERegClassID) {
3857  unsigned Opc = X86::TILESTORED;
3858  // tilestored %tmm, (%sp, %idx)
3859  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3860  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3861  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3862  MachineInstr *NewMI =
3863  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3864  .addReg(SrcReg, getKillRegState(isKill));
3865  MachineOperand &MO = NewMI->getOperand(2);
3866  MO.setReg(VirtReg);
3867  MO.setIsKill(true);
3868  } else {
3869  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3870  bool isAligned =
3871  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3872  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3873  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
3874  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3875  .addReg(SrcReg, getKillRegState(isKill));
3876  }
3877 }
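// Sketch: spilling a killed GR32 virtual register to frame index 0 with the
// non-tile path above yields (operands illustrative)
//
//   MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed %reg
//
// while TILE registers go through the TILESTORED sequence with the extra
// stride register built just before it.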
3878 
3879 void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
3880  MachineBasicBlock::iterator MI,
3881  Register DestReg, int FrameIdx,
3882  const TargetRegisterClass *RC,
3883  const TargetRegisterInfo *TRI) const {
3884  if (RC->getID() == X86::TILERegClassID) {
3885  unsigned Opc = X86::TILELOADD;
3886  // tileloadd (%sp, %idx), %tmm
3887  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3888  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3889  MachineInstr *NewMI =
3890  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3891  NewMI = addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3892  FrameIdx);
3893  MachineOperand &MO = NewMI->getOperand(3);
3894  MO.setReg(VirtReg);
3895  MO.setIsKill(true);
3896  } else {
3897  const MachineFunction &MF = *MBB.getParent();
3898  const MachineFrameInfo &MFI = MF.getFrameInfo();
3899  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3900  bool isAligned =
3901  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3902  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3903  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
3904  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3905  FrameIdx);
3906  }
3907 }
3908 
3909 bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
3910  Register &SrcReg2, int64_t &CmpMask,
3911  int64_t &CmpValue) const {
3912  switch (MI.getOpcode()) {
3913  default: break;
3914  case X86::CMP64ri32:
3915  case X86::CMP64ri8:
3916  case X86::CMP32ri:
3917  case X86::CMP32ri8:
3918  case X86::CMP16ri:
3919  case X86::CMP16ri8:
3920  case X86::CMP8ri:
3921  SrcReg = MI.getOperand(0).getReg();
3922  SrcReg2 = 0;
3923  if (MI.getOperand(1).isImm()) {
3924  CmpMask = ~0;
3925  CmpValue = MI.getOperand(1).getImm();
3926  } else {
3927  CmpMask = CmpValue = 0;
3928  }
3929  return true;
3930  // A SUB can be used to perform comparison.
3931  case X86::SUB64rm:
3932  case X86::SUB32rm:
3933  case X86::SUB16rm:
3934  case X86::SUB8rm:
3935  SrcReg = MI.getOperand(1).getReg();
3936  SrcReg2 = 0;
3937  CmpMask = 0;
3938  CmpValue = 0;
3939  return true;
3940  case X86::SUB64rr:
3941  case X86::SUB32rr:
3942  case X86::SUB16rr:
3943  case X86::SUB8rr:
3944  SrcReg = MI.getOperand(1).getReg();
3945  SrcReg2 = MI.getOperand(2).getReg();
3946  CmpMask = 0;
3947  CmpValue = 0;
3948  return true;
3949  case X86::SUB64ri32:
3950  case X86::SUB64ri8:
3951  case X86::SUB32ri:
3952  case X86::SUB32ri8:
3953  case X86::SUB16ri:
3954  case X86::SUB16ri8:
3955  case X86::SUB8ri:
3956  SrcReg = MI.getOperand(1).getReg();
3957  SrcReg2 = 0;
3958  if (MI.getOperand(2).isImm()) {
3959  CmpMask = ~0;
3960  CmpValue = MI.getOperand(2).getImm();
3961  } else {
3962  CmpMask = CmpValue = 0;
3963  }
3964  return true;
3965  case X86::CMP64rr:
3966  case X86::CMP32rr:
3967  case X86::CMP16rr:
3968  case X86::CMP8rr:
3969  SrcReg = MI.getOperand(0).getReg();
3970  SrcReg2 = MI.getOperand(1).getReg();
3971  CmpMask = 0;
3972  CmpValue = 0;
3973  return true;
3974  case X86::TEST8rr:
3975  case X86::TEST16rr:
3976  case X86::TEST32rr:
3977  case X86::TEST64rr:
3978  SrcReg = MI.getOperand(0).getReg();
3979  if (MI.getOperand(1).getReg() != SrcReg)
3980  return false;
3981  // Compare against zero.
3982  SrcReg2 = 0;
3983  CmpMask = ~0;
3984  CmpValue = 0;
3985  return true;
3986  }
3987  return false;
3988 }
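// Sketch of what analyzeCompare reports for two common cases (registers
// illustrative):
//
//   CMP32ri %eax, 42        -> SrcReg=%eax, SrcReg2=0, CmpMask=~0, CmpValue=42
//   %d = SUB64rr %rdi, %rsi -> SrcReg=%rdi, SrcReg2=%rsi, CmpMask=0, CmpValue=0
//
// A zero CmpMask means the comparison value is not a usable immediate.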
3989 
3990 bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
3991  Register SrcReg, Register SrcReg2,
3992  int64_t ImmMask, int64_t ImmValue,
3993  const MachineInstr &OI, bool *IsSwapped,
3994  int64_t *ImmDelta) const {
3995  switch (OI.getOpcode()) {
3996  case X86::CMP64rr:
3997  case X86::CMP32rr:
3998  case X86::CMP16rr:
3999  case X86::CMP8rr:
4000  case X86::SUB64rr:
4001  case X86::SUB32rr:
4002  case X86::SUB16rr:
4003  case X86::SUB8rr: {
4004  Register OISrcReg;
4005  Register OISrcReg2;
4006  int64_t OIMask;
4007  int64_t OIValue;
4008  if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4009  OIMask != ImmMask || OIValue != ImmValue)
4010  return false;
4011  if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4012  *IsSwapped = false;
4013  return true;
4014  }
4015  if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4016  *IsSwapped = true;
4017  return true;
4018  }
4019  return false;
4020  }
4021  case X86::CMP64ri32:
4022  case X86::CMP64ri8:
4023  case X86::CMP32ri:
4024  case X86::CMP32ri8:
4025  case X86::CMP16ri:
4026  case X86::CMP16ri8:
4027  case X86::CMP8ri:
4028  case X86::SUB64ri32:
4029  case X86::SUB64ri8:
4030  case X86::SUB32ri:
4031  case X86::SUB32ri8:
4032  case X86::SUB16ri:
4033  case X86::SUB16ri8:
4034  case X86::SUB8ri:
4035  case X86::TEST64rr:
4036  case X86::TEST32rr:
4037  case X86::TEST16rr:
4038  case X86::TEST8rr: {
4039  if (ImmMask != 0) {
4040  Register OISrcReg;
4041  Register OISrcReg2;
4042  int64_t OIMask;
4043  int64_t OIValue;
4044  if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4045  SrcReg == OISrcReg && ImmMask == OIMask) {
4046  if (OIValue == ImmValue) {
4047  *ImmDelta = 0;
4048  return true;
4049  } else if (static_cast<uint64_t>(ImmValue) ==
4050  static_cast<uint64_t>(OIValue) - 1) {
4051  *ImmDelta = -1;
4052  return true;
4053  } else if (static_cast<uint64_t>(ImmValue) ==
4054  static_cast<uint64_t>(OIValue) + 1) {
4055  *ImmDelta = 1;
4056  return true;
4057  } else {
4058  return false;
4059  }
4060  }
4061  }
4062  return FlagI.isIdenticalTo(OI);
4063  }
4064  default:
4065  return false;
4066  }
4067 }
4068 
4069 /// Check whether the definition can be converted
4070 /// to remove a comparison against zero.
4071 inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4072  bool &ClearsOverflowFlag) {
4073  NoSignFlag = false;
4074  ClearsOverflowFlag = false;
4075 
4076  switch (MI.getOpcode()) {
4077  default: return false;
4078 
4079  // The shift instructions only modify ZF if their shift count is non-zero.
4080  // N.B.: The processor truncates the shift count depending on the encoding.
4081  case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
4082  case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
4083  return getTruncatedShiftCount(MI, 2) != 0;
4084 
4085  // Some left shift instructions can be turned into LEA instructions but only
4086  // if their flags aren't used. Avoid transforming such instructions.
4087  case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
4088  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
4089  if (isTruncatedShiftCountForLEA(ShAmt)) return false;
4090  return ShAmt != 0;
4091  }
4092 
4093  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
4094  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
4095  return getTruncatedShiftCount(MI, 3) != 0;
4096 
4097  case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
4098  case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
4099  case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
4100  case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
4101  case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
4102  case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
4103  case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
4104  case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
4105  case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
4106  case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
4107  case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
4108  case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
4109  case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
4110  case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8:
4111  case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr:
4112  case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm:
4113  case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm:
4114  case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri:
4115  case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8:
4116  case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr:
4117  case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm:
4118  case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm:
4119  case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
4120  case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
4121  case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
4122  case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
4123  case X86::LZCNT16rr: case X86::LZCNT16rm:
4124  case X86::LZCNT32rr: case X86::LZCNT32rm:
4125  case X86::LZCNT64rr: case X86::LZCNT64rm:
4126  case X86::POPCNT16rr:case X86::POPCNT16rm:
4127  case X86::POPCNT32rr:case X86::POPCNT32rm:
4128  case X86::POPCNT64rr:case X86::POPCNT64rm:
4129  case X86::TZCNT16rr: case X86::TZCNT16rm:
4130  case X86::TZCNT32rr: case X86::TZCNT32rm:
4131  case X86::TZCNT64rr: case X86::TZCNT64rm:
4132  return true;
4133  case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
4134  case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
4135  case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
4136  case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
4137  case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
4138  case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
4139  case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
4140  case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
4141  case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
4142  case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
4143  case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
4144  case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
4145  case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
4146  case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
4147  case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
4148  case X86::ANDN32rr: case X86::ANDN32rm:
4149  case X86::ANDN64rr: case X86::ANDN64rm:
4150  case X86::BLSI32rr: case X86::BLSI32rm:
4151  case X86::BLSI64rr: case X86::BLSI64rm:
4152  case X86::BLSMSK32rr: case X86::BLSMSK32rm:
4153  case X86::BLSMSK64rr: case X86::BLSMSK64rm:
4154  case X86::BLSR32rr: case X86::BLSR32rm:
4155  case X86::BLSR64rr: case X86::BLSR64rm:
4156  case X86::BLCFILL32rr: case X86::BLCFILL32rm:
4157  case X86::BLCFILL64rr: case X86::BLCFILL64rm:
4158  case X86::BLCI32rr: case X86::BLCI32rm:
4159  case X86::BLCI64rr: case X86::BLCI64rm:
4160  case X86::BLCIC32rr: case X86::BLCIC32rm:
4161  case X86::BLCIC64rr: case X86::BLCIC64rm:
4162  case X86::BLCMSK32rr: case X86::BLCMSK32rm:
4163  case X86::BLCMSK64rr: case X86::BLCMSK64rm:
4164  case X86::BLCS32rr: case X86::BLCS32rm:
4165  case X86::BLCS64rr: case X86::BLCS64rm:
4166  case X86::BLSFILL32rr: case X86::BLSFILL32rm:
4167  case X86::BLSFILL64rr: case X86::BLSFILL64rm:
4168  case X86::BLSIC32rr: case X86::BLSIC32rm:
4169  case X86::BLSIC64rr: case X86::BLSIC64rm:
4170  case X86::BZHI32rr: case X86::BZHI32rm:
4171  case X86::BZHI64rr: case X86::BZHI64rm:
4172  case X86::T1MSKC32rr: case X86::T1MSKC32rm:
4173  case X86::T1MSKC64rr: case X86::T1MSKC64rm:
4174  case X86::TZMSK32rr: case X86::TZMSK32rm:
4175  case X86::TZMSK64rr: case X86::TZMSK64rm:
4176  // These instructions clear the overflow flag just like TEST.
4177  // FIXME: These are not the only instructions in this switch that clear the
4178  // overflow flag.
4179  ClearsOverflowFlag = true;
4180  return true;
4181  case X86::BEXTR32rr: case X86::BEXTR64rr:
4182  case X86::BEXTR32rm: case X86::BEXTR64rm:
4183  case X86::BEXTRI32ri: case X86::BEXTRI32mi:
4184  case X86::BEXTRI64ri: case X86::BEXTRI64mi:
4185  // BEXTR doesn't update the sign flag so we can't use it. It does clear
4186  // the overflow flag, but that's not useful without the sign flag.
4187  NoSignFlag = true;
4188  return true;
4189  }
4190 }
4191 
4192 /// Check whether the use can be converted to remove a comparison against zero.
4193 static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
4194  switch (MI.getOpcode()) {
4195  default: return X86::COND_INVALID;
4196  case X86::NEG8r:
4197  case X86::NEG16r:
4198  case X86::NEG32r:
4199  case X86::NEG64r:
4200  return X86::COND_AE;
4201  case X86::LZCNT16rr:
4202  case X86::LZCNT32rr:
4203  case X86::LZCNT64rr:
4204  return X86::COND_B;
4205  case X86::POPCNT16rr:
4206  case X86::POPCNT32rr:
4207  case X86::POPCNT64rr:
4208  return X86::COND_E;
4209  case X86::TZCNT16rr:
4210  case X86::TZCNT32rr:
4211  case X86::TZCNT64rr:
4212  return X86::COND_B;
4213  case X86::BSF16rr:
4214  case X86::BSF32rr:
4215  case X86::BSF64rr:
4216  case X86::BSR16rr:
4217  case X86::BSR32rr:
4218  case X86::BSR64rr:
4219  return X86::COND_E;
4220  case X86::BLSI32rr:
4221  case X86::BLSI64rr:
4222  return X86::COND_AE;
4223  case X86::BLSR32rr:
4224  case X86::BLSR64rr:
4225  case X86::BLSMSK32rr:
4226  case X86::BLSMSK64rr:
4227  return X86::COND_B;
4228  // TODO: TBM instructions.
4229  }
4230 }
4231 
4232 /// Check if there exists an earlier instruction that
4233 /// operates on the same source operands and sets flags in the same way as
4234 /// Compare; remove Compare if possible.
4235 bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
4236  Register SrcReg2, int64_t CmpMask,
4237  int64_t CmpValue,
4238  const MachineRegisterInfo *MRI) const {
4239  // Check whether we can replace SUB with CMP.
4240  switch (CmpInstr.getOpcode()) {
4241  default: break;
4242  case X86::SUB64ri32:
4243  case X86::SUB64ri8:
4244  case X86::SUB32ri:
4245  case X86::SUB32ri8:
4246  case X86::SUB16ri:
4247  case X86::SUB16ri8:
4248  case X86::SUB8ri:
4249  case X86::SUB64rm:
4250  case X86::SUB32rm:
4251  case X86::SUB16rm:
4252  case X86::SUB8rm:
4253  case X86::SUB64rr:
4254  case X86::SUB32rr:
4255  case X86::SUB16rr:
4256  case X86::SUB8rr: {
4257  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
4258  return false;
4259  // There is no use of the destination register, so we can replace SUB with CMP.
4260  unsigned NewOpcode = 0;
4261  switch (CmpInstr.getOpcode()) {
4262  default: llvm_unreachable("Unreachable!");
4263  case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
4264  case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
4265  case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
4266  case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
4267  case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
4268  case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
4269  case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
4270  case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
4271  case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
4272  case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
4273  case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
4274  case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
4275  case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
4276  case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
4277  case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
4278  }
4279  CmpInstr.setDesc(get(NewOpcode));
4280  CmpInstr.removeOperand(0);
4281  // Mutating this instruction invalidates any debug data associated with it.
4282  CmpInstr.dropDebugNumber();
4283  // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
4284  if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
4285  NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
4286  return false;
4287  }
4288  }
4289 
4290  // The following code tries to remove the comparison by re-using EFLAGS
4291  // from earlier instructions.
4292 
4293  bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
4294 
4295  // Transformation currently requires SSA values.
4296  if (SrcReg2.isPhysical())
4297  return false;
4298  MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
4299  assert(SrcRegDef && "Must have a definition (SSA)");
4300 
4301  MachineInstr *MI = nullptr;
4302  MachineInstr *Sub = nullptr;
4303  MachineInstr *Movr0Inst = nullptr;
4304  bool NoSignFlag = false;
4305  bool ClearsOverflowFlag = false;
4306  bool ShouldUpdateCC = false;
4307  bool IsSwapped = false;
4308  X86::CondCode NewCC = X86::COND_INVALID;
4309  int64_t ImmDelta = 0;
4310 
4311  // Search backward from CmpInstr for the next instruction defining EFLAGS.
4312  const TargetRegisterInfo *TRI = &getRegisterInfo();
4313  MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
4314  MachineBasicBlock::reverse_iterator From =
4315  std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
4316  for (MachineBasicBlock *MBB = &CmpMBB;;) {
4317  for (MachineInstr &Inst : make_range(From, MBB->rend())) {
4318  // Try to use EFLAGS from the instruction defining %SrcReg. Example:
4319  // %eax = addl ...
4320  // ... // EFLAGS not changed
4321  // testl %eax, %eax // <-- can be removed
4322  if (&Inst == SrcRegDef) {
4323  if (IsCmpZero &&
4324  isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
4325  MI = &Inst;
4326  break;
4327  }
4328 
4329  // Look back for the following pattern, in which case the test64rr
4330  // instruction could be erased.
4331  //
4332  // Example:
4333  // %reg = and32ri %in_reg, 5
4334  // ... // EFLAGS not changed.
4335  // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index
4336  // test64rr %src_reg, %src_reg, implicit-def $eflags
4337  MachineInstr *AndInstr = nullptr;
4338  if (IsCmpZero &&
4339  findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
4340  NoSignFlag, ClearsOverflowFlag)) {
4341  assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
4342  MI = AndInstr;
4343  break;
4344  }
4345  // Cannot find other candidates before definition of SrcReg.
4346  return false;
4347  }
4348 
4349  if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
4350  // Try to use EFLAGS produced by an instruction reading %SrcReg.
4351  // Example:
4352  // %eax = ...
4353  // ...
4354  // popcntl %eax
4355  // ... // EFLAGS not changed
4356  // testl %eax, %eax // <-- can be removed
4357  if (IsCmpZero) {
4358  NewCC = isUseDefConvertible(Inst);
4359  if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() &&
4360  Inst.getOperand(1).getReg() == SrcReg) {
4361  ShouldUpdateCC = true;
4362  MI = &Inst;
4363  break;
4364  }
4365  }
4366 
4367  // Try to use EFLAGS from an instruction with similar flag results.
4368  // Example:
4369  // sub x, y or cmp x, y
4370  // ... // EFLAGS not changed
4371  // cmp x, y // <-- can be removed
4372  if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
4373  Inst, &IsSwapped, &ImmDelta)) {
4374  Sub = &Inst;
4375  break;
4376  }
4377 
4378  // MOV32r0 is implemented with xor which clobbers condition code. It is
4379  // safe to move up if the definition of EFLAGS is dead and earlier
4380  // instructions do not read or write EFLAGS.
4381  if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
4382  Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
4383  Movr0Inst = &Inst;
4384  continue;
4385  }
4386 
4387  // Cannot do anything for any other EFLAG changes.
4388  return false;
4389  }
4390  }
4391 
4392  if (MI || Sub)
4393  break;
4394 
4395  // Reached begin of basic block. Continue in predecessor if there is
4396  // exactly one.
4397  if (MBB->pred_size() != 1)
4398  return false;
4399  MBB = *MBB->pred_begin();
4400  From = MBB->rbegin();
4401  }
4402 
4403  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
4404  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
4405  // If we are done with the basic block, we need to check whether EFLAGS is
4406  // live-out.
4407  bool FlagsMayLiveOut = true;
4408  SmallVector<std::pair<MachineInstr *, X86::CondCode>, 4> OpsToUpdate;
4409  MachineBasicBlock::iterator AfterCmpInstr =
4410  std::next(MachineBasicBlock::iterator(CmpInstr));
4411  for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
4412  bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
4413  bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
4414  // We should check the usage if this instruction uses and updates EFLAGS.
4415  if (!UseEFLAGS && ModifyEFLAGS) {
4416  // It is safe to remove CmpInstr if EFLAGS is updated again.
4417  FlagsMayLiveOut = false;
4418  break;
4419  }
4420  if (!UseEFLAGS && !ModifyEFLAGS)
4421  continue;
4422 
4423  // EFLAGS is used by this instruction.
4424  X86::CondCode OldCC = X86::COND_INVALID;
4425  if (MI || IsSwapped || ImmDelta != 0) {
4426  // We decode the condition code from opcode.
4427  if (Instr.isBranch())
4428  OldCC = X86::getCondFromBranch(Instr);
4429  else {
4430  OldCC = X86::getCondFromSETCC(Instr);
4431  if (OldCC == X86::COND_INVALID)
4432  OldCC = X86::getCondFromCMov(Instr);
4433  }
4434  if (OldCC == X86::COND_INVALID) return false;
4435  }
4436  X86::CondCode ReplacementCC = X86::COND_INVALID;
4437  if (MI) {
4438  switch (OldCC) {
4439  default: break;
4440  case X86::COND_A: case X86::COND_AE:
4441  case X86::COND_B: case X86::COND_BE:
4442  // CF is used, we can't perform this optimization.
4443  return false;
4444  case X86::COND_G: case X86::COND_GE:
4445  case X86::COND_L: case X86::COND_LE:
4446  case X86::COND_O: case X86::COND_NO:
4447  // If OF is used, the instruction needs to clear it like CmpZero does.
4448  if (!ClearsOverflowFlag)
4449  return false;
4450  break;
4451  case X86::COND_S: case X86::COND_NS:
4452  // If SF is used, but the instruction doesn't update the SF, then we
4453  // can't do the optimization.
4454  if (NoSignFlag)
4455  return false;
4456  break;
4457  }
4458 
4459  // If we're updating the condition code check if we have to reverse the
4460  // condition.
4461  if (ShouldUpdateCC)
4462  switch (OldCC) {
4463  default:
4464  return false;
4465  case X86::COND_E:
4466  ReplacementCC = NewCC;
4467  break;
4468  case X86::COND_NE:
4469  ReplacementCC = GetOppositeBranchCondition(NewCC);
4470  break;
4471  }
4472  } else if (IsSwapped) {
4473  // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
4474  // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
4475  // We swap the condition code and synthesize the new opcode.
4476  ReplacementCC = getSwappedCondition(OldCC);
4477  if (ReplacementCC == X86::COND_INVALID)
4478  return false;
4479  ShouldUpdateCC = true;
4480  } else if (ImmDelta != 0) {
4481  unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
4482  // Shift amount for min/max constants to adjust for 8/16/32 instruction
4483  // sizes.
4484  switch (OldCC) {
4485  case X86::COND_L: // x <s (C + 1) --> x <=s C
4486  if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
4487  return false;
4488  ReplacementCC = X86::COND_LE;
4489  break;
4490  case X86::COND_B: // x <u (C + 1) --> x <=u C
4491  if (ImmDelta != 1 || CmpValue == 0)
4492  return false;
4493  ReplacementCC = X86::COND_BE;
4494  break;
4495  case X86::COND_GE: // x >=s (C + 1) --> x >s C
4496  if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
4497  return false;
4498  ReplacementCC = X86::COND_G;
4499  break;
4500  case X86::COND_AE: // x >=u (C + 1) --> x >u C
4501  if (ImmDelta != 1 || CmpValue == 0)
4502  return false;
4503  ReplacementCC = X86::COND_A;
4504  break;
4505  case X86::COND_G: // x >s (C - 1) --> x >=s C
4506  if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
4507  return false;
4508  ReplacementCC = X86::COND_GE;
4509  break;
4510  case X86::COND_A: // x >u (C - 1) --> x >=u C
4511  if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
4512  return false;
4513  ReplacementCC = X86::COND_AE;
4514  break;
4515  case X86::COND_LE: // x <=s (C - 1) --> x <s C
4516  if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
4517  return false;
4518  ReplacementCC = X86::COND_L;
4519  break;
4520  case X86::COND_BE: // x <=u (C - 1) --> x <u C
4521  if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
4522  return false;
4523  ReplacementCC = X86::COND_B;
4524  break;
4525  default:
4526  return false;
4527  }
4528  ShouldUpdateCC = true;
4529  }
4530 
4531  if (ShouldUpdateCC && ReplacementCC != OldCC) {
4532  // Push the MachineInstr to OpsToUpdate.
4533  // If it is safe to remove CmpInstr, the condition code of these
4534  // instructions will be modified.
4535  OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
4536  }
4537  if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
4538  // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
4539  FlagsMayLiveOut = false;
4540  break;
4541  }
4542  }
4543 
4544  // If we have to update users but EFLAGS is live-out abort, since we cannot
4545  // easily find all of the users.
4546  if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
4547  for (MachineBasicBlock *Successor : CmpMBB.successors())
4548  if (Successor->isLiveIn(X86::EFLAGS))
4549  return false;
4550  }
4551 
4552  // The instruction to be updated is either Sub or MI.
4553  assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
4554  Sub = MI != nullptr ? MI : Sub;
4555  MachineBasicBlock *SubBB = Sub->getParent();
4556  // Move Movr0Inst to the appropriate place before Sub.
4557  if (Movr0Inst) {
4558  // Only move within the same block so we don't accidentally move to a
4559  // block with higher execution frequency.
4560  if (&CmpMBB != SubBB)
4561  return false;
4562  // Look backwards until we find a def that doesn't use the current EFLAGS.
4563  MachineBasicBlock::reverse_iterator InsertI = Sub,
4564  InsertE = Sub->getParent()->rend();
4565  for (; InsertI != InsertE; ++InsertI) {
4566  MachineInstr *Instr = &*InsertI;
4567  if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
4568  Instr->modifiesRegister(X86::EFLAGS, TRI)) {
4569  Movr0Inst->getParent()->remove(Movr0Inst);
4570  Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
4571  Movr0Inst);
4572  break;
4573  }
4574  }
4575  if (InsertI == InsertE)
4576  return false;
4577  }
4578 
4579  // Make sure Sub instruction defines EFLAGS and mark the def live.
4580  MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
4581  assert(FlagDef && "Unable to locate a def EFLAGS operand");
4582  FlagDef->setIsDead(false);
4583 
4584  CmpInstr.eraseFromParent();
4585 
4586  // Modify the condition code of instructions in OpsToUpdate.
4587  for (auto &Op : OpsToUpdate) {
4588  Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
4589  .setImm(Op.second);
4590  }
4591  // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
4592  for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
4593  MBB = *MBB->pred_begin()) {
4594  assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
4595  if (!MBB->isLiveIn(X86::EFLAGS))
4596  MBB->addLiveIn(X86::EFLAGS);
4597  }
4598  return true;
4599 }
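// Sketch of the EFLAGS re-use performed above, assuming nothing clobbers the
// flags in between (registers illustrative):
//
//   %d = SUB32rr %a, %b                %d = SUB32rr %a, %b
//   ...                        -->     ...
//   CMP32rr %a, %b                     ; CMP deleted, JCC keeps reading the
//   JCC_1 %bb.1, X86::COND_L           ; EFLAGS produced by the SUB
//
// A TEST against zero of a value just produced by a flag-setting instruction
// (see isDefConvertible above) is removed in the same way.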
4600 
4601 /// Try to remove the load by folding it to a register
4602 /// operand at the use. We fold the load instructions if load defines a virtual
4603 /// register, the virtual register is used once in the same BB, and the
4604 /// instructions in-between do not load or store, and have no side effects.
4605 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
4606  const MachineRegisterInfo *MRI,
4607  Register &FoldAsLoadDefReg,
4608  MachineInstr *&DefMI) const {
4609  // Check whether we can move DefMI here.
4610  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
4611  assert(DefMI);
4612  bool SawStore = false;
4613  if (!DefMI->isSafeToMove(nullptr, SawStore))
4614  return nullptr;
4615 
4616  // Collect information about virtual register operands of MI.
4617  SmallVector<unsigned, 1> SrcOperandIds;
4618  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4619  MachineOperand &MO = MI.getOperand(i);
4620  if (!MO.isReg())
4621  continue;
4622  Register Reg = MO.getReg();
4623  if (Reg != FoldAsLoadDefReg)
4624  continue;
4625  // Do not fold if we have a subreg use or a def.
4626  if (MO.getSubReg() || MO.isDef())
4627  return nullptr;
4628  SrcOperandIds.push_back(i);
4629  }
4630  if (SrcOperandIds.empty())
4631  return nullptr;
4632 
4633  // Check whether we can fold the def into SrcOperandId.
4634  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
4635  FoldAsLoadDefReg = 0;
4636  return FoldMI;
4637  }
4638 
4639  return nullptr;
4640 }
4641 
4642 /// Expand a single-def pseudo instruction to a two-addr
4643 /// instruction with two undef reads of the register being defined.
4644 /// This is used for mapping:
4645 /// %xmm4 = V_SET0
4646 /// to:
4647 /// %xmm4 = PXORrr undef %xmm4, undef %xmm4
4648 ///
4649 static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
4650  const MCInstrDesc &Desc) {
4651  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4652  Register Reg = MIB.getReg(0);
4653  MIB->setDesc(Desc);
4654 
4655  // MachineInstr::addOperand() will insert explicit operands before any
4656  // implicit operands.
4657  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4658  // But we don't trust that.
4659  assert(MIB.getReg(1) == Reg &&
4660  MIB.getReg(2) == Reg && "Misplaced operand");
4661  return true;
4662 }
4663 
4664 /// Expand a single-def pseudo instruction to a two-addr
4665 /// instruction with two %k0 reads.
4666 /// This is used for mapping:
4667 /// %k4 = K_SET1
4668 /// to:
4669 /// %k4 = KXNORrr %k0, %k0
4670 static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
4671  Register Reg) {
4672  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4673  MIB->setDesc(Desc);
4674  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4675  return true;
4676 }
4677 
4678 static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
4679  bool MinusOne) {
4680  MachineBasicBlock &MBB = *MIB->getParent();
4681  const DebugLoc &DL = MIB->getDebugLoc();
4682  Register Reg = MIB.getReg(0);
4683 
4684  // Insert the XOR.
4685  BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
4686  .addReg(Reg, RegState::Undef)
4687  .addReg(Reg, RegState::Undef);
4688 
4689  // Turn the pseudo into an INC or DEC.
4690  MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
4691  MIB.addReg(Reg);
4692 
4693  return true;
4694 }
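// Sketch: the MOV32r1 / MOV32r_1 pseudos expand to two instructions, e.g.
// for $eax:
//
//   $eax = XOR32rr undef $eax, undef $eax, implicit-def $eflags
//   $eax = INC32r $eax, implicit-def $eflags     ; DEC32r for the -1 pseudo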
4695 
4696 static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
4697  const TargetInstrInfo &TII,
4698  const X86Subtarget &Subtarget) {
4699  MachineBasicBlock &MBB = *MIB->getParent();
4700  const DebugLoc &DL = MIB->getDebugLoc();
4701  int64_t Imm = MIB->getOperand(1).getImm();
4702  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
4703  MachineBasicBlock::iterator I = MIB.getInstr();
4704 
4705  int StackAdjustment;
4706 
4707  if (Subtarget.is64Bit()) {
4708  assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
4709  MIB->getOpcode() == X86::MOV32ImmSExti8);
4710 
4711  // Can't use push/pop lowering if the function might write to the red zone.
4712  X86MachineFunctionInfo *X86FI =
4713  MBB.getParent()->getInfo<X86MachineFunctionInfo>();
4714  if (X86FI->getUsesRedZone()) {
4715  MIB->setDesc(TII.get(MIB->getOpcode() ==
4716  X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
4717  return true;
4718  }
4719 
4720  // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
4721  // widen the register if necessary.
4722  StackAdjustment = 8;
4723  BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
4724  MIB->setDesc(TII.get(X86::POP64r));
4725  MIB->getOperand(0)
4726  .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
4727  } else {
4728  assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
4729  StackAdjustment = 4;
4730  BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
4731  MIB->setDesc(TII.get(X86::POP32r));
4732  }
4733  MIB->removeOperand(1);
4734  MIB->addImplicitDefUseOperands(*MBB.getParent());
4735 
4736  // Build CFI if necessary.
4737  MachineFunction &MF = *MBB.getParent();
4738  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
4739  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
4740  bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
4741  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
4742  if (EmitCFI) {
4743  TFL->BuildCFI(MBB, I, DL,
4744  MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
4745  TFL->BuildCFI(MBB, std::next(I), DL,
4746  MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
4747  }
4748 
4749  return true;
4750 }
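// Sketch of the 64-bit push/pop lowering above when the red zone is unused
// (destination register illustrative):
//
//   $rbx = MOV64ImmSExti8 -1    -->    PUSH64i8 -1
//                                      $rbx = POP64r
//
// which is shorter than the equivalent MOV64ri; CFI offset adjustments
// bracket the pair when DWARF frame moves are required without a frame
// pointer.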
4751 
4752 // LoadStackGuard has so far only been implemented for 64-bit MachO. Different
4753 // code sequence is needed for other targets.
4754 static void expandLoadStackGuard(MachineInstrBuilder &MIB,
4755  const TargetInstrInfo &TII) {
4756  MachineBasicBlock &MBB = *MIB->getParent();
4757  const DebugLoc &DL = MIB->getDebugLoc();
4758  Register Reg = MIB.getReg(0);
4759  const GlobalValue *GV =
4760  cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
4761  auto Flags = MachineMemOperand::MOLoad |
4762  MachineMemOperand::MODereferenceable |
4763  MachineMemOperand::MOInvariant;
4764  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
4765  MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
4766  MachineBasicBlock::iterator I = MIB.getInstr();
4767 
4768  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
4769  .addReg(0).addGlobalAddress(GV, 0, X86II::MO_NO_FLAG).addReg(0)
4770  .addMemOperand(MMO);
4771  MIB->setDebugLoc(DL);
4772  MIB->setDesc(TII.get(X86::MOV64rm));
4773  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
4774 }
4775 
4776 static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
4777  MachineBasicBlock &MBB = *MIB->getParent();
4778  MachineFunction &MF = *MBB.getParent();
4779  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
4780  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4781  unsigned XorOp =
4782  MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
4783  MIB->setDesc(TII.get(XorOp));
4784  MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
4785  return true;
4786 }
4787 
4788 // This is used to handle spills for 128/256-bit registers when we have AVX512,
4789 // but not VLX. If it uses an extended register we need to use an instruction
4790 // that loads the lower 128/256-bit, but is available with only AVX512F.
4791 static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
4792  const TargetRegisterInfo *TRI,
4793  const MCInstrDesc &LoadDesc,
4794  const MCInstrDesc &BroadcastDesc,
4795  unsigned SubIdx) {
4796  Register DestReg = MIB.getReg(0);
4797  // Check if DestReg is XMM16-31 or YMM16-31.
4798  if (TRI->getEncodingValue(DestReg) < 16) {
4799  // We can use a normal VEX encoded load.
4800  MIB->setDesc(LoadDesc);
4801  } else {
4802  // Use a 128/256-bit VBROADCAST instruction.
4803  MIB->setDesc(BroadcastDesc);
4804  // Change the destination to a 512-bit register.
4805  DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
4806  MIB->getOperand(0).setReg(DestReg);
4807  }
4808  return true;
4809 }
4810 
4811 // This is used to handle spills for 128/256-bit registers when we have AVX512,
4812 // but not VLX. If it uses an extended register we need to use an instruction
4813 // that stores the lower 128/256-bit, but is available with only AVX512F.
4814 static bool expandNOVLXStore(MachineInstrBuilder &MIB,
4815  const TargetRegisterInfo *TRI,
4816  const MCInstrDesc &StoreDesc,
4817  const MCInstrDesc &ExtractDesc,
4818  unsigned SubIdx) {
4819  Register SrcReg = MIB.getReg(X86::AddrNumOperands);
4820  // Check if SrcReg is XMM16-31 or YMM16-31.
4821  if (TRI->getEncodingValue(SrcReg) < 16) {
4822  // We can use a normal VEX encoded store.
4823  MIB->setDesc(StoreDesc);
4824  } else {
4825  // Use a VEXTRACTF instruction.
4826  MIB->setDesc(ExtractDesc);
4827  // Change the source to a 512-bit register.
4828  SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
4829  MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
4830  MIB.addImm(0x0); // Append immediate to extract from the lower bits.
4831  }
4832 
4833  return true;
4834 }
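// Mirror example for the store path (operands illustrative): without VLX,
//   VMOVAPSZ128mr_NOVLX <addr>, %xmm16
// becomes
//   VEXTRACTF32x4Zmr <addr>, %zmm16, 0
// i.e. the low 128 bits of the widened zmm source are extracted to memory.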
4835 
4836 static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
4837  MIB->setDesc(Desc);
4838  int64_t ShiftAmt = MIB->getOperand(2).getImm();
4839  // Temporarily remove the immediate so we can add another source register.
4840  MIB->removeOperand(2);
4841  // Add the register. Don't copy the kill flag if there is one.
4842  MIB.addReg(MIB.getReg(1),
4843  getUndefRegState(MIB->getOperand(1).isUndef()));
4844  // Add back the immediate.
4845  MIB.addImm(ShiftAmt);
4846  return true;
4847 }
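// A rotate-by-immediate pseudo becomes a double shift whose two data operands
// are the same register, e.g. (register chosen for illustration)
//   %eax = SHLDROT32ri %eax, 5   ==>   %eax = SHLD32rri8 %eax, %eax, 5
// which is a rotate left of %eax by 5 bits.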
4848 
4849 bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
4850  bool HasAVX = Subtarget.hasAVX();
4851  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
4852  switch (MI.getOpcode()) {
4853  case X86::MOV32r0:
4854  return Expand2AddrUndef(MIB, get(X86::XOR32rr));
4855  case X86::MOV32r1:
4856  return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
4857  case X86::MOV32r_1:
4858  return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
4859  case X86::MOV32ImmSExti8:
4860  case X86::MOV64ImmSExti8:
4861  return ExpandMOVImmSExti8(MIB, *this, Subtarget);
4862  case X86::SETB_C32r:
4863  return Expand2AddrUndef(MIB, get(X86::SBB32rr));
4864  case X86::SETB_C64r:
4865  return Expand2AddrUndef(MIB, get(X86::SBB64rr));
4866  case X86::MMX_SET0:
4867  return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
4868  case X86::V_SET0:
4869  case X86::FsFLD0SS:
4870  case X86::FsFLD0SD:
4871  case X86::FsFLD0F128:
4872  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
4873  case X86::AVX_SET0: {
4874  assert(HasAVX && "AVX not supported");
4875  const TargetRegisterInfo *TRI = &getRegisterInfo();
4876  Register SrcReg = MIB.getReg(0);
4877  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4878  MIB->getOperand(0).setReg(XReg);
4879  Expand2AddrUndef(MIB, get(X86::VXORPSrr));
4880  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4881  return true;
4882  }
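// Sketch of the AVX_SET0 rewrite above, with an arbitrary register:
//   %ymm3 = AVX_SET0  ==>  %xmm3 = VXORPSrr undef %xmm3, undef %xmm3,
//                          implicit-def %ymm3
// A VEX-encoded write to an xmm register zeroes the upper ymm bits, so
// zeroing the xmm half clears the whole ymm register.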
4883  case X86::AVX512_128_SET0:
4884  case X86::AVX512_FsFLD0SH:
4885  case X86::AVX512_FsFLD0SS:
4886  case X86::AVX512_FsFLD0SD:
4887  case X86::AVX512_FsFLD0F128: {
4888  bool HasVLX = Subtarget.hasVLX();
4889  Register SrcReg = MIB.getReg(0);
4890  const TargetRegisterInfo *TRI = &getRegisterInfo();
4891  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
4892  return Expand2AddrUndef(MIB,
4893  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4894  // Extended register without VLX. Use a larger XOR.
4895  SrcReg =
4896  TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4897  MIB->getOperand(0).setReg(SrcReg);
4898  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4899  }
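// Example of the no-VLX fallback above (register illustrative): zeroing an
// extended register such as %xmm17 has no encodable 128-bit XOR, so
//   %xmm17 = AVX512_128_SET0  ==>  %zmm17 = VPXORDZrr undef %zmm17, undef %zmm17
// while %xmm0-%xmm15 (or any register when VLX is present) use the narrow XOR.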
4900  case X86::AVX512_256_SET0:
4901  case X86::AVX512_512_SET0: {
4902  bool HasVLX = Subtarget.hasVLX();
4903  Register SrcReg = MIB.getReg(0);
4904  const TargetRegisterInfo *TRI = &getRegisterInfo();
4905  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
4906  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4907  MIB->getOperand(0).setReg(XReg);
4908  Expand2AddrUndef(MIB,
4909  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4910  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4911  return true;
4912  }
4913  if (MI.getOpcode() == X86::AVX512_256_SET0) {
4914  // No VLX so we must reference a zmm.
4915  unsigned ZReg =
4916  TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4917  MIB->getOperand(0).setReg(ZReg);
4918  }
4919  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4920  }
4921  case X86::V_SETALLONES:
4922  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
4923  case X86::AVX2_SETALLONES:
4924  return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
4925  case X86::AVX1_SETALLONES: {
4926  Register Reg = MIB.getReg(0);
4927  // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
4928  MIB->setDesc(get(X86::VCMPPSYrri));
4929  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
4930  return true;
4931  }
4932  case X86::AVX512_512_SETALLONES: {
4933  Register Reg = MIB.getReg(0);
4934  MIB->setDesc(get(X86::VPTERNLOGDZrri));
4935  // VPTERNLOGD needs 3 register inputs and an immediate.
4936  // 0xff will return 1s for any input.
4937  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
4938  .addReg(Reg, RegState::Undef).addImm(0xff);
4939  return true;
4940  }
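// VPTERNLOGD evaluates the 3-input boolean function selected by its immediate
// truth table, so 0xff produces all-ones bits regardless of the undef inputs;
// with an arbitrary register the rewrite above is:
//   %zmm0 = AVX512_512_SETALLONES
//     ==>  %zmm0 = VPTERNLOGDZrri undef %zmm0, undef %zmm0, undef %zmm0, 255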
4941  case X86::AVX512_512_SEXT_MASK_32:
4942  case X86::AVX512_512_SEXT_MASK_64: {
4943  Register Reg = MIB.getReg(0);
4944  Register MaskReg = MIB.getReg(1);
4945  unsigned MaskState = getRegState(MIB->getOperand(1));
4946  unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
4947  X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
4948  MI.removeOperand(1);
4949  MIB->setDesc(get(Opc));
4950  // VPTERNLOG needs 3 register inputs and an immediate.
4951  // 0xff will return 1s for any input.
4952  MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
4953  .addReg(Reg, RegState::Undef).addImm(0xff);
4954  return true;
4955  }
4956  case X86::VMOVAPSZ128rm_NOVLX:
4957  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
4958  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4959  case X86::VMOVUPSZ128rm_NOVLX:
4960  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
4961  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4962  case X86::VMOVAPSZ256rm_NOVLX:
4963  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
4964  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4965  case X86::VMOVUPSZ256rm_NOVLX:
4966  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
4967  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4968  case X86::VMOVAPSZ128mr_NOVLX:
4969  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
4970  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4971  case X86::VMOVUPSZ128mr_NOVLX:
4972  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
4973  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4974  case X86::VMOVAPSZ256mr_NOVLX:
4975  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
4976  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4977  case X86::VMOVUPSZ256mr_NOVLX:
4978  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
4979  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4980  case X86::MOV32ri64: {
4981  Register Reg = MIB.getReg(0);
4982  Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
4983  MI.setDesc(get(X86::MOV32ri));
4984  MIB->getOperand(0).setReg(Reg32);
4985  MIB.addReg(Reg, RegState::ImplicitDefine);
4986  return true;
4987  }
4988 
4989  // KNL does not recognize dependency-breaking idioms for mask registers,
4990  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
4991  // Using %k0 as the undef input register is a performance heuristic based
4992  // on the assumption that %k0 is used less frequently than the other mask
4993  // registers, since it is not usable as a write mask.
4994  // FIXME: A more advanced approach would be to choose the best input mask
4995  // register based on context.
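// For example, the Expand2AddrKreg calls below rewrite (mask register
// illustrative):
//   %k2 = KSET1W  ==>  %k2 = KXNORWrr undef %k0, undef %k0
// so the only register read left is the rarely written %k0.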
4996  case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
4997  case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
4998  case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
4999  case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
5000  case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
5001  case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
5002  case TargetOpcode::LOAD_STACK_GUARD:
5003  expandLoadStackGuard(MIB, *this);
5004  return true;
5005  case X86::XOR64_FP:
5006  case X86::XOR32_FP:
5007  return expandXorFP(MIB, *this);
5008  case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
5009  case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
5010  case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
5011  case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
5012  case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break;
5013  case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break;
5014  case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break;
5015  case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break;
5016  case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break;
5017  case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break;
5018  case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break;
5019  case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
5020  case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break;
5021  case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break;
5022  case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break;
5023  }
5024  return false;
5025 }
5026 
5027 /// Return true for all instructions that only update
5028 /// the first 32 or 64 bits of the destination register and leave the rest
5029 /// unmodified. This can be used to avoid folding loads if the instructions
5030 /// only update part of the destination register, and the non-updated part is
5031 /// not needed, e.g. cvtss2sd, sqrtss. Unfolding the load from these
5032 /// instructions breaks the partial register dependency and can improve
5033 /// performance, e.g.:
5034 ///
5035 /// movss (%rdi), %xmm0
5036 /// cvtss2sd %xmm0, %xmm0
5037 ///
5038 /// Instead of
5039 /// cvtss2sd (%rdi), %xmm0
5040 ///
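/// (The memory form of cvtss2sd merges into %xmm0 and therefore carries a
/// dependence on whatever wrote %xmm0 last, whereas movss from memory writes
/// the full register, so the unfolded sequence starts a fresh dependency
/// chain.)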
5041 /// FIXME: This should be turned into a TSFlags.
5042 ///
5043 static bool hasPartialRegUpdate(unsigned Opcode,
5044  const X86Subtarget &Subtarget,
5045  bool ForLoadFold = false) {
5046  switch (Opcode) {
5047  case X86::CVTSI2SSrr:
5048  case X86::CVTSI2SSrm:
5049  case X86::CVTSI642SSrr:
5050  case X86::CVTSI642SSrm:
5051  case X86::CVTSI2SDrr:
5052  case X86::CVTSI2SDrm:
5053  case X86::CVTSI642SDrr:
5054  case X86::CVTSI642SDrm:
5055  // Load folding won't affect the partial register update since the input is
5056  // a GPR.
5057  return !ForLoadFold;
5058  case X86::CVTSD2SSrr:
5059  case X86::CVTSD2SSrm:
5060  case X86::CVTSS2SDrr:
5061  case X86::CVTSS2SDrm:
5062  case X86::MOVHPDrm:
5063  case X86::MOVHPSrm:
5064  case X86::MOVLPDrm:
5065  case X86::MOVLPSrm:
5066  case X86::RCPSSr:
5067  case X86::RCPSSm:
5068  case X86::RCPSSr_Int:
5069  case X86::RCPSSm_Int:
5070  case X86::ROUNDSDr:
5071  case X86::ROUNDSDm:
5072  case X86::ROUNDSSr:
5073  case X86::ROUNDSSm:
5074  case X86::RSQRTSSr:
5075  case X86::RSQRTSSm:
5076  case X86::RSQRTSSr_Int:
5077  case X86::RSQRTSSm_Int:
5078  case X86::SQRTSSr:
5079  case X86::SQRTSSm:
5080  case X86::SQRTSSr_Int:
5081  case X86::SQRTSSm_Int:
5082  case X86::SQRTSDr:
5083  case X86::SQRTSDm:
5084  case X86::SQRTSDr_Int:
5085  case X86::SQRTSDm_Int:
5086  return true;
5087  case X86::VFCMULCPHZ128rm:
5088  case X86::VFCMULCPHZ128rmb:
5089  case X86::VFCMULCPHZ128rmbkz:
5090  case X86::VFCMULCPHZ128rmkz:
5091  case X86::VFCMULCPHZ128rr:
5092  case X86::VFCMULCPHZ128rrkz:
5093  case X86::VFCMULCPHZ256rm:
5094  case X86::VFCMULCPHZ256rmb:
5095  case X86::VFCMULCPHZ256rmbkz:
5096  case X86::VFCMULCPHZ256rmkz:
5097  case X86::VFCMULCPHZ256rr:
5098  case X86::VFCMULCPHZ256rrkz:
5099  case X86::VFCMULCPHZrm:
5100  case X86::VFCMULCPHZrmb:
5101  case X86::VFCMULCPHZrmbkz:
5102  case X86::VFCMULCPHZrmkz:
5103  case X86::VFCMULCPHZrr:
5104  case X86::VFCMULCPHZrrb:
5105  case X86::VFCMULCPHZrrbkz:
5106  case X86::VFCMULCPHZrrkz:
5107  case X86::VFMULCPHZ128rm:
5108  case X86::VFMULCPHZ128rmb:
5109  case X86::VFMULCPHZ128rmbkz:
5110  case X86::VFMULCPHZ128rmkz:
5111  case X86::VFMULCPHZ128rr:
5112  case X86::VFMULCPHZ128rrkz:
5113  case X86::VFMULCPHZ256rm:
5114  case X86::VFMULCPHZ256rmb:
5115  case X86::VFMULCPHZ256rmbkz:
5116  case X86::VFMULCPHZ256rmkz:
5117  case X86::VFMULCPHZ256rr:
5118  case X86::VFMULCPHZ256rrkz:
5119  case X86::VFMULCPHZrm:
5120  case X86::VFMULCPHZrmb:
5121  case X86::VFMULCPHZrmbkz:
5122  case X86::VFMULCPHZrmkz:
5123  case X86::VFMULCPHZrr:
5124  case X86::VFMULCPHZrrb:
5125  case X86::VFMULCPHZrrbkz:
5126  case X86::VFMULCPHZrrkz:
5127  case X86::VFCMULCSHZrm:
5128  case X86::VFCMULCSHZrmkz:
5129  case X86::VFCMULCSHZrr:
5130  case X86::VFCMULCSHZrrb:
5131  case X86::VFCMULCSHZrrbkz:
5132  case X86::VFCMULCSHZrrkz:
5133  case X86::VFMULCSHZrm:
5134  case X86::VFMULCSHZrmkz:
5135  case X86::VFMULCSHZrr:
5136  case X86::VFMULCSHZrrb:
5137  case X86::VFMULCSHZrrbkz:
5138  case X86::VFMULCSHZrrkz:
5139  return Subtarget.hasMULCFalseDeps();
5140  case X86::VPERMDYrm:
5141  case X86::VPERMDYrr:
5142  case X86::VPERMQYmi:
5143  case X86::VPERMQYri:
5144  case X86::VPERMPSYrm:
5145  case X86::VPERMPSYrr:
5146  case X86::VPERMPDYmi:
5147  case X86::VPERMPDYri:
5148  case X86::VPERMDZ256rm:
5149  case X86::VPERMDZ256rmb:
5150  case X86::VPERMDZ256rmbkz:
5151  case X86::VPERMDZ256rmkz:
5152  case X86::VPERMDZ256rr:
5153  case X86::VPERMDZ256rrkz:
5154  case X86::VPERMDZrm:
5155  case X86::VPERMDZrmb:
5156  case X86::VPERMDZrmbkz:
5157  case X86::VPERMDZrmkz:
5158  case X86::VPERMDZrr:
5159  case X86::VPERMDZrrkz:
5160  case X86::VPERMQZ256mbi:
5161  case X86::VPERMQZ256mbikz:
5162  case X86::VPERMQZ256mi:
5163  case X86::VPERMQZ256mikz:
5164  case X86::VPERMQZ256ri:
5165  case X86::VPERMQZ256rikz:
5166  case X86::VPERMQZ256rm:
5167  case X86::VPERMQZ256rmb:
5168  case X86::VPERMQZ256rmbkz:
5169  case X86::VPERMQZ256rmkz:
5170  case X86::VPERMQZ256rr:
5171  case X86::VPERMQZ256rrkz:
5172  case X86::VPERMQZmbi:
5173  case X86::VPERMQZmbikz:
5174  case X86::VPERMQZmi:
5175  case X86::VPERMQZmikz:
5176  case X86::VPERMQZri:
5177  case X86::VPERMQZrikz:
5178  case X86::VPERMQZrm:
5179  case X86::VPERMQZrmb:
5180  case X86::VPERMQZrmbkz: