1 //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the X86 implementation of the TargetInstrInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "X86InstrInfo.h"
14 #include "X86.h"
15 #include "X86InstrBuilder.h"
16 #include "X86InstrFoldTables.h"
17 #include "X86MachineFunctionInfo.h"
18 #include "X86Subtarget.h"
19 #include "X86TargetMachine.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/Sequence.h"
33 #include "llvm/CodeGen/StackMaps.h"
35 #include "llvm/IR/DerivedTypes.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/MC/MCAsmInfo.h"
39 #include "llvm/MC/MCExpr.h"
40 #include "llvm/MC/MCInst.h"
42 #include "llvm/Support/Debug.h"
46 
47 using namespace llvm;
48 
49 #define DEBUG_TYPE "x86-instr-info"
50 
51 #define GET_INSTRINFO_CTOR_DTOR
52 #include "X86GenInstrInfo.inc"
53 
54 static cl::opt<bool>
55  NoFusing("disable-spill-fusing",
56  cl::desc("Disable fusing of spill code into instructions"),
57  cl::Hidden);
58 static cl::opt<bool>
59 PrintFailedFusing("print-failed-fuse-candidates",
60  cl::desc("Print instructions that the allocator wants to"
61  " fuse, but the X86 backend currently can't"),
62  cl::Hidden);
63 static cl::opt<bool>
64 ReMatPICStubLoad("remat-pic-stub-load",
65  cl::desc("Re-materialize load from stub in PIC mode"),
66  cl::init(false), cl::Hidden);
67 static cl::opt<unsigned>
68 PartialRegUpdateClearance("partial-reg-update-clearance",
69  cl::desc("Clearance between two register writes "
70  "for inserting XOR to avoid partial "
71  "register update"),
72  cl::init(64), cl::Hidden);
73 static cl::opt<unsigned>
74 UndefRegClearance("undef-reg-clearance",
75  cl::desc("How many idle instructions we would like before "
76  "certain undef register reads"),
77  cl::init(128), cl::Hidden);
78 
79 
80 // Pin the vtable to this file.
81 void X86InstrInfo::anchor() {}
82 
83 X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
84  : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
85  : X86::ADJCALLSTACKDOWN32),
86  (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
87  : X86::ADJCALLSTACKUP32),
88  X86::CATCHRET,
89  (STI.is64Bit() ? X86::RET64 : X86::RET32)),
90  Subtarget(STI), RI(STI.getTargetTriple()) {
91 }
92 
93 bool
94 X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
95  Register &SrcReg, Register &DstReg,
96  unsigned &SubIdx) const {
97  switch (MI.getOpcode()) {
98  default: break;
99  case X86::MOVSX16rr8:
100  case X86::MOVZX16rr8:
101  case X86::MOVSX32rr8:
102  case X86::MOVZX32rr8:
103  case X86::MOVSX64rr8:
104  if (!Subtarget.is64Bit())
105  // It's not always legal to reference the low 8-bit of the larger
106  // register in 32-bit mode.
107  return false;
108  [[fallthrough]];
109  case X86::MOVSX32rr16:
110  case X86::MOVZX32rr16:
111  case X86::MOVSX64rr16:
112  case X86::MOVSX64rr32: {
113  if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
114  // Be conservative.
115  return false;
116  SrcReg = MI.getOperand(1).getReg();
117  DstReg = MI.getOperand(0).getReg();
118  switch (MI.getOpcode()) {
119  default: llvm_unreachable("Unreachable!");
120  case X86::MOVSX16rr8:
121  case X86::MOVZX16rr8:
122  case X86::MOVSX32rr8:
123  case X86::MOVZX32rr8:
124  case X86::MOVSX64rr8:
125  SubIdx = X86::sub_8bit;
126  break;
127  case X86::MOVSX32rr16:
128  case X86::MOVZX32rr16:
129  case X86::MOVSX64rr16:
130  SubIdx = X86::sub_16bit;
131  break;
132  case X86::MOVSX64rr32:
133  SubIdx = X86::sub_32bit;
134  break;
135  }
136  return true;
137  }
138  }
139  return false;
140 }
141 
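// isDataInvariant() answers whether the instruction's execution time is
// independent of the values in its register operands; the speculative-execution
// hardening passes use this to decide what is safe to leave unguarded. Anything
// not explicitly recognized below is conservatively treated as data-dependent.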
142 bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
143  if (MI.mayLoad() || MI.mayStore())
144  return false;
145 
146  // Some target-independent operations that trivially lower to data-invariant
147  // instructions.
148  if (MI.isCopyLike() || MI.isInsertSubreg())
149  return true;
150 
151  unsigned Opcode = MI.getOpcode();
152  using namespace X86;
153  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
154  // However, they set flags and are perhaps the most surprisingly constant
155  // time operations so we call them out here separately.
156  if (isIMUL(Opcode))
157  return true;
158  // Bit scanning and counting instructions that are somewhat surprisingly
159  // constant time as they scan across bits and do other fairly complex
160  // operations like popcnt, but are believed to be constant time on x86.
161  // However, these set flags.
162  if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
163  isTZCNT(Opcode))
164  return true;
165  // Bit manipulation instructions are effectively combinations of basic
166  // arithmetic ops, and should still execute in constant time. These also
167  // set flags.
168  if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
169  isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
170  isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
171  isTZMSK(Opcode))
172  return true;
173  // Bit extracting and clearing instructions should execute in constant time,
174  // and set flags.
175  if (isBEXTR(Opcode) || isBZHI(Opcode))
176  return true;
177  // Shift and rotate.
178  if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
179  isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
180  return true;
181  // Basic arithmetic is constant time on the input but does set flags.
182  if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
183  isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
184  return true;
185  // Arithmetic with just 32-bit and 64-bit variants and no immediates.
186  if (isADCX(Opcode) || isADOX(Opcode) || isANDN(Opcode))
187  return true;
188  // Unary arithmetic operations.
189  if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
190  return true;
191  // Unlike other arithmetic, NOT doesn't set EFLAGS.
192  if (isNOT(Opcode))
193  return true;
194  // Various move instructions used to zero or sign extend things. Note that we
195  // intentionally don't support the _NOREX variants as we can't handle that
196  // register constraint anyways.
197  if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
198  return true;
199  // Arithmetic instructions that are both constant time and don't set flags.
200  if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
201  return true;
202  // LEA doesn't actually access memory, and its arithmetic is constant time.
203  if (isLEA(Opcode))
204  return true;
205  // By default, assume that the instruction is not data invariant.
206  return false;
207 }
208 
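// Like isDataInvariant(), but for instructions that fold a load: returns true
// only when the loaded value cannot leak through timing before reaching a
// register. The default answer is false, i.e. assume the load leaks immediately.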
209 bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
210  switch (MI.getOpcode()) {
211  default:
212  // By default, assume that the load will immediately leak.
213  return false;
214 
215  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
216  // However, they set flags and are perhaps the most surprisingly constant
217  // time operations so we call them out here separately.
218  case X86::IMUL16rm:
219  case X86::IMUL16rmi8:
220  case X86::IMUL16rmi:
221  case X86::IMUL32rm:
222  case X86::IMUL32rmi8:
223  case X86::IMUL32rmi:
224  case X86::IMUL64rm:
225  case X86::IMUL64rmi32:
226  case X86::IMUL64rmi8:
227 
228  // Bit scanning and counting instructions that are somewhat surprisingly
229  // constant time as they scan across bits and do other fairly complex
230  // operations like popcnt, but are believed to be constant time on x86.
231  // However, these set flags.
232  case X86::BSF16rm:
233  case X86::BSF32rm:
234  case X86::BSF64rm:
235  case X86::BSR16rm:
236  case X86::BSR32rm:
237  case X86::BSR64rm:
238  case X86::LZCNT16rm:
239  case X86::LZCNT32rm:
240  case X86::LZCNT64rm:
241  case X86::POPCNT16rm:
242  case X86::POPCNT32rm:
243  case X86::POPCNT64rm:
244  case X86::TZCNT16rm:
245  case X86::TZCNT32rm:
246  case X86::TZCNT64rm:
247 
248  // Bit manipulation instructions are effectively combinations of basic
249  // arithmetic ops, and should still execute in constant time. These also
250  // set flags.
251  case X86::BLCFILL32rm:
252  case X86::BLCFILL64rm:
253  case X86::BLCI32rm:
254  case X86::BLCI64rm:
255  case X86::BLCIC32rm:
256  case X86::BLCIC64rm:
257  case X86::BLCMSK32rm:
258  case X86::BLCMSK64rm:
259  case X86::BLCS32rm:
260  case X86::BLCS64rm:
261  case X86::BLSFILL32rm:
262  case X86::BLSFILL64rm:
263  case X86::BLSI32rm:
264  case X86::BLSI64rm:
265  case X86::BLSIC32rm:
266  case X86::BLSIC64rm:
267  case X86::BLSMSK32rm:
268  case X86::BLSMSK64rm:
269  case X86::BLSR32rm:
270  case X86::BLSR64rm:
271  case X86::TZMSK32rm:
272  case X86::TZMSK64rm:
273 
274  // Bit extracting and clearing instructions should execute in constant time,
275  // and set flags.
276  case X86::BEXTR32rm:
277  case X86::BEXTR64rm:
278  case X86::BEXTRI32mi:
279  case X86::BEXTRI64mi:
280  case X86::BZHI32rm:
281  case X86::BZHI64rm:
282 
283  // Basic arithmetic is constant time on the input but does set flags.
284  case X86::ADC8rm:
285  case X86::ADC16rm:
286  case X86::ADC32rm:
287  case X86::ADC64rm:
288  case X86::ADCX32rm:
289  case X86::ADCX64rm:
290  case X86::ADD8rm:
291  case X86::ADD16rm:
292  case X86::ADD32rm:
293  case X86::ADD64rm:
294  case X86::ADOX32rm:
295  case X86::ADOX64rm:
296  case X86::AND8rm:
297  case X86::AND16rm:
298  case X86::AND32rm:
299  case X86::AND64rm:
300  case X86::ANDN32rm:
301  case X86::ANDN64rm:
302  case X86::OR8rm:
303  case X86::OR16rm:
304  case X86::OR32rm:
305  case X86::OR64rm:
306  case X86::SBB8rm:
307  case X86::SBB16rm:
308  case X86::SBB32rm:
309  case X86::SBB64rm:
310  case X86::SUB8rm:
311  case X86::SUB16rm:
312  case X86::SUB32rm:
313  case X86::SUB64rm:
314  case X86::XOR8rm:
315  case X86::XOR16rm:
316  case X86::XOR32rm:
317  case X86::XOR64rm:
318 
319  // Integer multiply w/o affecting flags is still believed to be constant
320  // time on x86. Called out separately as this is among the most surprising
321  // instructions to exhibit that behavior.
322  case X86::MULX32rm:
323  case X86::MULX64rm:
324 
325  // Arithmetic instructions that are both constant time and don't set flags.
326  case X86::RORX32mi:
327  case X86::RORX64mi:
328  case X86::SARX32rm:
329  case X86::SARX64rm:
330  case X86::SHLX32rm:
331  case X86::SHLX64rm:
332  case X86::SHRX32rm:
333  case X86::SHRX64rm:
334 
335  // Conversions are believed to be constant time and don't set flags.
336  case X86::CVTTSD2SI64rm:
337  case X86::VCVTTSD2SI64rm:
338  case X86::VCVTTSD2SI64Zrm:
339  case X86::CVTTSD2SIrm:
340  case X86::VCVTTSD2SIrm:
341  case X86::VCVTTSD2SIZrm:
342  case X86::CVTTSS2SI64rm:
343  case X86::VCVTTSS2SI64rm:
344  case X86::VCVTTSS2SI64Zrm:
345  case X86::CVTTSS2SIrm:
346  case X86::VCVTTSS2SIrm:
347  case X86::VCVTTSS2SIZrm:
348  case X86::CVTSI2SDrm:
349  case X86::VCVTSI2SDrm:
350  case X86::VCVTSI2SDZrm:
351  case X86::CVTSI2SSrm:
352  case X86::VCVTSI2SSrm:
353  case X86::VCVTSI2SSZrm:
354  case X86::CVTSI642SDrm:
355  case X86::VCVTSI642SDrm:
356  case X86::VCVTSI642SDZrm:
357  case X86::CVTSI642SSrm:
358  case X86::VCVTSI642SSrm:
359  case X86::VCVTSI642SSZrm:
360  case X86::CVTSS2SDrm:
361  case X86::VCVTSS2SDrm:
362  case X86::VCVTSS2SDZrm:
363  case X86::CVTSD2SSrm:
364  case X86::VCVTSD2SSrm:
365  case X86::VCVTSD2SSZrm:
366  // AVX512 added unsigned integer conversions.
367  case X86::VCVTTSD2USI64Zrm:
368  case X86::VCVTTSD2USIZrm:
369  case X86::VCVTTSS2USI64Zrm:
370  case X86::VCVTTSS2USIZrm:
371  case X86::VCVTUSI2SDZrm:
372  case X86::VCVTUSI642SDZrm:
373  case X86::VCVTUSI2SSZrm:
374  case X86::VCVTUSI642SSZrm:
375 
376  // Loads to register don't set flags.
377  case X86::MOV8rm:
378  case X86::MOV8rm_NOREX:
379  case X86::MOV16rm:
380  case X86::MOV32rm:
381  case X86::MOV64rm:
382  case X86::MOVSX16rm8:
383  case X86::MOVSX32rm16:
384  case X86::MOVSX32rm8:
385  case X86::MOVSX32rm8_NOREX:
386  case X86::MOVSX64rm16:
387  case X86::MOVSX64rm32:
388  case X86::MOVSX64rm8:
389  case X86::MOVZX16rm8:
390  case X86::MOVZX32rm16:
391  case X86::MOVZX32rm8:
392  case X86::MOVZX32rm8_NOREX:
393  case X86::MOVZX64rm16:
394  case X86::MOVZX64rm8:
395  return true;
396  }
397 }
398 
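// Compute how much this instruction adjusts the stack pointer: frame setup and
// destroy pseudos report their net adjustment, calls look ahead to the matching
// ADJCALLSTACKUP pseudo, and PUSHes report their operand size (4 or 8 bytes).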
399 int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
400  const MachineFunction *MF = MI.getParent()->getParent();
401  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
402 
403  if (isFrameInstr(MI)) {
404  int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
405  SPAdj -= getFrameAdjustment(MI);
406  if (!isFrameSetup(MI))
407  SPAdj = -SPAdj;
408  return SPAdj;
409  }
410 
411  // To know whether a call adjusts the stack, we need information
412  // that is bound to the following ADJCALLSTACKUP pseudo.
413  // Look for the next ADJCALLSTACKUP that follows the call.
414  if (MI.isCall()) {
415  const MachineBasicBlock *MBB = MI.getParent();
416  auto I = ++MachineBasicBlock::const_iterator(MI);
417  for (auto E = MBB->end(); I != E; ++I) {
418  if (I->getOpcode() == getCallFrameDestroyOpcode() ||
419  I->isCall())
420  break;
421  }
422 
423  // If we could not find a frame destroy opcode, then it has already
424  // been simplified, so we don't care.
425  if (I->getOpcode() != getCallFrameDestroyOpcode())
426  return 0;
427 
428  return -(I->getOperand(1).getImm());
429  }
430 
431  // Currently handle only PUSHes we can reasonably expect to see
432  // in call sequences
433  switch (MI.getOpcode()) {
434  default:
435  return 0;
436  case X86::PUSH32i8:
437  case X86::PUSH32r:
438  case X86::PUSH32rmm:
439  case X86::PUSH32rmr:
440  case X86::PUSHi32:
441  return 4;
442  case X86::PUSH64i8:
443  case X86::PUSH64r:
444  case X86::PUSH64rmm:
445  case X86::PUSH64rmr:
446  case X86::PUSH64i32:
447  return 8;
448  }
449 }
450 
451 /// Return true and the FrameIndex if the specified
452 /// operand and the following operands form a reference to the stack frame.
453 bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
454  int &FrameIndex) const {
455  if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
456  MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
457  MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
458  MI.getOperand(Op + X86::AddrDisp).isImm() &&
459  MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
460  MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
461  MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
462  FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
463  return true;
464  }
465  return false;
466 }
467 
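// Map a plain load opcode to the number of bytes it reads so the stack-slot
// reload detection below can recognize it; returns false for anything that is
// not a simple full-register load. isFrameStoreOpcode() is the store-side twin.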
468 static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
469  switch (Opcode) {
470  default:
471  return false;
472  case X86::MOV8rm:
473  case X86::KMOVBkm:
474  MemBytes = 1;
475  return true;
476  case X86::MOV16rm:
477  case X86::KMOVWkm:
478  case X86::VMOVSHZrm:
479  case X86::VMOVSHZrm_alt:
480  MemBytes = 2;
481  return true;
482  case X86::MOV32rm:
483  case X86::MOVSSrm:
484  case X86::MOVSSrm_alt:
485  case X86::VMOVSSrm:
486  case X86::VMOVSSrm_alt:
487  case X86::VMOVSSZrm:
488  case X86::VMOVSSZrm_alt:
489  case X86::KMOVDkm:
490  MemBytes = 4;
491  return true;
492  case X86::MOV64rm:
493  case X86::LD_Fp64m:
494  case X86::MOVSDrm:
495  case X86::MOVSDrm_alt:
496  case X86::VMOVSDrm:
497  case X86::VMOVSDrm_alt:
498  case X86::VMOVSDZrm:
499  case X86::VMOVSDZrm_alt:
500  case X86::MMX_MOVD64rm:
501  case X86::MMX_MOVQ64rm:
502  case X86::KMOVQkm:
503  MemBytes = 8;
504  return true;
505  case X86::MOVAPSrm:
506  case X86::MOVUPSrm:
507  case X86::MOVAPDrm:
508  case X86::MOVUPDrm:
509  case X86::MOVDQArm:
510  case X86::MOVDQUrm:
511  case X86::VMOVAPSrm:
512  case X86::VMOVUPSrm:
513  case X86::VMOVAPDrm:
514  case X86::VMOVUPDrm:
515  case X86::VMOVDQArm:
516  case X86::VMOVDQUrm:
517  case X86::VMOVAPSZ128rm:
518  case X86::VMOVUPSZ128rm:
519  case X86::VMOVAPSZ128rm_NOVLX:
520  case X86::VMOVUPSZ128rm_NOVLX:
521  case X86::VMOVAPDZ128rm:
522  case X86::VMOVUPDZ128rm:
523  case X86::VMOVDQU8Z128rm:
524  case X86::VMOVDQU16Z128rm:
525  case X86::VMOVDQA32Z128rm:
526  case X86::VMOVDQU32Z128rm:
527  case X86::VMOVDQA64Z128rm:
528  case X86::VMOVDQU64Z128rm:
529  MemBytes = 16;
530  return true;
531  case X86::VMOVAPSYrm:
532  case X86::VMOVUPSYrm:
533  case X86::VMOVAPDYrm:
534  case X86::VMOVUPDYrm:
535  case X86::VMOVDQAYrm:
536  case X86::VMOVDQUYrm:
537  case X86::VMOVAPSZ256rm:
538  case X86::VMOVUPSZ256rm:
539  case X86::VMOVAPSZ256rm_NOVLX:
540  case X86::VMOVUPSZ256rm_NOVLX:
541  case X86::VMOVAPDZ256rm:
542  case X86::VMOVUPDZ256rm:
543  case X86::VMOVDQU8Z256rm:
544  case X86::VMOVDQU16Z256rm:
545  case X86::VMOVDQA32Z256rm:
546  case X86::VMOVDQU32Z256rm:
547  case X86::VMOVDQA64Z256rm:
548  case X86::VMOVDQU64Z256rm:
549  MemBytes = 32;
550  return true;
551  case X86::VMOVAPSZrm:
552  case X86::VMOVUPSZrm:
553  case X86::VMOVAPDZrm:
554  case X86::VMOVUPDZrm:
555  case X86::VMOVDQU8Zrm:
556  case X86::VMOVDQU16Zrm:
557  case X86::VMOVDQA32Zrm:
558  case X86::VMOVDQU32Zrm:
559  case X86::VMOVDQA64Zrm:
560  case X86::VMOVDQU64Zrm:
561  MemBytes = 64;
562  return true;
563  }
564 }
565 
566 static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
567  switch (Opcode) {
568  default:
569  return false;
570  case X86::MOV8mr:
571  case X86::KMOVBmk:
572  MemBytes = 1;
573  return true;
574  case X86::MOV16mr:
575  case X86::KMOVWmk:
576  case X86::VMOVSHZmr:
577  MemBytes = 2;
578  return true;
579  case X86::MOV32mr:
580  case X86::MOVSSmr:
581  case X86::VMOVSSmr:
582  case X86::VMOVSSZmr:
583  case X86::KMOVDmk:
584  MemBytes = 4;
585  return true;
586  case X86::MOV64mr:
587  case X86::ST_FpP64m:
588  case X86::MOVSDmr:
589  case X86::VMOVSDmr:
590  case X86::VMOVSDZmr:
591  case X86::MMX_MOVD64mr:
592  case X86::MMX_MOVQ64mr:
593  case X86::MMX_MOVNTQmr:
594  case X86::KMOVQmk:
595  MemBytes = 8;
596  return true;
597  case X86::MOVAPSmr:
598  case X86::MOVUPSmr:
599  case X86::MOVAPDmr:
600  case X86::MOVUPDmr:
601  case X86::MOVDQAmr:
602  case X86::MOVDQUmr:
603  case X86::VMOVAPSmr:
604  case X86::VMOVUPSmr:
605  case X86::VMOVAPDmr:
606  case X86::VMOVUPDmr:
607  case X86::VMOVDQAmr:
608  case X86::VMOVDQUmr:
609  case X86::VMOVUPSZ128mr:
610  case X86::VMOVAPSZ128mr:
611  case X86::VMOVUPSZ128mr_NOVLX:
612  case X86::VMOVAPSZ128mr_NOVLX:
613  case X86::VMOVUPDZ128mr:
614  case X86::VMOVAPDZ128mr:
615  case X86::VMOVDQA32Z128mr:
616  case X86::VMOVDQU32Z128mr:
617  case X86::VMOVDQA64Z128mr:
618  case X86::VMOVDQU64Z128mr:
619  case X86::VMOVDQU8Z128mr:
620  case X86::VMOVDQU16Z128mr:
621  MemBytes = 16;
622  return true;
623  case X86::VMOVUPSYmr:
624  case X86::VMOVAPSYmr:
625  case X86::VMOVUPDYmr:
626  case X86::VMOVAPDYmr:
627  case X86::VMOVDQUYmr:
628  case X86::VMOVDQAYmr:
629  case X86::VMOVUPSZ256mr:
630  case X86::VMOVAPSZ256mr:
631  case X86::VMOVUPSZ256mr_NOVLX:
632  case X86::VMOVAPSZ256mr_NOVLX:
633  case X86::VMOVUPDZ256mr:
634  case X86::VMOVAPDZ256mr:
635  case X86::VMOVDQU8Z256mr:
636  case X86::VMOVDQU16Z256mr:
637  case X86::VMOVDQA32Z256mr:
638  case X86::VMOVDQU32Z256mr:
639  case X86::VMOVDQA64Z256mr:
640  case X86::VMOVDQU64Z256mr:
641  MemBytes = 32;
642  return true;
643  case X86::VMOVUPSZmr:
644  case X86::VMOVAPSZmr:
645  case X86::VMOVUPDZmr:
646  case X86::VMOVAPDZmr:
647  case X86::VMOVDQU8Zmr:
648  case X86::VMOVDQU16Zmr:
649  case X86::VMOVDQA32Zmr:
650  case X86::VMOVDQU32Zmr:
651  case X86::VMOVDQA64Zmr:
652  case X86::VMOVDQU64Zmr:
653  MemBytes = 64;
654  return true;
655  }
656  return false;
657 }
658 
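// The isLoadFromStackSlot/isStoreToStackSlot overloads below recognize simple
// spill reloads and spills: a frame-index base, scale 1, no index register, and
// zero displacement. The *PostFE variants additionally fall back to the memory
// operands once frame indices have been eliminated.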
659 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
660  int &FrameIndex) const {
661  unsigned Dummy;
662  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
663 }
664 
665 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
666  int &FrameIndex,
667  unsigned &MemBytes) const {
668  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
669  if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
670  return MI.getOperand(0).getReg();
671  return 0;
672 }
673 
674 unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
675  int &FrameIndex) const {
676  unsigned Dummy;
677  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
678  unsigned Reg;
679  if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
680  return Reg;
681  // Check for post-frame index elimination operations
682  SmallVector<const MachineMemOperand *, 1> Accesses;
683  if (hasLoadFromStackSlot(MI, Accesses)) {
684  FrameIndex =
685  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
686  ->getFrameIndex();
687  return MI.getOperand(0).getReg();
688  }
689  }
690  return 0;
691 }
692 
693 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
694  int &FrameIndex) const {
695  unsigned Dummy;
696  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
697 }
698 
699 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
700  int &FrameIndex,
701  unsigned &MemBytes) const {
702  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
703  if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
704  isFrameOperand(MI, 0, FrameIndex))
705  return MI.getOperand(X86::AddrNumOperands).getReg();
706  return 0;
707 }
708 
709 unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
710  int &FrameIndex) const {
711  unsigned Dummy;
712  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
713  unsigned Reg;
714  if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
715  return Reg;
716  // Check for post-frame index elimination operations
717  SmallVector<const MachineMemOperand *, 1> Accesses;
718  if (hasStoreToStackSlot(MI, Accesses)) {
719  FrameIndex =
720  cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
721  ->getFrameIndex();
722  return MI.getOperand(X86::AddrNumOperands).getReg();
723  }
724  }
725  return 0;
726 }
727 
728 /// Return true if register is PIC base, i.e. defined by X86::MOVPC32r.
729 static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
730  // Don't waste compile time scanning use-def chains of physregs.
731  if (!BaseReg.isVirtual())
732  return false;
733  bool isPICBase = false;
734  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
735  E = MRI.def_instr_end(); I != E; ++I) {
736  MachineInstr *DefMI = &*I;
737  if (DefMI->getOpcode() != X86::MOVPC32r)
738  return false;
739  assert(!isPICBase && "More than one PIC base?");
740  isPICBase = true;
741  }
742  return isPICBase;
743 }
744 
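// An opcode flagged rematerializable can be recomputed at its use instead of
// being spilled. Constant materializations are always safe; loads qualify only
// when they read dereferenceable invariant memory through a constant address
// (no base, RIP, or the PIC base), and LEAs only when their address is formed
// from constants, a frame index, a global, or the PIC base.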
745 bool X86InstrInfo::isReallyTriviallyReMaterializable(
746  const MachineInstr &MI) const {
747  switch (MI.getOpcode()) {
748  default:
749  // This function should only be called for opcodes with the ReMaterializable
750  // flag set.
751  llvm_unreachable("Unknown rematerializable operation!");
752  break;
753 
754  case X86::LOAD_STACK_GUARD:
755  case X86::AVX1_SETALLONES:
756  case X86::AVX2_SETALLONES:
757  case X86::AVX512_128_SET0:
758  case X86::AVX512_256_SET0:
759  case X86::AVX512_512_SET0:
760  case X86::AVX512_512_SETALLONES:
761  case X86::AVX512_FsFLD0SD:
762  case X86::AVX512_FsFLD0SH:
763  case X86::AVX512_FsFLD0SS:
764  case X86::AVX512_FsFLD0F128:
765  case X86::AVX_SET0:
766  case X86::FsFLD0SD:
767  case X86::FsFLD0SS:
768  case X86::FsFLD0SH:
769  case X86::FsFLD0F128:
770  case X86::KSET0D:
771  case X86::KSET0Q:
772  case X86::KSET0W:
773  case X86::KSET1D:
774  case X86::KSET1Q:
775  case X86::KSET1W:
776  case X86::MMX_SET0:
777  case X86::MOV32ImmSExti8:
778  case X86::MOV32r0:
779  case X86::MOV32r1:
780  case X86::MOV32r_1:
781  case X86::MOV32ri64:
782  case X86::MOV64ImmSExti8:
783  case X86::V_SET0:
784  case X86::V_SETALLONES:
785  case X86::MOV16ri:
786  case X86::MOV32ri:
787  case X86::MOV64ri:
788  case X86::MOV64ri32:
789  case X86::MOV8ri:
790  case X86::PTILEZEROV:
791  return true;
792 
793  case X86::MOV8rm:
794  case X86::MOV8rm_NOREX:
795  case X86::MOV16rm:
796  case X86::MOV32rm:
797  case X86::MOV64rm:
798  case X86::MOVSSrm:
799  case X86::MOVSSrm_alt:
800  case X86::MOVSDrm:
801  case X86::MOVSDrm_alt:
802  case X86::MOVAPSrm:
803  case X86::MOVUPSrm:
804  case X86::MOVAPDrm:
805  case X86::MOVUPDrm:
806  case X86::MOVDQArm:
807  case X86::MOVDQUrm:
808  case X86::VMOVSSrm:
809  case X86::VMOVSSrm_alt:
810  case X86::VMOVSDrm:
811  case X86::VMOVSDrm_alt:
812  case X86::VMOVAPSrm:
813  case X86::VMOVUPSrm:
814  case X86::VMOVAPDrm:
815  case X86::VMOVUPDrm:
816  case X86::VMOVDQArm:
817  case X86::VMOVDQUrm:
818  case X86::VMOVAPSYrm:
819  case X86::VMOVUPSYrm:
820  case X86::VMOVAPDYrm:
821  case X86::VMOVUPDYrm:
822  case X86::VMOVDQAYrm:
823  case X86::VMOVDQUYrm:
824  case X86::MMX_MOVD64rm:
825  case X86::MMX_MOVQ64rm:
826  // AVX-512
827  case X86::VMOVSSZrm:
828  case X86::VMOVSSZrm_alt:
829  case X86::VMOVSDZrm:
830  case X86::VMOVSDZrm_alt:
831  case X86::VMOVSHZrm:
832  case X86::VMOVSHZrm_alt:
833  case X86::VMOVAPDZ128rm:
834  case X86::VMOVAPDZ256rm:
835  case X86::VMOVAPDZrm:
836  case X86::VMOVAPSZ128rm:
837  case X86::VMOVAPSZ256rm:
838  case X86::VMOVAPSZ128rm_NOVLX:
839  case X86::VMOVAPSZ256rm_NOVLX:
840  case X86::VMOVAPSZrm:
841  case X86::VMOVDQA32Z128rm:
842  case X86::VMOVDQA32Z256rm:
843  case X86::VMOVDQA32Zrm:
844  case X86::VMOVDQA64Z128rm:
845  case X86::VMOVDQA64Z256rm:
846  case X86::VMOVDQA64Zrm:
847  case X86::VMOVDQU16Z128rm:
848  case X86::VMOVDQU16Z256rm:
849  case X86::VMOVDQU16Zrm:
850  case X86::VMOVDQU32Z128rm:
851  case X86::VMOVDQU32Z256rm:
852  case X86::VMOVDQU32Zrm:
853  case X86::VMOVDQU64Z128rm:
854  case X86::VMOVDQU64Z256rm:
855  case X86::VMOVDQU64Zrm:
856  case X86::VMOVDQU8Z128rm:
857  case X86::VMOVDQU8Z256rm:
858  case X86::VMOVDQU8Zrm:
859  case X86::VMOVUPDZ128rm:
860  case X86::VMOVUPDZ256rm:
861  case X86::VMOVUPDZrm:
862  case X86::VMOVUPSZ128rm:
863  case X86::VMOVUPSZ256rm:
864  case X86::VMOVUPSZ128rm_NOVLX:
865  case X86::VMOVUPSZ256rm_NOVLX:
866  case X86::VMOVUPSZrm: {
867  // Loads from constant pools are trivially rematerializable.
868  if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
869  MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
870  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
871  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
872  MI.isDereferenceableInvariantLoad()) {
873  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
874  if (BaseReg == 0 || BaseReg == X86::RIP)
875  return true;
876  // Allow re-materialization of PIC load.
877  if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
878  return false;
879  const MachineFunction &MF = *MI.getParent()->getParent();
880  const MachineRegisterInfo &MRI = MF.getRegInfo();
881  return regIsPICBase(BaseReg, MRI);
882  }
883  return false;
884  }
885 
886  case X86::LEA32r:
887  case X86::LEA64r: {
888  if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
889  MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
890  MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
891  !MI.getOperand(1 + X86::AddrDisp).isReg()) {
892  // lea fi#, lea GV, etc. are all rematerializable.
893  if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
894  return true;
895  Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
896  if (BaseReg == 0)
897  return true;
898  // Allow re-materialization of lea PICBase + x.
899  const MachineFunction &MF = *MI.getParent()->getParent();
900  const MachineRegisterInfo &MRI = MF.getRegInfo();
901  return regIsPICBase(BaseReg, MRI);
902  }
903  return false;
904  }
905  }
906 }
907 
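// Re-emit Orig at the given insertion point. MOV32r0/MOV32r1/MOV32r_1 clobber
// EFLAGS, so when EFLAGS is live across the insertion point they are lowered to
// a plain MOV32ri with the equivalent immediate; otherwise the instruction is
// cloned verbatim. In both cases the destination register is then rewritten.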
908 void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
909  MachineBasicBlock::iterator I,
910  Register DestReg, unsigned SubIdx,
911  const MachineInstr &Orig,
912  const TargetRegisterInfo &TRI) const {
913  bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
914  if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
915  MachineBasicBlock::LQR_Dead) {
916  // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
917  // effects.
918  int Value;
919  switch (Orig.getOpcode()) {
920  case X86::MOV32r0: Value = 0; break;
921  case X86::MOV32r1: Value = 1; break;
922  case X86::MOV32r_1: Value = -1; break;
923  default:
924  llvm_unreachable("Unexpected instruction!");
925  }
926 
927  const DebugLoc &DL = Orig.getDebugLoc();
928  BuildMI(MBB, I, DL, get(X86::MOV32ri))
929  .add(Orig.getOperand(0))
930  .addImm(Value);
931  } else {
932  MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
933  MBB.insert(I, MI);
934  }
935 
936  MachineInstr &NewMI = *std::prev(I);
937  NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
938 }
939 
940 /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
941 static bool hasLiveCondCodeDef(MachineInstr &MI) {
942  for (const MachineOperand &MO : MI.operands()) {
943  if (MO.isReg() && MO.isDef() &&
944  MO.getReg() == X86::EFLAGS && !MO.isDead()) {
945  return true;
946  }
947  }
948  return false;
949 }
950 
951 /// Return the shift count of a shift instruction, truncated to the width the hardware uses (6 bits with REX.W, 5 bits otherwise).
952 inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
953  unsigned ShiftAmtOperandIdx) {
954  // The shift count is six bits with the REX.W prefix and five bits without.
955  unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
956  unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
957  return Imm & ShiftCountMask;
958 }
959 
960 /// Check whether the given shift count can be
961 /// represented by a LEA instruction.
962 inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
963  // Left shift instructions can be transformed into load-effective-address
964  // instructions if we can encode them appropriately.
965  // A LEA instruction utilizes a SIB byte to encode its scale factor.
966  // The SIB.scale field is two bits wide which means that we can encode any
967  // shift amount less than 4.
968  return ShAmt < 4 && ShAmt > 0;
969 }
970 
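// Look through a SUBREG_TO_REG feeding a TEST64rr for an AND that already set
// EFLAGS with no intervening clobber. If one is found it is returned through
// AndInstr so the caller can drop the redundant TEST; SF is deliberately
// poisoned (NoSignFlag) because AND and TEST may disagree on it.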
971 static bool findRedundantFlagInstr(MachineInstr &CmpInstr,
972  MachineInstr &CmpValDefInstr,
973  const MachineRegisterInfo *MRI,
974  MachineInstr **AndInstr,
975  const TargetRegisterInfo *TRI,
976  bool &NoSignFlag, bool &ClearsOverflowFlag) {
977  if (CmpValDefInstr.getOpcode() != X86::SUBREG_TO_REG)
978  return false;
979 
980  if (CmpInstr.getOpcode() != X86::TEST64rr)
981  return false;
982 
983  // CmpInstr is a TEST64rr instruction, and `X86InstrInfo::analyzeCompare`
984  // guarantees that it's analyzable only if two registers are identical.
985  assert(
986  (CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
987  "CmpInstr is an analyzable TEST64rr, and `X86InstrInfo::analyzeCompare` "
988  "requires two reg operands are the same.");
989 
990  // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
991  // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
992  // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
993  // redundant.
994  assert(
995  (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
996  "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG.");
997 
998  // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is typically
999  // 0.
1000  if (CmpValDefInstr.getOperand(1).getImm() != 0)
1001  return false;
1002 
1003  // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
1004  // sub_32bit or sub_xmm.
1005  if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
1006  return false;
1007 
1008  MachineInstr *VregDefInstr =
1009  MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
1010 
1011  assert(VregDefInstr && "Must have a definition (SSA)");
1012 
1013  // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
1014  // to simplify the subsequent analysis.
1015  //
1016  // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
1017  // `CmpValDefInstr.getParent()`, this could be handled.
1018  if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
1019  return false;
1020 
1021  if (X86::isAND(VregDefInstr->getOpcode())) {
1022  // Get a sequence of instructions like
1023  // %reg = and* ... // Set EFLAGS
1024  // ... // EFLAGS not changed
1025  // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
1026  // test64rr %extended_reg, %extended_reg, implicit-def $eflags
1027  //
1028  // If subsequent readers use a subset of bits that don't change
1029  // after `and*` instructions, it's likely that the test64rr could
1030  // be optimized away.
1031  for (const MachineInstr &Instr :
1032  make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
1033  MachineBasicBlock::iterator(CmpValDefInstr))) {
1034  // There are instructions between 'VregDefInstr' and
1035  // 'CmpValDefInstr' that modifies EFLAGS.
1036  if (Instr.modifiesRegister(X86::EFLAGS, TRI))
1037  return false;
1038  }
1039 
1040  *AndInstr = VregDefInstr;
1041 
1042  // AND instruction will essentially update SF and clear OF, so
1043  // NoSignFlag should be false in the sense that SF is modified by `AND`.
1044  //
1045  // However, the implementation artificially sets `NoSignFlag` to true
1046  // to poison the SF bit; that is to say, if SF is looked at later, the
1047  // optimization (to erase TEST64rr) will be disabled.
1048  //
1049  // The reason to poison the SF bit is that its value could differ between
1050  // the `AND` and `TEST` operations; the sign bit is not known after `AND`,
1051  // but is known to be 0 as a result of `TEST64rr`.
1052  //
1053  // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
1054  // the AND instruction and using the static information to guide peephole
1055  // optimization if possible. For example, it's possible to fold a
1056  // conditional move into a copy if the relevant EFLAG bits could be deduced
1057  // from an immediate operand of the AND operation.
1058  //
1059  NoSignFlag = true;
1060  // ClearsOverflowFlag is true for AND operation (no surprise).
1061  ClearsOverflowFlag = true;
1062  return true;
1063  }
1064  return false;
1065 }
1066 
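// classifyLEAReg() picks a source register the requested LEA opcode can legally
// use: for LEA32r/LEA64r it only constrains the register class (optionally
// forbidding SP), while for LEA64_32r it widens a 32-bit source to a 64-bit
// register, inserting a COPY for virtual registers and updating liveness.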
1067 bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
1068  unsigned Opc, bool AllowSP, Register &NewSrc,
1069  bool &isKill, MachineOperand &ImplicitOp,
1070  LiveVariables *LV, LiveIntervals *LIS) const {
1071  MachineFunction &MF = *MI.getParent()->getParent();
1072  const TargetRegisterClass *RC;
1073  if (AllowSP) {
1074  RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1075  } else {
1076  RC = Opc != X86::LEA32r ?
1077  &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1078  }
1079  Register SrcReg = Src.getReg();
1080  isKill = MI.killsRegister(SrcReg);
1081 
1082  // For both LEA64 and LEA32 the register already has essentially the right
1083  // type (32-bit or 64-bit) we may just need to forbid SP.
1084  if (Opc != X86::LEA64_32r) {
1085  NewSrc = SrcReg;
1086  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1087 
1088  if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1089  return false;
1090 
1091  return true;
1092  }
1093 
1094  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1095  // another we need to add 64-bit registers to the final MI.
1096  if (SrcReg.isPhysical()) {
1097  ImplicitOp = Src;
1098  ImplicitOp.setImplicit();
1099 
1100  NewSrc = getX86SubSuperRegister(SrcReg, 64);
1101  assert(!Src.isUndef() && "Undef op doesn't need optimization");
1102  } else {
1103  // Virtual register of the wrong class; we have to create a temporary 64-bit
1104  // vreg to feed into the LEA.
1105  NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1106  MachineInstr *Copy =
1107  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1108  .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1109  .addReg(SrcReg, getKillRegState(isKill));
1110 
1111  // Which is obviously going to be dead after we're done with it.
1112  isKill = true;
1113 
1114  if (LV)
1115  LV->replaceKillInstruction(SrcReg, MI, *Copy);
1116 
1117  if (LIS) {
1118  SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1119  SlotIndex Idx = LIS->getInstructionIndex(MI);
1120  LiveInterval &LI = LIS->getInterval(SrcReg);
1121  LiveRange::Segment *S = LI.getSegmentContaining(Idx);
1122  if (S->end.getBaseIndex() == Idx)
1123  S->end = CopyIdx.getRegSlot();
1124  }
1125  }
1126 
1127  // We've set all the parameters without issue.
1128  return true;
1129 }
1130 
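// Helper for convertToThreeAddress(): 8- and 16-bit two-address ops are widened
// into a 32-bit LEA (LEA64_32r) on 64-bit targets by copying the sources into
// implicitly-defined 64-bit registers and copying the low 8/16 bits back out;
// live variables/intervals are then repaired around the new sequence.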
1131 MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1132  MachineInstr &MI,
1133  LiveVariables *LV,
1134  LiveIntervals *LIS,
1135  bool Is8BitOp) const {
1136  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1137  MachineBasicBlock &MBB = *MI.getParent();
1138  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
1139  assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1140  *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1141  "Unexpected type for LEA transform");
1142 
1143  // TODO: For a 32-bit target, we need to adjust the LEA variables with
1144  // something like this:
1145  // Opcode = X86::LEA32r;
1146  // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1147  // OutRegLEA =
1148  // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1149  // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1150  if (!Subtarget.is64Bit())
1151  return nullptr;
1152 
1153  unsigned Opcode = X86::LEA64_32r;
1154  Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1155  Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1156  Register InRegLEA2;
1157 
1158  // Build and insert into an implicit UNDEF value. This is OK because
1159  // we will be shifting and then extracting the lower 8/16-bits.
1160  // This has the potential to cause a partial register stall, e.g.:
1161  // movw (%rbp,%rcx,2), %dx
1162  // leal -65(%rdx), %esi
1163  // But testing has shown this *does* help performance in 64-bit mode (at
1164  // least on modern x86 machines).
1165  MachineBasicBlock::iterator MBBI = MI.getIterator();
1166  Register Dest = MI.getOperand(0).getReg();
1167  Register Src = MI.getOperand(1).getReg();
1168  Register Src2;
1169  bool IsDead = MI.getOperand(0).isDead();
1170  bool IsKill = MI.getOperand(1).isKill();
1171  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1172  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1173  MachineInstr *ImpDef =
1174  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1175  MachineInstr *InsMI =
1176  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1177  .addReg(InRegLEA, RegState::Define, SubReg)
1178  .addReg(Src, getKillRegState(IsKill));
1179  MachineInstr *ImpDef2 = nullptr;
1180  MachineInstr *InsMI2 = nullptr;
1181 
1182  MachineInstrBuilder MIB =
1183  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1184  switch (MIOpc) {
1185  default: llvm_unreachable("Unreachable!");
1186  case X86::SHL8ri:
1187  case X86::SHL16ri: {
1188  unsigned ShAmt = MI.getOperand(2).getImm();
1189  MIB.addReg(0)
1190  .addImm(1LL << ShAmt)
1191  .addReg(InRegLEA, RegState::Kill)
1192  .addImm(0)
1193  .addReg(0);
1194  break;
1195  }
1196  case X86::INC8r:
1197  case X86::INC16r:
1198  addRegOffset(MIB, InRegLEA, true, 1);
1199  break;
1200  case X86::DEC8r:
1201  case X86::DEC16r:
1202  addRegOffset(MIB, InRegLEA, true, -1);
1203  break;
1204  case X86::ADD8ri:
1205  case X86::ADD8ri_DB:
1206  case X86::ADD16ri:
1207  case X86::ADD16ri8:
1208  case X86::ADD16ri_DB:
1209  case X86::ADD16ri8_DB:
1210  addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1211  break;
1212  case X86::ADD8rr:
1213  case X86::ADD8rr_DB:
1214  case X86::ADD16rr:
1215  case X86::ADD16rr_DB: {
1216  Src2 = MI.getOperand(2).getReg();
1217  bool IsKill2 = MI.getOperand(2).isKill();
1218  assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1219  if (Src == Src2) {
1220  // ADD8rr/ADD16rr killed %reg1028, %reg1028
1221  // just a single insert_subreg.
1222  addRegReg(MIB, InRegLEA, true, InRegLEA, false);
1223  } else {
1224  if (Subtarget.is64Bit())
1225  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1226  else
1227  InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1228  // Build and insert into an implicit UNDEF value. This is OK because
1229  // we will be shifting and then extracting the lower 8/16-bits.
1230  ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
1231  InRegLEA2);
1232  InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1233  .addReg(InRegLEA2, RegState::Define, SubReg)
1234  .addReg(Src2, getKillRegState(IsKill2));
1235  addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
1236  }
1237  if (LV && IsKill2 && InsMI2)
1238  LV->replaceKillInstruction(Src2, MI, *InsMI2);
1239  break;
1240  }
1241  }
1242 
1243  MachineInstr *NewMI = MIB;
1244  MachineInstr *ExtMI =
1245  BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1246  .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
1247  .addReg(OutRegLEA, RegState::Kill, SubReg);
1248 
1249  if (LV) {
1250  // Update live variables.
1251  LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1252  if (InRegLEA2)
1253  LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
1254  LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1255  if (IsKill)
1256  LV->replaceKillInstruction(Src, MI, *InsMI);
1257  if (IsDead)
1258  LV->replaceKillInstruction(Dest, MI, *ExtMI);
1259  }
1260 
1261  if (LIS) {
1262  LIS->InsertMachineInstrInMaps(*ImpDef);
1263  SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
1264  if (ImpDef2)
1265  LIS->InsertMachineInstrInMaps(*ImpDef2);
1266  SlotIndex Ins2Idx;
1267  if (InsMI2)
1268  Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
1269  SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1270  SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
1271  LIS->getInterval(InRegLEA);
1272  LIS->getInterval(OutRegLEA);
1273  if (InRegLEA2)
1274  LIS->getInterval(InRegLEA2);
1275 
1276  // Move the use of Src up to InsMI.
1277  LiveInterval &SrcLI = LIS->getInterval(Src);
1278  LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
1279  if (SrcSeg->end == NewIdx.getRegSlot())
1280  SrcSeg->end = InsIdx.getRegSlot();
1281 
1282  if (InsMI2) {
1283  // Move the use of Src2 up to InsMI2.
1284  LiveInterval &Src2LI = LIS->getInterval(Src2);
1285  LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
1286  if (Src2Seg->end == NewIdx.getRegSlot())
1287  Src2Seg->end = Ins2Idx.getRegSlot();
1288  }
1289 
1290  // Move the definition of Dest down to ExtMI.
1291  LiveInterval &DestLI = LIS->getInterval(Dest);
1292  LiveRange::Segment *DestSeg =
1293  DestLI.getSegmentContaining(NewIdx.getRegSlot());
1294  assert(DestSeg->start == NewIdx.getRegSlot() &&
1295  DestSeg->valno->def == NewIdx.getRegSlot());
1296  DestSeg->start = ExtIdx.getRegSlot();
1297  DestSeg->valno->def = ExtIdx.getRegSlot();
1298  }
1299 
1300  return ExtMI;
1301 }
1302 
1303 /// This method must be implemented by targets that
1304 /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1305 /// may be able to convert a two-address instruction into a true
1306 /// three-address instruction on demand. This allows the X86 target (for
1307 /// example) to convert ADD and SHL instructions into LEA instructions if they
1308 /// would require register copies due to two-addressness.
1309 ///
1310 /// This method returns a null pointer if the transformation cannot be
1311 /// performed, otherwise it returns the new instruction.
1312 ///
1313 MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
1314  LiveVariables *LV,
1315  LiveIntervals *LIS) const {
1316  // The following opcodes also set the condition code register(s). Only
1317  // convert them to an equivalent LEA if the condition code register defs
1318  // are dead!
1319  if (hasLiveCondCodeDef(MI))
1320  return nullptr;
1321 
1322  MachineFunction &MF = *MI.getParent()->getParent();
1323  // All instructions input are two-addr instructions. Get the known operands.
1324  const MachineOperand &Dest = MI.getOperand(0);
1325  const MachineOperand &Src = MI.getOperand(1);
1326 
1327  // Ideally, operations with undef should be folded before we get here, but we
1328  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1329  // Without this, we have to forward undef state to new register operands to
1330  // avoid machine verifier errors.
1331  if (Src.isUndef())
1332  return nullptr;
1333  if (MI.getNumOperands() > 2)
1334  if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1335  return nullptr;
1336 
1337  MachineInstr *NewMI = nullptr;
1338  Register SrcReg, SrcReg2;
1339  bool Is64Bit = Subtarget.is64Bit();
1340 
1341  bool Is8BitOp = false;
1342  unsigned NumRegOperands = 2;
1343  unsigned MIOpc = MI.getOpcode();
1344  switch (MIOpc) {
1345  default: llvm_unreachable("Unreachable!");
1346  case X86::SHL64ri: {
1347  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1348  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1349  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1350 
1351  // LEA can't handle RSP.
1352  if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1353  Src.getReg(), &X86::GR64_NOSPRegClass))
1354  return nullptr;
1355 
1356  NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1357  .add(Dest)
1358  .addReg(0)
1359  .addImm(1LL << ShAmt)
1360  .add(Src)
1361  .addImm(0)
1362  .addReg(0);
1363  break;
1364  }
1365  case X86::SHL32ri: {
1366  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1367  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1368  if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1369 
1370  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1371 
1372  // LEA can't handle ESP.
1373  bool isKill;
1374  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1375  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1376  ImplicitOp, LV, LIS))
1377  return nullptr;
1378 
1379  MachineInstrBuilder MIB =
1380  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1381  .add(Dest)
1382  .addReg(0)
1383  .addImm(1LL << ShAmt)
1384  .addReg(SrcReg, getKillRegState(isKill))
1385  .addImm(0)
1386  .addReg(0);
1387  if (ImplicitOp.getReg() != 0)
1388  MIB.add(ImplicitOp);
1389  NewMI = MIB;
1390 
1391  // Add kills if classifyLEAReg created a new register.
1392  if (LV && SrcReg != Src.getReg())
1393  LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1394  break;
1395  }
1396  case X86::SHL8ri:
1397  Is8BitOp = true;
1398  [[fallthrough]];
1399  case X86::SHL16ri: {
1400  assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1401  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1402  if (!isTruncatedShiftCountForLEA(ShAmt))
1403  return nullptr;
1404  return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1405  }
1406  case X86::INC64r:
1407  case X86::INC32r: {
1408  assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1409  unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
1410  (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1411  bool isKill;
1412  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1413  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1414  ImplicitOp, LV, LIS))
1415  return nullptr;
1416 
1417  MachineInstrBuilder MIB =
1418  BuildMI(MF, MI.getDebugLoc(), get(Opc))
1419  .add(Dest)
1420  .addReg(SrcReg, getKillRegState(isKill));
1421  if (ImplicitOp.getReg() != 0)
1422  MIB.add(ImplicitOp);
1423 
1424  NewMI = addOffset(MIB, 1);
1425 
1426  // Add kills if classifyLEAReg created a new register.
1427  if (LV && SrcReg != Src.getReg())
1428  LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1429  break;
1430  }
1431  case X86::DEC64r:
1432  case X86::DEC32r: {
1433  assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1434  unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
1435  : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1436 
1437  bool isKill;
1438  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1439  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1440  ImplicitOp, LV, LIS))
1441  return nullptr;
1442 
1443  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1444  .add(Dest)
1445  .addReg(SrcReg, getKillRegState(isKill));
1446  if (ImplicitOp.getReg() != 0)
1447  MIB.add(ImplicitOp);
1448 
1449  NewMI = addOffset(MIB, -1);
1450 
1451  // Add kills if classifyLEAReg created a new register.
1452  if (LV && SrcReg != Src.getReg())
1453  LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1454  break;
1455  }
1456  case X86::DEC8r:
1457  case X86::INC8r:
1458  Is8BitOp = true;
1459  [[fallthrough]];
1460  case X86::DEC16r:
1461  case X86::INC16r:
1462  return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1463  case X86::ADD64rr:
1464  case X86::ADD64rr_DB:
1465  case X86::ADD32rr:
1466  case X86::ADD32rr_DB: {
1467  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1468  unsigned Opc;
1469  if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1470  Opc = X86::LEA64r;
1471  else
1472  Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1473 
1474  const MachineOperand &Src2 = MI.getOperand(2);
1475  bool isKill2;
1476  MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1477  if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2,
1478  ImplicitOp2, LV, LIS))
1479  return nullptr;
1480 
1481  bool isKill;
1482  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1483  if (Src.getReg() == Src2.getReg()) {
1484  // Don't call classifyLEAReg a second time on the same register, in case
1485  // the first call inserted a COPY from Src2 and marked it as killed.
1486  isKill = isKill2;
1487  SrcReg = SrcReg2;
1488  } else {
1489  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1490  ImplicitOp, LV, LIS))
1491  return nullptr;
1492  }
1493 
1494  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1495  if (ImplicitOp.getReg() != 0)
1496  MIB.add(ImplicitOp);
1497  if (ImplicitOp2.getReg() != 0)
1498  MIB.add(ImplicitOp2);
1499 
1500  NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1501 
1502  // Add kills if classifyLEAReg created a new register.
1503  if (LV) {
1504  if (SrcReg2 != Src2.getReg())
1505  LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
1506  if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
1507  LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1508  }
1509  NumRegOperands = 3;
1510  break;
1511  }
1512  case X86::ADD8rr:
1513  case X86::ADD8rr_DB:
1514  Is8BitOp = true;
1515  [[fallthrough]];
1516  case X86::ADD16rr:
1517  case X86::ADD16rr_DB:
1518  return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1519  case X86::ADD64ri32:
1520  case X86::ADD64ri8:
1521  case X86::ADD64ri32_DB:
1522  case X86::ADD64ri8_DB:
1523  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1524  NewMI = addOffset(
1525  BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1526  MI.getOperand(2));
1527  break;
1528  case X86::ADD32ri:
1529  case X86::ADD32ri8:
1530  case X86::ADD32ri_DB:
1531  case X86::ADD32ri8_DB: {
1532  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1533  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1534 
1535  bool isKill;
1536  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1537  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1538  ImplicitOp, LV, LIS))
1539  return nullptr;
1540 
1541  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1542  .add(Dest)
1543  .addReg(SrcReg, getKillRegState(isKill));
1544  if (ImplicitOp.getReg() != 0)
1545  MIB.add(ImplicitOp);
1546 
1547  NewMI = addOffset(MIB, MI.getOperand(2));
1548 
1549  // Add kills if classifyLEAReg created a new register.
1550  if (LV && SrcReg != Src.getReg())
1551  LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1552  break;
1553  }
1554  case X86::ADD8ri:
1555  case X86::ADD8ri_DB:
1556  Is8BitOp = true;
1557  [[fallthrough]];
1558  case X86::ADD16ri:
1559  case X86::ADD16ri8:
1560  case X86::ADD16ri_DB:
1561  case X86::ADD16ri8_DB:
1562  return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1563  case X86::SUB8ri:
1564  case X86::SUB16ri8:
1565  case X86::SUB16ri:
1566  /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1567  return nullptr;
1568  case X86::SUB32ri8:
1569  case X86::SUB32ri: {
1570  if (!MI.getOperand(2).isImm())
1571  return nullptr;
1572  int64_t Imm = MI.getOperand(2).getImm();
1573  if (!isInt<32>(-Imm))
1574  return nullptr;
1575 
1576  assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1577  unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1578 
1579  bool isKill;
1580  MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1581  if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1582  ImplicitOp, LV, LIS))
1583  return nullptr;
1584 
1585  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1586  .add(Dest)
1587  .addReg(SrcReg, getKillRegState(isKill));
1588  if (ImplicitOp.getReg() != 0)
1589  MIB.add(ImplicitOp);
1590 
1591  NewMI = addOffset(MIB, -Imm);
1592 
1593  // Add kills if classifyLEAReg created a new register.
1594  if (LV && SrcReg != Src.getReg())
1595  LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1596  break;
1597  }
1598 
1599  case X86::SUB64ri8:
1600  case X86::SUB64ri32: {
1601  if (!MI.getOperand(2).isImm())
1602  return nullptr;
1603  int64_t Imm = MI.getOperand(2).getImm();
1604  if (!isInt<32>(-Imm))
1605  return nullptr;
1606 
1607  assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1608 
1609  MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
1610  get(X86::LEA64r)).add(Dest).add(Src);
1611  NewMI = addOffset(MIB, -Imm);
1612  break;
1613  }
1614 
1615  case X86::VMOVDQU8Z128rmk:
1616  case X86::VMOVDQU8Z256rmk:
1617  case X86::VMOVDQU8Zrmk:
1618  case X86::VMOVDQU16Z128rmk:
1619  case X86::VMOVDQU16Z256rmk:
1620  case X86::VMOVDQU16Zrmk:
1621  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
1622  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
1623  case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
1624  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
1625  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
1626  case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
1627  case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
1628  case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
1629  case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
1630  case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
1631  case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
1632  case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
1633  case X86::VBROADCASTSDZ256rmk:
1634  case X86::VBROADCASTSDZrmk:
1635  case X86::VBROADCASTSSZ128rmk:
1636  case X86::VBROADCASTSSZ256rmk:
1637  case X86::VBROADCASTSSZrmk:
1638  case X86::VPBROADCASTDZ128rmk:
1639  case X86::VPBROADCASTDZ256rmk:
1640  case X86::VPBROADCASTDZrmk:
1641  case X86::VPBROADCASTQZ128rmk:
1642  case X86::VPBROADCASTQZ256rmk:
1643  case X86::VPBROADCASTQZrmk: {
1644  unsigned Opc;
1645  switch (MIOpc) {
1646  default: llvm_unreachable("Unreachable!");
1647  case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
1648  case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
1649  case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
1650  case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
1651  case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
1652  case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
1653  case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1654  case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1655  case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1656  case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1657  case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1658  case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1659  case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1660  case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1661  case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1662  case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1663  case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1664  case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1665  case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1666  case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1667  case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1668  case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1669  case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1670  case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1671  case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1672  case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1673  case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1674  case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1675  case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1676  case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1677  case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
1678  case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break;
1679  case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
1680  case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
1681  case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break;
1682  case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
1683  case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
1684  case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break;
1685  case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
1686  case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
1687  case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break;
1688  }
1689 
1690  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1691  .add(Dest)
1692  .add(MI.getOperand(2))
1693  .add(Src)
1694  .add(MI.getOperand(3))
1695  .add(MI.getOperand(4))
1696  .add(MI.getOperand(5))
1697  .add(MI.getOperand(6))
1698  .add(MI.getOperand(7));
1699  NumRegOperands = 4;
1700  break;
1701  }
1702 
1703  case X86::VMOVDQU8Z128rrk:
1704  case X86::VMOVDQU8Z256rrk:
1705  case X86::VMOVDQU8Zrrk:
1706  case X86::VMOVDQU16Z128rrk:
1707  case X86::VMOVDQU16Z256rrk:
1708  case X86::VMOVDQU16Zrrk:
1709  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
1710  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
1711  case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
1712  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
1713  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
1714  case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
1715  case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
1716  case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
1717  case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
1718  case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
1719  case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
1720  case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
1721  unsigned Opc;
1722  switch (MIOpc) {
1723  default: llvm_unreachable("Unreachable!");
1724  case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
1725  case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
1726  case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
1727  case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
1728  case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
1729  case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
1730  case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1731  case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1732  case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1733  case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1734  case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1735  case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1736  case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1737  case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1738  case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1739  case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1740  case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1741  case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1742  case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1743  case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1744  case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1745  case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1746  case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1747  case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1748  case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1749  case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1750  case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1751  case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1752  case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1753  case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1754  }
1755 
1756  NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1757  .add(Dest)
1758  .add(MI.getOperand(2))
1759  .add(Src)
1760  .add(MI.getOperand(3));
1761  NumRegOperands = 4;
1762  break;
1763  }
1764  }
1765 
1766  if (!NewMI) return nullptr;
1767 
1768  if (LV) { // Update live variables
1769  for (unsigned I = 0; I < NumRegOperands; ++I) {
1770  MachineOperand &Op = MI.getOperand(I);
1771  if (Op.isReg() && (Op.isDead() || Op.isKill()))
1772  LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
1773  }
1774  }
1775 
1776  MachineBasicBlock &MBB = *MI.getParent();
1777  MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
1778 
1779  if (LIS) {
1780  LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1781  if (SrcReg)
1782  LIS->getInterval(SrcReg);
1783  if (SrcReg2)
1784  LIS->getInterval(SrcReg2);
1785  }
1786 
1787  return NewMI;
1788 }
1789 
1790 /// This determines which of the three possible cases of a three-source commute
1791 /// the source indexes correspond to, taking into account any mask operands.
1792 /// Commuting a passthru operand is never allowed. Returns -1 if the commute
1793 /// isn't possible.
1794 /// Case 0 - Possible to commute the first and second operands.
1795 /// Case 1 - Possible to commute the first and third operands.
1796 /// Case 2 - Possible to commute the second and third operands.
1797 static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
1798  unsigned SrcOpIdx2) {
1799  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
1800  if (SrcOpIdx1 > SrcOpIdx2)
1801  std::swap(SrcOpIdx1, SrcOpIdx2);
1802 
1803  unsigned Op1 = 1, Op2 = 2, Op3 = 3;
1804  if (X86II::isKMasked(TSFlags)) {
1805  Op2++;
1806  Op3++;
1807  }
1808 
1809  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
1810  return 0;
1811  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
1812  return 1;
1813  if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
1814  return 2;
1815  llvm_unreachable("Unknown three src commute case.");
1816 }
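 // E.g. for the merge-masked VFMADD213PSZrk (operands: dst, tied src1, k-mask,
 // src2, src3) the vector sources sit at operand indices 1, 3 and 4, so
 // commuting indices 3 and 4 is Case 2.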
1817 
1818 unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
1819  const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
1820  const X86InstrFMA3Group &FMA3Group) const {
1821 
1822  unsigned Opc = MI.getOpcode();
1823 
1824  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
1825  // analysis. The commute optimization is legal only if all users of FMA*_Int
1826  // use only the lowest element of the FMA*_Int instruction. Such analysis is
1827  // not implemented yet, so just return 0 in that case.
1828  // When such analysis becomes available, this will be the right place to
1829  // call it.
1830  assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
1831  "Intrinsic instructions can't commute operand 1");
1832 
1833  // Determine which case this commute is or if it can't be done.
1834  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1835  SrcOpIdx2);
1836  assert(Case < 3 && "Unexpected case number!");
1837 
1838  // Define the FMA forms mapping array that helps to map input FMA form
1839  // to output FMA form to preserve the operation semantics after
1840  // commuting the operands.
1841  const unsigned Form132Index = 0;
1842  const unsigned Form213Index = 1;
1843  const unsigned Form231Index = 2;
1844  static const unsigned FormMapping[][3] = {
1845  // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
1846  // FMA132 A, C, b; ==> FMA231 C, A, b;
1847  // FMA213 B, A, c; ==> FMA213 A, B, c;
1848  // FMA231 C, A, b; ==> FMA132 A, C, b;
1849  { Form231Index, Form213Index, Form132Index },
1850  // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
1851  // FMA132 A, c, B; ==> FMA132 B, c, A;
1852  // FMA213 B, a, C; ==> FMA231 C, a, B;
1853  // FMA231 C, a, B; ==> FMA213 B, a, C;
1854  { Form132Index, Form231Index, Form213Index },
1855  // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
1856  // FMA132 a, C, B; ==> FMA213 a, B, C;
1857  // FMA213 b, A, C; ==> FMA132 b, C, A;
1858  // FMA231 c, A, B; ==> FMA231 c, B, A;
1859  { Form213Index, Form132Index, Form231Index }
1860  };
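 // E.g. commuting the last two sources (Case 2) of VFMADD213PSZr rewrites it
 // to VFMADD132PSZr: 213 computes dst = src2*dst + src3, and with src2/src3
 // exchanged the 132 form computes dst = dst*src2 + src3, the same value.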
1861 
1862  unsigned FMAForms[3];
1863  FMAForms[0] = FMA3Group.get132Opcode();
1864  FMAForms[1] = FMA3Group.get213Opcode();
1865  FMAForms[2] = FMA3Group.get231Opcode();
1866 
1867  // Everything is ready, just adjust the FMA opcode and return it.
1868  for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
1869  if (Opc == FMAForms[FormIndex])
1870  return FMAForms[FormMapping[Case][FormIndex]];
1871 
1872  llvm_unreachable("Illegal FMA3 format");
1873 }
1874 
1875 static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
1876  unsigned SrcOpIdx2) {
1877  // Determine which case this commute is or if it can't be done.
1878  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1879  SrcOpIdx2);
1880  assert(Case < 3 && "Unexpected case value!");
1881 
1882  // For each case we need to swap two pairs of bits in the final immediate.
1883  static const uint8_t SwapMasks[3][4] = {
1884  { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
1885  { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
1886  { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
1887  };
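 // E.g. for Case 0 (commuting the first two sources), an immediate of 0xCA
 // ("A ? B : C") first has bits 2/3/4/5 cleared (0xCA & ~0x3C = 0xC2), then
 // the set bit 3 is mirrored into bit 5, giving 0xE2, which computes the same
 // function once the two sources are exchanged.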
1888 
1889  uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
1890  // Clear out the bits we are swapping.
1891  uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
1892  SwapMasks[Case][2] | SwapMasks[Case][3]);
1893  // If the immediate had a bit of the pair set, then set the opposite bit.
1894  if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
1895  if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
1896  if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
1897  if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
1898  MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
1899 }
1900 
1901 // Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
1902 // commuted.
1903 static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
1904 #define VPERM_CASES(Suffix) \
1905  case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
1906  case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
1907  case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
1908  case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
1909  case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
1910  case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
1911  case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
1912  case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
1913  case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
1914  case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
1915  case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
1916  case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
1917 
1918 #define VPERM_CASES_BROADCAST(Suffix) \
1919  VPERM_CASES(Suffix) \
1920  case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
1921  case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
1922  case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
1923  case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
1924  case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
1925  case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
1926 
1927  switch (Opcode) {
1928  default: return false;
1929  VPERM_CASES(B)
1930  VPERM_CASES_BROADCAST(D)
1931  VPERM_CASES_BROADCAST(PD)
1932  VPERM_CASES_BROADCAST(PS)
1933  VPERM_CASES_BROADCAST(Q)
1934  VPERM_CASES(W)
1935  return true;
1936  }
1937 #undef VPERM_CASES_BROADCAST
1938 #undef VPERM_CASES
1939 }
1940 
1941 // Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
1942 // from the I opcode to the T opcode and vice versa.
1943 static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
1944 #define VPERM_CASES(Orig, New) \
1945  case X86::Orig##128rr: return X86::New##128rr; \
1946  case X86::Orig##128rrkz: return X86::New##128rrkz; \
1947  case X86::Orig##128rm: return X86::New##128rm; \
1948  case X86::Orig##128rmkz: return X86::New##128rmkz; \
1949  case X86::Orig##256rr: return X86::New##256rr; \
1950  case X86::Orig##256rrkz: return X86::New##256rrkz; \
1951  case X86::Orig##256rm: return X86::New##256rm; \
1952  case X86::Orig##256rmkz: return X86::New##256rmkz; \
1953  case X86::Orig##rr: return X86::New##rr; \
1954  case X86::Orig##rrkz: return X86::New##rrkz; \
1955  case X86::Orig##rm: return X86::New##rm; \
1956  case X86::Orig##rmkz: return X86::New##rmkz;
1957 
1958 #define VPERM_CASES_BROADCAST(Orig, New) \
1959  VPERM_CASES(Orig, New) \
1960  case X86::Orig##128rmb: return X86::New##128rmb; \
1961  case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
1962  case X86::Orig##256rmb: return X86::New##256rmb; \
1963  case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
1964  case X86::Orig##rmb: return X86::New##rmb; \
1965  case X86::Orig##rmbkz: return X86::New##rmbkz;
1966 
1967  switch (Opcode) {
1968  VPERM_CASES(VPERMI2B, VPERMT2B)
1969  VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
1970  VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
1971  VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
1972  VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
1973  VPERM_CASES(VPERMI2W, VPERMT2W)
1974  VPERM_CASES(VPERMT2B, VPERMI2B)
1975  VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
1976  VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
1977  VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
1978  VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
1979  VPERM_CASES(VPERMT2W, VPERMI2W)
1980  }
1981 
1982  llvm_unreachable("Unreachable!");
1983 #undef VPERM_CASES_BROADCAST
1984 #undef VPERM_CASES
1985 }
1986 
1987 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1988  unsigned OpIdx1,
1989  unsigned OpIdx2) const {
1990  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
1991  if (NewMI)
1992  return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
1993  return MI;
1994  };
1995 
1996  switch (MI.getOpcode()) {
1997  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
1998  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
1999  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
2000  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
2001  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
2002  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
2003  unsigned Opc;
2004  unsigned Size;
2005  switch (MI.getOpcode()) {
2006  default: llvm_unreachable("Unreachable!");
2007  case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
2008  case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
2009  case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
2010  case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
2011  case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
2012  case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
2013  }
2014  unsigned Amt = MI.getOperand(3).getImm();
2015  auto &WorkingMI = cloneIfNew(MI);
2016  WorkingMI.setDesc(get(Opc));
2017  WorkingMI.getOperand(3).setImm(Size - Amt);
2018  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2019  OpIdx1, OpIdx2);
2020  }
2021  case X86::PFSUBrr:
2022  case X86::PFSUBRrr: {
2023  // PFSUB x, y: x = x - y
2024  // PFSUBR x, y: x = y - x
2025  unsigned Opc =
2026  (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
2027  auto &WorkingMI = cloneIfNew(MI);
2028  WorkingMI.setDesc(get(Opc));
2029  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2030  OpIdx1, OpIdx2);
2031  }
2032  case X86::BLENDPDrri:
2033  case X86::BLENDPSrri:
2034  case X86::VBLENDPDrri:
2035  case X86::VBLENDPSrri:
2036  // If we're optimizing for size, try to use MOVSD/MOVSS.
2037  if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
2038  unsigned Mask, Opc;
2039  switch (MI.getOpcode()) {
2040  default: llvm_unreachable("Unreachable!");
2041  case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
2042  case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
2043  case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
2044  case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
2045  }
2046  if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
2047  auto &WorkingMI = cloneIfNew(MI);
2048  WorkingMI.setDesc(get(Opc));
2049  WorkingMI.removeOperand(3);
2050  return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
2051  /*NewMI=*/false,
2052  OpIdx1, OpIdx2);
2053  }
2054  }
2055  [[fallthrough]];
2056  case X86::PBLENDWrri:
2057  case X86::VBLENDPDYrri:
2058  case X86::VBLENDPSYrri:
2059  case X86::VPBLENDDrri:
2060  case X86::VPBLENDWrri:
2061  case X86::VPBLENDDYrri:
2062  case X86::VPBLENDWYrri:{
2063  int8_t Mask;
2064  switch (MI.getOpcode()) {
2065  default: llvm_unreachable("Unreachable!");
2066  case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
2067  case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
2068  case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
2069  case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
2070  case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
2071  case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
2072  case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
2073  case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
2074  case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
2075  case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
2076  case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
2077  }
2078  // Only the least significant bits of Imm are used.
2079  // Using int8_t to ensure it will be sign extended to the int64_t that
2080  // setImm takes in order to match isel behavior.
2081  int8_t Imm = MI.getOperand(3).getImm() & Mask;
2082  auto &WorkingMI = cloneIfNew(MI);
2083  WorkingMI.getOperand(3).setImm(Mask ^ Imm);
2084  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2085  OpIdx1, OpIdx2);
2086  }
2087  case X86::INSERTPSrr:
2088  case X86::VINSERTPSrr:
2089  case X86::VINSERTPSZrr: {
2090  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2091  unsigned ZMask = Imm & 15;
2092  unsigned DstIdx = (Imm >> 4) & 3;
2093  unsigned SrcIdx = (Imm >> 6) & 3;
2094 
2095  // We can commute insertps if we zero 2 of the elements, the insertion is
2096  // "inline" and we don't override the insertion with a zero.
2097  if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2098  countPopulation(ZMask) == 2) {
2099  unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
2100  assert(AltIdx < 4 && "Illegal insertion index");
2101  unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2102  auto &WorkingMI = cloneIfNew(MI);
2103  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2104  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2105  OpIdx1, OpIdx2);
2106  }
2107  return nullptr;
2108  }
2109  case X86::MOVSDrr:
2110  case X86::MOVSSrr:
2111  case X86::VMOVSDrr:
2112  case X86::VMOVSSrr:{
2113  // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2114  if (Subtarget.hasSSE41()) {
2115  unsigned Mask, Opc;
2116  switch (MI.getOpcode()) {
2117  default: llvm_unreachable("Unreachable!");
2118  case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
2119  case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
2120  case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
2121  case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
2122  }
2123 
2124  auto &WorkingMI = cloneIfNew(MI);
2125  WorkingMI.setDesc(get(Opc));
2126  WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
2127  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2128  OpIdx1, OpIdx2);
2129  }
2130 
2131  // Convert to SHUFPD.
2132  assert(MI.getOpcode() == X86::MOVSDrr &&
2133  "Can only commute MOVSDrr without SSE4.1");
2134 
2135  auto &WorkingMI = cloneIfNew(MI);
2136  WorkingMI.setDesc(get(X86::SHUFPDrri));
2137  WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
2138  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2139  OpIdx1, OpIdx2);
2140  }
2141  case X86::SHUFPDrri: {
2142  // Commute to MOVSD.
2143  assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2144  auto &WorkingMI = cloneIfNew(MI);
2145  WorkingMI.setDesc(get(X86::MOVSDrr));
2146  WorkingMI.removeOperand(3);
2147  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2148  OpIdx1, OpIdx2);
2149  }
2150  case X86::PCLMULQDQrr:
2151  case X86::VPCLMULQDQrr:
2152  case X86::VPCLMULQDQYrr:
2153  case X86::VPCLMULQDQZrr:
2154  case X86::VPCLMULQDQZ128rr:
2155  case X86::VPCLMULQDQZ256rr: {
2156  // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2157  // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
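 // E.g. imm 0x01 (SRC1 high half, SRC2 low half) becomes 0x10, so the same
 // two 64-bit halves are multiplied once the sources are swapped.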
2158  unsigned Imm = MI.getOperand(3).getImm();
2159  unsigned Src1Hi = Imm & 0x01;
2160  unsigned Src2Hi = Imm & 0x10;
2161  auto &WorkingMI = cloneIfNew(MI);
2162  WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2163  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2164  OpIdx1, OpIdx2);
2165  }
2166  case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
2167  case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
2168  case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
2169  case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
2170  case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
2171  case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
2172  case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
2173  case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
2174  case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
2175  case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
2176  case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
2177  case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
2178  case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
2179  case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
2180  case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
2181  case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
2182  case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
2183  case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
2184  case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
2185  case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
2186  case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
2187  case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
2188  case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
2189  case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
2190  // Flip comparison mode immediate (if necessary).
2191  unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
2192  Imm = X86::getSwappedVPCMPImm(Imm);
2193  auto &WorkingMI = cloneIfNew(MI);
2194  WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
2195  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2196  OpIdx1, OpIdx2);
2197  }
2198  case X86::VPCOMBri: case X86::VPCOMUBri:
2199  case X86::VPCOMDri: case X86::VPCOMUDri:
2200  case X86::VPCOMQri: case X86::VPCOMUQri:
2201  case X86::VPCOMWri: case X86::VPCOMUWri: {
2202  // Flip comparison mode immediate (if necessary).
2203  unsigned Imm = MI.getOperand(3).getImm() & 0x7;
2204  Imm = X86::getSwappedVPCOMImm(Imm);
2205  auto &WorkingMI = cloneIfNew(MI);
2206  WorkingMI.getOperand(3).setImm(Imm);
2207  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2208  OpIdx1, OpIdx2);
2209  }
2210  case X86::VCMPSDZrr:
2211  case X86::VCMPSSZrr:
2212  case X86::VCMPPDZrri:
2213  case X86::VCMPPSZrri:
2214  case X86::VCMPSHZrr:
2215  case X86::VCMPPHZrri:
2216  case X86::VCMPPHZ128rri:
2217  case X86::VCMPPHZ256rri:
2218  case X86::VCMPPDZ128rri:
2219  case X86::VCMPPSZ128rri:
2220  case X86::VCMPPDZ256rri:
2221  case X86::VCMPPSZ256rri:
2222  case X86::VCMPPDZrrik:
2223  case X86::VCMPPSZrrik:
2224  case X86::VCMPPDZ128rrik:
2225  case X86::VCMPPSZ128rrik:
2226  case X86::VCMPPDZ256rrik:
2227  case X86::VCMPPSZ256rrik: {
2228  unsigned Imm =
2229  MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
2230  Imm = X86::getSwappedVCMPImm(Imm);
2231  auto &WorkingMI = cloneIfNew(MI);
2232  WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
2233  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2234  OpIdx1, OpIdx2);
2235  }
2236  case X86::VPERM2F128rr:
2237  case X86::VPERM2I128rr: {
2238  // Flip permute source immediate.
2239  // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2240  // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
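 // E.g. imm 0x20 (lo from Op0.lo, hi from Op1.lo) becomes 0x02, which selects
 // the same physical 128-bit halves once the two operands are swapped.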
2241  int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
2242  auto &WorkingMI = cloneIfNew(MI);
2243  WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
2244  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2245  OpIdx1, OpIdx2);
2246  }
2247  case X86::MOVHLPSrr:
2248  case X86::UNPCKHPDrr:
2249  case X86::VMOVHLPSrr:
2250  case X86::VUNPCKHPDrr:
2251  case X86::VMOVHLPSZrr:
2252  case X86::VUNPCKHPDZ128rr: {
2253  assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2254 
2255  unsigned Opc = MI.getOpcode();
2256  switch (Opc) {
2257  default: llvm_unreachable("Unreachable!");
2258  case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
2259  case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
2260  case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
2261  case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
2262  case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
2263  case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
2264  }
2265  auto &WorkingMI = cloneIfNew(MI);
2266  WorkingMI.setDesc(get(Opc));
2267  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2268  OpIdx1, OpIdx2);
2269  }
2270  case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
2271  auto &WorkingMI = cloneIfNew(MI);
2272  unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2273  X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2274  WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2275  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2276  OpIdx1, OpIdx2);
2277  }
2278  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2279  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2280  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2281  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2282  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2283  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2284  case X86::VPTERNLOGDZrrik:
2285  case X86::VPTERNLOGDZ128rrik:
2286  case X86::VPTERNLOGDZ256rrik:
2287  case X86::VPTERNLOGQZrrik:
2288  case X86::VPTERNLOGQZ128rrik:
2289  case X86::VPTERNLOGQZ256rrik:
2290  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2291  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2292  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2293  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2294  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2295  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2296  case X86::VPTERNLOGDZ128rmbi:
2297  case X86::VPTERNLOGDZ256rmbi:
2298  case X86::VPTERNLOGDZrmbi:
2299  case X86::VPTERNLOGQZ128rmbi:
2300  case X86::VPTERNLOGQZ256rmbi:
2301  case X86::VPTERNLOGQZrmbi:
2302  case X86::VPTERNLOGDZ128rmbikz:
2303  case X86::VPTERNLOGDZ256rmbikz:
2304  case X86::VPTERNLOGDZrmbikz:
2305  case X86::VPTERNLOGQZ128rmbikz:
2306  case X86::VPTERNLOGQZ256rmbikz:
2307  case X86::VPTERNLOGQZrmbikz: {
2308  auto &WorkingMI = cloneIfNew(MI);
2309  commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
2310  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2311  OpIdx1, OpIdx2);
2312  }
2313  default: {
2314  if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
2315  unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
2316  auto &WorkingMI = cloneIfNew(MI);
2317  WorkingMI.setDesc(get(Opc));
2318  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2319  OpIdx1, OpIdx2);
2320  }
2321 
2322  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2323  MI.getDesc().TSFlags);
2324  if (FMA3Group) {
2325  unsigned Opc =
2326  getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
2327  auto &WorkingMI = cloneIfNew(MI);
2328  WorkingMI.setDesc(get(Opc));
2329  return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2330  OpIdx1, OpIdx2);
2331  }
2332 
2333  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2334  }
2335  }
2336 }
2337 
2338 bool
2339 X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2340  unsigned &SrcOpIdx1,
2341  unsigned &SrcOpIdx2,
2342  bool IsIntrinsic) const {
2343  uint64_t TSFlags = MI.getDesc().TSFlags;
2344 
2345  unsigned FirstCommutableVecOp = 1;
2346  unsigned LastCommutableVecOp = 3;
2347  unsigned KMaskOp = -1U;
2348  if (X86II::isKMasked(TSFlags)) {
2349  // For k-zero-masked operations it is Ok to commute the first vector
2350  // operand, unless this is an intrinsic instruction.
2351  // For regular k-masked operations a conservative choice is made, because
2352  // the elements of the first vector operand for which the corresponding bit
2353  // in the k-mask operand is set to 0 are copied to the result of the
2354  // instruction.
2355  // TODO/FIXME: The commute still may be legal if it is known that the
2356  // k-mask operand is set to either all ones or all zeroes.
2357  // It is also Ok to commute the 1st operand if all users of MI use only
2358  // the elements enabled by the k-mask operand. For example,
2359  // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2360  // : v1[i];
2361  // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2362  // // Ok, to commute v1 in FMADD213PSZrk.
2363 
2364  // The k-mask operand has index = 2 for masked and zero-masked operations.
2365  KMaskOp = 2;
2366 
2367  // The operand with index = 1 is used as a source for those elements for
2368  // which the corresponding bit in the k-mask is set to 0.
2369  if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2370  FirstCommutableVecOp = 3;
2371 
2372  LastCommutableVecOp++;
2373  } else if (IsIntrinsic) {
2374  // Commuting the first operand of an intrinsic instruction isn't possible
2375  // unless we can prove that only the lowest element of the result is used.
2376  FirstCommutableVecOp = 2;
2377  }
2378 
2379  if (isMem(MI, LastCommutableVecOp))
2380  LastCommutableVecOp--;
2381 
2382  // Only operands from FirstCommutableVecOp to LastCommutableVecOp are commutable.
2383  // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2384  // that the operand is not specified/fixed.
2385  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2386  (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2387  SrcOpIdx1 == KMaskOp))
2388  return false;
2389  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2390  (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2391  SrcOpIdx2 == KMaskOp))
2392  return false;
2393 
2394  // Look for two different register operands assumed to be commutable
2395  // regardless of the FMA opcode. The FMA opcode is adjusted later.
2396  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2397  SrcOpIdx2 == CommuteAnyOperandIndex) {
2398  unsigned CommutableOpIdx2 = SrcOpIdx2;
2399 
2400  // At least one of the operands to be commuted is not specified and
2401  // this method is free to choose appropriate commutable operands.
2402  if (SrcOpIdx1 == SrcOpIdx2)
2403  // Neither operand is fixed. By default set one of the commutable
2404  // operands to the last register operand of the instruction.
2405  CommutableOpIdx2 = LastCommutableVecOp;
2406  else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2407  // Only one of operands is not fixed.
2408  CommutableOpIdx2 = SrcOpIdx1;
2409 
2410  // CommutableOpIdx2 is well defined now. Let's choose another commutable
2411  // operand and assign its index to CommutableOpIdx1.
2412  Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2413 
2414  unsigned CommutableOpIdx1;
2415  for (CommutableOpIdx1 = LastCommutableVecOp;
2416  CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2417  // Just ignore and skip the k-mask operand.
2418  if (CommutableOpIdx1 == KMaskOp)
2419  continue;
2420 
2421  // The commuted operands must have different registers.
2422  // Otherwise, the commute transformation does not change anything and
2423  // is useless then.
2424  if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2425  break;
2426  }
2427 
2428  // No appropriate commutable operands were found.
2429  if (CommutableOpIdx1 < FirstCommutableVecOp)
2430  return false;
2431 
2432  // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
2433  // to return those values.
2434  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2435  CommutableOpIdx1, CommutableOpIdx2))
2436  return false;
2437  }
2438 
2439  return true;
2440 }
2441 
2442 bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2443  unsigned &SrcOpIdx1,
2444  unsigned &SrcOpIdx2) const {
2445  const MCInstrDesc &Desc = MI.getDesc();
2446  if (!Desc.isCommutable())
2447  return false;
2448 
2449  switch (MI.getOpcode()) {
2450  case X86::CMPSDrr:
2451  case X86::CMPSSrr:
2452  case X86::CMPPDrri:
2453  case X86::CMPPSrri:
2454  case X86::VCMPSDrr:
2455  case X86::VCMPSSrr:
2456  case X86::VCMPPDrri:
2457  case X86::VCMPPSrri:
2458  case X86::VCMPPDYrri:
2459  case X86::VCMPPSYrri:
2460  case X86::VCMPSDZrr:
2461  case X86::VCMPSSZrr:
2462  case X86::VCMPPDZrri:
2463  case X86::VCMPPSZrri:
2464  case X86::VCMPSHZrr:
2465  case X86::VCMPPHZrri:
2466  case X86::VCMPPHZ128rri:
2467  case X86::VCMPPHZ256rri:
2468  case X86::VCMPPDZ128rri:
2469  case X86::VCMPPSZ128rri:
2470  case X86::VCMPPDZ256rri:
2471  case X86::VCMPPSZ256rri:
2472  case X86::VCMPPDZrrik:
2473  case X86::VCMPPSZrrik:
2474  case X86::VCMPPDZ128rrik:
2475  case X86::VCMPPSZ128rrik:
2476  case X86::VCMPPDZ256rrik:
2477  case X86::VCMPPSZ256rrik: {
2478  unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2479 
2480  // Float comparison can be safely commuted for
2481  // Ordered/Unordered/Equal/NotEqual tests
2482  unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2483  switch (Imm) {
2484  default:
2485  // EVEX versions can be commuted.
2486  if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2487  break;
2488  return false;
2489  case 0x00: // EQUAL
2490  case 0x03: // UNORDERED
2491  case 0x04: // NOT EQUAL
2492  case 0x07: // ORDERED
2493  break;
2494  }
2495 
2496  // The indices of the commutable operands are 1 and 2 (or 2 and 3
2497  // when masked).
2498  // Assign them to the returned operand indices here.
2499  return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2500  2 + OpOffset);
2501  }
2502  case X86::MOVSSrr:
2503  // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2504  // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2505  // AVX implies sse4.1.
2506  if (Subtarget.hasSSE41())
2507  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2508  return false;
2509  case X86::SHUFPDrri:
2510  // We can commute this to MOVSD.
2511  if (MI.getOperand(3).getImm() == 0x02)
2512  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2513  return false;
2514  case X86::MOVHLPSrr:
2515  case X86::UNPCKHPDrr:
2516  case X86::VMOVHLPSrr:
2517  case X86::VUNPCKHPDrr:
2518  case X86::VMOVHLPSZrr:
2519  case X86::VUNPCKHPDZ128rr:
2520  if (Subtarget.hasSSE2())
2521  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2522  return false;
2523  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2524  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2525  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2526  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2527  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2528  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2529  case X86::VPTERNLOGDZrrik:
2530  case X86::VPTERNLOGDZ128rrik:
2531  case X86::VPTERNLOGDZ256rrik:
2532  case X86::VPTERNLOGQZrrik:
2533  case X86::VPTERNLOGQZ128rrik:
2534  case X86::VPTERNLOGQZ256rrik:
2535  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2536  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2537  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2538  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2539  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2540  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2541  case X86::VPTERNLOGDZ128rmbi:
2542  case X86::VPTERNLOGDZ256rmbi:
2543  case X86::VPTERNLOGDZrmbi:
2544  case X86::VPTERNLOGQZ128rmbi:
2545  case X86::VPTERNLOGQZ256rmbi:
2546  case X86::VPTERNLOGQZrmbi:
2547  case X86::VPTERNLOGDZ128rmbikz:
2548  case X86::VPTERNLOGDZ256rmbikz:
2549  case X86::VPTERNLOGDZrmbikz:
2550  case X86::VPTERNLOGQZ128rmbikz:
2551  case X86::VPTERNLOGQZ256rmbikz:
2552  case X86::VPTERNLOGQZrmbikz:
2553  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2554  case X86::VPDPWSSDYrr:
2555  case X86::VPDPWSSDrr:
2556  case X86::VPDPWSSDSYrr:
2557  case X86::VPDPWSSDSrr:
2558  case X86::VPDPBSSDSrr:
2559  case X86::VPDPBSSDSYrr:
2560  case X86::VPDPBSSDrr:
2561  case X86::VPDPBSSDYrr:
2562  case X86::VPDPBUUDSrr:
2563  case X86::VPDPBUUDSYrr:
2564  case X86::VPDPBUUDrr:
2565  case X86::VPDPBUUDYrr:
2566  case X86::VPDPWSSDZ128r:
2567  case X86::VPDPWSSDZ128rk:
2568  case X86::VPDPWSSDZ128rkz:
2569  case X86::VPDPWSSDZ256r:
2570  case X86::VPDPWSSDZ256rk:
2571  case X86::VPDPWSSDZ256rkz:
2572  case X86::VPDPWSSDZr:
2573  case X86::VPDPWSSDZrk:
2574  case X86::VPDPWSSDZrkz:
2575  case X86::VPDPWSSDSZ128r:
2576  case X86::VPDPWSSDSZ128rk:
2577  case X86::VPDPWSSDSZ128rkz:
2578  case X86::VPDPWSSDSZ256r:
2579  case X86::VPDPWSSDSZ256rk:
2580  case X86::VPDPWSSDSZ256rkz:
2581  case X86::VPDPWSSDSZr:
2582  case X86::VPDPWSSDSZrk:
2583  case X86::VPDPWSSDSZrkz:
2584  case X86::VPMADD52HUQrr:
2585  case X86::VPMADD52HUQYrr:
2586  case X86::VPMADD52HUQZ128r:
2587  case X86::VPMADD52HUQZ128rk:
2588  case X86::VPMADD52HUQZ128rkz:
2589  case X86::VPMADD52HUQZ256r:
2590  case X86::VPMADD52HUQZ256rk:
2591  case X86::VPMADD52HUQZ256rkz:
2592  case X86::VPMADD52HUQZr:
2593  case X86::VPMADD52HUQZrk:
2594  case X86::VPMADD52HUQZrkz:
2595  case X86::VPMADD52LUQrr:
2596  case X86::VPMADD52LUQYrr:
2597  case X86::VPMADD52LUQZ128r:
2598  case X86::VPMADD52LUQZ128rk:
2599  case X86::VPMADD52LUQZ128rkz:
2600  case X86::VPMADD52LUQZ256r:
2601  case X86::VPMADD52LUQZ256rk:
2602  case X86::VPMADD52LUQZ256rkz:
2603  case X86::VPMADD52LUQZr:
2604  case X86::VPMADD52LUQZrk:
2605  case X86::VPMADD52LUQZrkz:
2606  case X86::VFMADDCPHZr:
2607  case X86::VFMADDCPHZrk:
2608  case X86::VFMADDCPHZrkz:
2609  case X86::VFMADDCPHZ128r:
2610  case X86::VFMADDCPHZ128rk:
2611  case X86::VFMADDCPHZ128rkz:
2612  case X86::VFMADDCPHZ256r:
2613  case X86::VFMADDCPHZ256rk:
2614  case X86::VFMADDCPHZ256rkz:
2615  case X86::VFMADDCSHZr:
2616  case X86::VFMADDCSHZrk:
2617  case X86::VFMADDCSHZrkz: {
2618  unsigned CommutableOpIdx1 = 2;
2619  unsigned CommutableOpIdx2 = 3;
2620  if (X86II::isKMasked(Desc.TSFlags)) {
2621  // Skip the mask register.
2622  ++CommutableOpIdx1;
2623  ++CommutableOpIdx2;
2624  }
2625  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2626  CommutableOpIdx1, CommutableOpIdx2))
2627  return false;
2628  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2629  !MI.getOperand(SrcOpIdx2).isReg())
2630  // No idea.
2631  return false;
2632  return true;
2633  }
2634 
2635  default:
2636  const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2637  MI.getDesc().TSFlags);
2638  if (FMA3Group)
2639  return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
2640  FMA3Group->isIntrinsic());
2641 
2642  // Handle masked instructions, since we need to skip over the mask input
2643  // and the preserved input.
2644  if (X86II::isKMasked(Desc.TSFlags)) {
2645  // First assume that the first input is the mask operand and skip past it.
2646  unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
2647  unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
2648  // Check if the first input is tied. If there isn't one then we only
2649  // need to skip the mask operand which we did above.
2650  if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
2651  MCOI::TIED_TO) != -1)) {
2652  // If this is zero masking instruction with a tied operand, we need to
2653  // move the first index back to the first input since this must
2654  // be a 3 input instruction and we want the first two non-mask inputs.
2655  // Otherwise this is a 2 input instruction with a preserved input and
2656  // mask, so we need to move the indices to skip one more input.
2657  if (X86II::isKMergeMasked(Desc.TSFlags)) {
2658  ++CommutableOpIdx1;
2659  ++CommutableOpIdx2;
2660  } else {
2661  --CommutableOpIdx1;
2662  }
2663  }
2664 
2665  if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2666  CommutableOpIdx1, CommutableOpIdx2))
2667  return false;
2668 
2669  if (!MI.getOperand(SrcOpIdx1).isReg() ||
2670  !MI.getOperand(SrcOpIdx2).isReg())
2671  // No idea.
2672  return false;
2673  return true;
2674  }
2675 
2676  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2677  }
2678  return false;
2679 }
2680 
2681 static bool isConvertibleLEA(MachineInstr *MI) {
2682  unsigned Opcode = MI->getOpcode();
2683  if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
2684  Opcode != X86::LEA64_32r)
2685  return false;
2686 
2687  const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
2688  const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
2689  const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
2690 
2691  if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
2692  Scale.getImm() > 1)
2693  return false;
2694 
2695  return true;
2696 }
2697 
2698 bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
2699  // Currently we're only interested in the following sequence:
2700  // r3 = lea r1, r2
2701  // r5 = add r3, r4
2702  // Both r3 and r4 are killed in the add; we hope the add instruction has the
2703  // operand order
2704  // r5 = add r4, r3
2705  // so that later in X86FixupLEAs the lea instruction can be rewritten as add.
2706  unsigned Opcode = MI.getOpcode();
2707  if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
2708  return false;
2709 
2710  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2711  Register Reg1 = MI.getOperand(1).getReg();
2712  Register Reg2 = MI.getOperand(2).getReg();
2713 
2714  // Check if Reg1 comes from LEA in the same MBB.
2715  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
2716  if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2717  Commute = true;
2718  return true;
2719  }
2720  }
2721 
2722  // Check if Reg2 comes from LEA in the same MBB.
2723  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
2724  if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2725  Commute = false;
2726  return true;
2727  }
2728  }
2729 
2730  return false;
2731 }
2732 
2733 int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
2734  unsigned Opcode = MCID.getOpcode();
2735  if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode)))
2736  return -1;
2737  // Assume that the condition code is always the last use operand.
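 // E.g. CMOV32rr is (dst, src1, src2, cond): one def and three uses, so this
 // returns use index 2, which getCondFromMI below maps to MI operand 3.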
2738  unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
2739  return NumUses - 1;
2740 }
2741 
2742 X86::CondCode X86::getCondFromMI(const MachineInstr &MI) {
2743  const MCInstrDesc &MCID = MI.getDesc();
2744  int CondNo = getCondSrcNoFromDesc(MCID);
2745  if (CondNo < 0)
2746  return X86::COND_INVALID;
2747  CondNo += MCID.getNumDefs();
2748  return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
2749 }
2750 
2751 X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
2752  return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
2753  : X86::COND_INVALID;
2754 }
2755 
2756 X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
2757  return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
2758  : X86::COND_INVALID;
2759 }
2760 
2761 X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
2762  return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
2763  : X86::COND_INVALID;
2764 }
2765 
2766 /// Return the inverse of the specified condition,
2767 /// e.g. turning COND_E to COND_NE.
2768 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
2769  switch (CC) {
2770  default: llvm_unreachable("Illegal condition code!");
2771  case X86::COND_E: return X86::COND_NE;
2772  case X86::COND_NE: return X86::COND_E;
2773  case X86::COND_L: return X86::COND_GE;
2774  case X86::COND_LE: return X86::COND_G;
2775  case X86::COND_G: return X86::COND_LE;
2776  case X86::COND_GE: return X86::COND_L;
2777  case X86::COND_B: return X86::COND_AE;
2778  case X86::COND_BE: return X86::COND_A;
2779  case X86::COND_A: return X86::COND_BE;
2780  case X86::COND_AE: return X86::COND_B;
2781  case X86::COND_S: return X86::COND_NS;
2782  case X86::COND_NS: return X86::COND_S;
2783  case X86::COND_P: return X86::COND_NP;
2784  case X86::COND_NP: return X86::COND_P;
2785  case X86::COND_O: return X86::COND_NO;
2786  case X86::COND_NO: return X86::COND_O;
2787  case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
2788  case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
2789  }
2790 }
2791 
2792 /// Assuming the flags are set by MI(a,b), return the condition code if we
2793 /// modify the instructions such that flags are set by MI(b,a).
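/// For example, testing COND_L after "CMP a, b" is equivalent to testing
/// COND_G after "CMP b, a".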
2794 static X86::CondCode getSwappedCondition(X86::CondCode CC) {
2795  switch (CC) {
2796  default: return X86::COND_INVALID;
2797  case X86::COND_E: return X86::COND_E;
2798  case X86::COND_NE: return X86::COND_NE;
2799  case X86::COND_L: return X86::COND_G;
2800  case X86::COND_LE: return X86::COND_GE;
2801  case X86::COND_G: return X86::COND_L;
2802  case X86::COND_GE: return X86::COND_LE;
2803  case X86::COND_B: return X86::COND_A;
2804  case X86::COND_BE: return X86::COND_AE;
2805  case X86::COND_A: return X86::COND_B;
2806  case X86::COND_AE: return X86::COND_BE;
2807  }
2808 }
2809 
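// Note on the floating-point mappings below: the FP compares (COMIS*/UCOMIS*)
// set CF/ZF as if doing an unsigned compare, and COND_A/COND_AE only hold for
// ordered greater / greater-or-equal results. Ordered less-than style
// predicates are therefore handled by swapping the operands (NeedSwap) and
// using the mirrored condition code.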
2810 std::pair<X86::CondCode, bool>
2811 X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
2812  X86::CondCode CC = X86::COND_INVALID;
2813  bool NeedSwap = false;
2814  switch (Predicate) {
2815  default: break;
2816  // Floating-point Predicates
2817  case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
2818  case CmpInst::FCMP_OLT: NeedSwap = true; [[fallthrough]];
2819  case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
2820  case CmpInst::FCMP_OLE: NeedSwap = true; [[fallthrough]];
2821  case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
2822  case CmpInst::FCMP_UGT: NeedSwap = true; [[fallthrough]];
2823  case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
2824  case CmpInst::FCMP_UGE: NeedSwap = true; [[fallthrough]];
2825  case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
2826  case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
2827  case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
2828  case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
2829  case CmpInst::FCMP_OEQ: [[fallthrough]];
2830  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
2831 
2832  // Integer Predicates
2833  case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
2834  case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
2835  case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
2836  case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
2837  case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
2838  case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
2839  case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
2840  case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
2841  case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
2842  case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
2843  }
2844 
2845  return std::make_pair(CC, NeedSwap);
2846 }
2847 
2848 /// Return a cmov opcode for the given register size in bytes, and operand type.
2849 unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
2850  switch(RegBytes) {
2851  default: llvm_unreachable("Illegal register size!");
2852  case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
2853  case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
2854  case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
2855  }
2856 }
2857 
2858 /// Get the VPCMP immediate for the given condition.
2859 unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
2860  switch (CC) {
2861  default: llvm_unreachable("Unexpected SETCC condition");
2862  case ISD::SETNE: return 4;
2863  case ISD::SETEQ: return 0;
2864  case ISD::SETULT:
2865  case ISD::SETLT: return 1;
2866  case ISD::SETUGT:
2867  case ISD::SETGT: return 6;
2868  case ISD::SETUGE:
2869  case ISD::SETGE: return 5;
2870  case ISD::SETULE:
2871  case ISD::SETLE: return 2;
2872  }
2873 }
2874 
2875 /// Get the VPCMP immediate if the operands are swapped.
2876 unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
2877  switch (Imm) {
2878  default: llvm_unreachable("Unreachable!");
2879  case 0x01: Imm = 0x06; break; // LT -> NLE
2880  case 0x02: Imm = 0x05; break; // LE -> NLT
2881  case 0x05: Imm = 0x02; break; // NLT -> LE
2882  case 0x06: Imm = 0x01; break; // NLE -> LT
2883  case 0x00: // EQ
2884  case 0x03: // FALSE
2885  case 0x04: // NE
2886  case 0x07: // TRUE
2887  break;
2888  }
2889 
2890  return Imm;
2891 }
2892 
2893 /// Get the VPCOM immediate if the operands are swapped.
2894 unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
2895  switch (Imm) {
2896  default: llvm_unreachable("Unreachable!");
2897  case 0x00: Imm = 0x02; break; // LT -> GT
2898  case 0x01: Imm = 0x03; break; // LE -> GE
2899  case 0x02: Imm = 0x00; break; // GT -> LT
2900  case 0x03: Imm = 0x01; break; // GE -> LE
2901  case 0x04: // EQ
2902  case 0x05: // NE
2903  case 0x06: // FALSE
2904  case 0x07: // TRUE
2905  break;
2906  }
2907 
2908  return Imm;
2909 }
2910 
2911 /// Get the VCMP immediate if the operands are swapped.
2912 unsigned X86::getSwappedVCMPImm(unsigned Imm) {
2913  // Only need the lower 2 bits to distinguish.
2914  switch (Imm & 0x3) {
2915  default: llvm_unreachable("Unreachable!");
2916  case 0x00: case 0x03:
2917  // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
2918  break;
2919  case 0x01: case 0x02:
2920  // Need to toggle bits 3:0. Bit 4 stays the same.
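 // E.g. 0x01 (LT_OS) becomes 0x0E (GT_OS) and 0x11 (LT_OQ) becomes 0x1E
 // (GT_OQ).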
2921  Imm ^= 0xf;
2922  break;
2923  }
2924 
2925  return Imm;
2926 }
2927 
2928 /// Return true if Reg is an X87 register.
2929 static bool isX87Reg(unsigned Reg) {
2930  return (Reg == X86::FPCW || Reg == X86::FPSW ||
2931  (Reg >= X86::ST0 && Reg <= X86::ST7));
2932 }
2933 
2934 /// Check whether the given instruction is an X87 instruction.
2935 bool X86::isX87Instruction(MachineInstr &MI) {
2936  for (const MachineOperand &MO : MI.operands()) {
2937  if (!MO.isReg())
2938  continue;
2939  if (isX87Reg(MO.getReg()))
2940  return true;
2941  }
2942  return false;
2943 }
2944 
2945 bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
2946  switch (MI.getOpcode()) {
2947  case X86::TCRETURNdi:
2948  case X86::TCRETURNri:
2949  case X86::TCRETURNmi:
2950  case X86::TCRETURNdi64:
2951  case X86::TCRETURNri64:
2952  case X86::TCRETURNmi64:
2953  return true;
2954  default:
2955  return false;
2956  }
2957 }
2958 
2959 bool X86InstrInfo::canMakeTailCallConditional(
2960  SmallVectorImpl<MachineOperand> &BranchCond,
2961  const MachineInstr &TailCall) const {
2962 
2963  const MachineFunction *MF = TailCall.getMF();
2964 
2965  if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
2966  // The kernel patches thunk calls at runtime; these should never be conditional.
2967  const MachineOperand &Target = TailCall.getOperand(0);
2968  if (Target.isSymbol()) {
2969  StringRef Symbol(Target.getSymbolName());
2970  // This is currently only relevant to the r11/kernel indirect thunk.
2971  if (Symbol.equals("__x86_indirect_thunk_r11"))
2972  return false;
2973  }
2974  }
2975 
2976  if (TailCall.getOpcode() != X86::TCRETURNdi &&
2977  TailCall.getOpcode() != X86::TCRETURNdi64) {
2978  // Only direct calls can be done with a conditional branch.
2979  return false;
2980  }
2981 
2982  if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
2983  // Conditional tail calls confuse the Win64 unwinder.
2984  return false;
2985  }
2986 
2987  assert(BranchCond.size() == 1);
2988  if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
2989  // Can't make a conditional tail call with this condition.
2990  return false;
2991  }
2992 
2993  const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
2994  if (X86FI->getTCReturnAddrDelta() != 0 ||
2995  TailCall.getOperand(1).getImm() != 0) {
2996  // A conditional tail call cannot do any stack adjustment.
2997  return false;
2998  }
2999 
3000  return true;
3001 }
3002 
3003 void X86InstrInfo::replaceBranchWithTailCall(
3004  MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
3005  const MachineInstr &TailCall) const {
3006  assert(canMakeTailCallConditional(BranchCond, TailCall));
3007 
3008  MachineBasicBlock::iterator I = MBB.end();
3009  while (I != MBB.begin()) {
3010  --I;
3011  if (I->isDebugInstr())
3012  continue;
3013  if (!I->isBranch())
3014  assert(0 && "Can't find the branch to replace!");
3015 
3016  X86::CondCode CC = X86::getCondFromBranch(*I);
3017  assert(BranchCond.size() == 1);
3018  if (CC != BranchCond[0].getImm())
3019  continue;
3020 
3021  break;
3022  }
3023 
3024  unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3025  : X86::TCRETURNdi64cc;
3026 
3027  auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3028  MIB->addOperand(TailCall.getOperand(0)); // Destination.
3029  MIB.addImm(0); // Stack offset (not used).
3030  MIB->addOperand(BranchCond[0]); // Condition.
3031  MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3032 
3033  // Add implicit uses and defs of all live regs potentially clobbered by the
3034  // call. This way they still appear live across the call.
3035  LivePhysRegs LiveRegs(getRegisterInfo());
3036  LiveRegs.addLiveOuts(MBB);
3037  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
3038  LiveRegs.stepForward(*MIB, Clobbers);
3039  for (const auto &C : Clobbers) {
3040  MIB.addReg(C.first, RegState::Implicit);
3041  MIB.addReg(C.first, RegState::Implicit | RegState::Define);
3042  }
3043 
3044  I->eraseFromParent();
3045 }
3046 
3047 // Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3048 // not be a fallthrough MBB now due to layout changes). Return nullptr if the
3049 // fallthrough MBB cannot be identified.
3050 static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
3051  MachineBasicBlock *TBB) {
3052  // Look for non-EHPad successors other than TBB. If we find exactly one, it
3053  // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3054  // and fallthrough MBB. If we find more than one, we cannot identify the
3055  // fallthrough MBB and should return nullptr.
3056  MachineBasicBlock *FallthroughBB = nullptr;
3057  for (MachineBasicBlock *Succ : MBB->successors()) {
3058  if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3059  continue;
3060  // Return a nullptr if we found more than one fallthrough successor.
3061  if (FallthroughBB && FallthroughBB != TBB)
3062  return nullptr;
3063  FallthroughBB = Succ;
3064  }
3065  return FallthroughBB;
3066 }
3067 
3068 bool X86InstrInfo::AnalyzeBranchImpl(
3069  MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
3070  SmallVectorImpl<MachineOperand> &Cond,
3071  SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3072 
3073  // Start from the bottom of the block and work up, examining the
3074  // terminator instructions.
3075  MachineBasicBlock::iterator I = MBB.end();
3076  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3077  while (I != MBB.begin()) {
3078  --I;
3079  if (I->isDebugInstr())
3080  continue;
3081 
3082  // Working from the bottom, when we see a non-terminator instruction, we're
3083  // done.
3084  if (!isUnpredicatedTerminator(*I))
3085  break;
3086 
3087  // A terminator that isn't a branch can't easily be handled by this
3088  // analysis.
3089  if (!I->isBranch())
3090  return true;
3091 
3092  // Handle unconditional branches.
3093  if (I->getOpcode() == X86::JMP_1) {
3094  UnCondBrIter = I;
3095 
3096  if (!AllowModify) {
3097  TBB = I->getOperand(0).getMBB();
3098  continue;
3099  }
3100 
3101  // If the block has any instructions after a JMP, delete them.
3102  MBB.erase(std::next(I), MBB.end());
3103 
3104  Cond.clear();
3105  FBB = nullptr;
3106 
3107  // Delete the JMP if it's equivalent to a fall-through.
3108  if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3109  TBB = nullptr;
3110  I->eraseFromParent();
3111  I = MBB.end();
3112  UnCondBrIter = MBB.end();
3113  continue;
3114  }
3115 
3116  // TBB is used to indicate the unconditional destination.
3117  TBB = I->getOperand(0).getMBB();
3118  continue;
3119  }
3120 
3121  // Handle conditional branches.
3122  X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3123  if (BranchCode == X86::COND_INVALID)
3124  return true; // Can't handle indirect branch.
3125 
3126  // In practice we should never have an undef EFLAGS operand; if we do,
3127  // abort here, as we are not prepared to preserve the flag.
3128  if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
3129  return true;
3130 
3131  // Working from the bottom, handle the first conditional branch.
3132  if (Cond.empty()) {
3133  FBB = TBB;
3134  TBB = I->getOperand(0).getMBB();
3135  Cond.push_back(MachineOperand::CreateImm(BranchCode));
3136  CondBranches.push_back(&*I);
3137  continue;
3138  }
3139 
3140  // Handle subsequent conditional branches. Only handle the case where all
3141  // conditional branches branch to the same destination and their condition
3142  // opcodes fit one of the special multi-branch idioms.
3143  assert(Cond.size() == 1);
3144  assert(TBB);
3145 
3146  // If the conditions are the same, we can leave them alone.
3147  X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3148  auto NewTBB = I->getOperand(0).getMBB();
3149  if (OldBranchCode == BranchCode && TBB == NewTBB)
3150  continue;
3151 
3152  // If they differ, see if they fit one of the known patterns. Theoretically,
3153  // we could handle more patterns here, but we shouldn't expect to see them
3154  // if instruction selection has done a reasonable job.
3155  if (TBB == NewTBB &&
3156  ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3157  (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3158  BranchCode = X86::COND_NE_OR_P;
3159  } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3160  (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3161  if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3162  return true;
3163 
3164  // X86::COND_E_AND_NP usually has two different branch destinations.
3165  //
3166  // JP B1
3167  // JE B2
3168  // JMP B1
3169  // B1:
3170  // B2:
3171  //
3172  // Here this condition branches to B2 only if NP && E. It has another
3173  // equivalent form:
3174  //
3175  // JNE B1
3176  // JNP B2
3177  // JMP B1
3178  // B1:
3179  // B2:
3180  //
3181  // Similarly it branches to B2 only if E && NP. That is why this condition
3182  // is named with COND_E_AND_NP.
3183  BranchCode = X86::COND_E_AND_NP;
3184  } else
3185  return true;
3186 
3187  // Update the MachineOperand.
3188  Cond[0].setImm(BranchCode);
3189  CondBranches.push_back(&*I);
3190  }
3191 
3192  return false;
3193 }
3194 
3195 bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
3196  MachineBasicBlock *&TBB,
3197  MachineBasicBlock *&FBB,
3198  SmallVectorImpl<MachineOperand> &Cond,
3199  bool AllowModify) const {
3200  SmallVector<MachineInstr *, 4> CondBranches;
3201  return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3202 }
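
// --- Illustrative sketch (not part of the original file) --------------------
// A hedged example of how a client pass typically consumes analyzeBranch: a
// false return means the terminators were understood; Cond is empty for an
// unconditional branch or fall-through, and otherwise holds a single
// X86::CondCode immediate. The helper name below is hypothetical.
LLVM_ATTRIBUTE_UNUSED static bool
hasAnalyzableTerminator(const X86InstrInfo &TII, MachineBasicBlock &MBB) {
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  return !TII.analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false);
}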
3203 
3204 bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
3205  MachineBranchPredicate &MBP,
3206  bool AllowModify) const {
3207  using namespace std::placeholders;
3208 
3209  SmallVector<MachineOperand, 4> Cond;
3210  SmallVector<MachineInstr *, 4> CondBranches;
3211  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
3212  AllowModify))
3213  return true;
3214 
3215  if (Cond.size() != 1)
3216  return true;
3217 
3218  assert(MBP.TrueDest && "expected!");
3219 
3220  if (!MBP.FalseDest)
3221  MBP.FalseDest = MBB.getNextNode();
3222 
3223  const TargetRegisterInfo *TRI = &getRegisterInfo();
3224 
3225  MachineInstr *ConditionDef = nullptr;
3226  bool SingleUseCondition = true;
3227 
3228  for (MachineInstr &MI : llvm::drop_begin(llvm::reverse(MBB))) {
3229  if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
3230  ConditionDef = &MI;
3231  break;
3232  }
3233 
3234  if (MI.readsRegister(X86::EFLAGS, TRI))
3235  SingleUseCondition = false;
3236  }
3237 
3238  if (!ConditionDef)
3239  return true;
3240 
3241  if (SingleUseCondition) {
3242  for (auto *Succ : MBB.successors())
3243  if (Succ->isLiveIn(X86::EFLAGS))
3244  SingleUseCondition = false;
3245  }
3246 
3247  MBP.ConditionDef = ConditionDef;
3248  MBP.SingleUseCondition = SingleUseCondition;
3249 
3250  // Currently we only recognize the simple pattern:
3251  //
3252  // test %reg, %reg
3253  // je %label
3254  //
3255  const unsigned TestOpcode =
3256  Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
3257 
3258  if (ConditionDef->getOpcode() == TestOpcode &&
3259  ConditionDef->getNumOperands() == 3 &&
3260  ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
3261  (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
3262  MBP.LHS = ConditionDef->getOperand(0);
3263  MBP.RHS = MachineOperand::CreateImm(0);
3264  MBP.Predicate = Cond[0].getImm() == X86::COND_NE
3265  ? MachineBranchPredicate::PRED_NE
3266  : MachineBranchPredicate::PRED_EQ;
3267  return false;
3268  }
3269 
3270  return true;
3271 }
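
// --- Illustrative sketch (not part of the original file) --------------------
// Hedged example of consuming analyzeBranchPredicate: it only succeeds for the
// `test %reg, %reg / jcc` shape matched above, so on success RHS is the
// immediate 0. The helper name is hypothetical.
LLVM_ATTRIBUTE_UNUSED static bool
exampleHasRegEqZeroBranch(const X86InstrInfo &TII, MachineBasicBlock &MBB) {
  TargetInstrInfo::MachineBranchPredicate MBP;
  return !TII.analyzeBranchPredicate(MBB, MBP, /*AllowModify=*/false) &&
         MBP.RHS.isImm() && MBP.RHS.getImm() == 0;
}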
3272 
3273 unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
3274  int *BytesRemoved) const {
3275  assert(!BytesRemoved && "code size not handled");
3276 
3277  MachineBasicBlock::iterator I = MBB.end();
3278  unsigned Count = 0;
3279 
3280  while (I != MBB.begin()) {
3281  --I;
3282  if (I->isDebugInstr())
3283  continue;
3284  if (I->getOpcode() != X86::JMP_1 &&
3285  X86::getCondFromBranch(*I) == X86::COND_INVALID)
3286  break;
3287  // Remove the branch.
3288  I->eraseFromParent();
3289  I = MBB.end();
3290  ++Count;
3291  }
3292 
3293  return Count;
3294 }
3295 
3296 unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
3297  MachineBasicBlock *TBB,
3298  MachineBasicBlock *FBB,
3299  ArrayRef<MachineOperand> Cond,
3300  const DebugLoc &DL,
3301  int *BytesAdded) const {
3302  // Shouldn't be a fall through.
3303  assert(TBB && "insertBranch must not be told to insert a fallthrough");
3304  assert((Cond.size() == 1 || Cond.size() == 0) &&
3305  "X86 branch conditions have one component!");
3306  assert(!BytesAdded && "code size not handled");
3307 
3308  if (Cond.empty()) {
3309  // Unconditional branch?
3310  assert(!FBB && "Unconditional branch with multiple successors!");
3311  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
3312  return 1;
3313  }
3314 
3315  // If FBB is null, it is implied to be a fall-through block.
3316  bool FallThru = FBB == nullptr;
3317 
3318  // Conditional branch.
3319  unsigned Count = 0;
3320  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
3321  switch (CC) {
3322  case X86::COND_NE_OR_P:
3323  // Synthesize NE_OR_P with two branches.
3324  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
3325  ++Count;
3326  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
3327  ++Count;
3328  break;
3329  case X86::COND_E_AND_NP:
3330  // Use the next block of MBB as FBB if it is null.
3331  if (FBB == nullptr) {
3332  FBB = getFallThroughMBB(&MBB, TBB);
3333  assert(FBB && "MBB cannot be the last block in function when the false "
3334  "body is a fall-through.");
3335  }
3336  // Synthesize COND_E_AND_NP with two branches.
3337  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
3338  ++Count;
3339  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
3340  ++Count;
3341  break;
3342  default: {
3343  BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
3344  ++Count;
3345  }
3346  }
3347  if (!FallThru) {
3348  // Two-way Conditional branch. Insert the second branch.
3349  BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
3350  ++Count;
3351  }
3352  return Count;
3353 }
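
// --- Illustrative sketch (not part of the original file) --------------------
// Hedged example of the removeBranch/insertBranch contract: re-emit a two-way
// conditional branch with the condition inverted and the destinations swapped,
// which preserves the control flow. The helper name is hypothetical; composite
// conditions are skipped to keep the sketch simple.
LLVM_ATTRIBUTE_UNUSED static bool
invertConditionalBranch(const X86InstrInfo &TII, MachineBasicBlock &MBB) {
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (TII.analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false) ||
      Cond.size() != 1 || !TBB || !FBB ||
      Cond[0].getImm() > X86::LAST_VALID_COND)
    return false;
  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
  Cond[0].setImm(X86::GetOppositeBranchCondition(CC));
  TII.removeBranch(MBB);
  TII.insertBranch(MBB, FBB, TBB, Cond, DebugLoc());
  return true;
}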
3354 
3355 bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3356  ArrayRef<MachineOperand> Cond,
3357  Register DstReg, Register TrueReg,
3358  Register FalseReg, int &CondCycles,
3359  int &TrueCycles, int &FalseCycles) const {
3360  // Not all subtargets have cmov instructions.
3361  if (!Subtarget.canUseCMOV())
3362  return false;
3363  if (Cond.size() != 1)
3364  return false;
3365  // We cannot do the composite conditions, at least not in SSA form.
3366  if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
3367  return false;
3368 
3369  // Check register classes.
3370  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3371  const TargetRegisterClass *RC =
3372  RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
3373  if (!RC)
3374  return false;
3375 
3376  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
3377  if (X86::GR16RegClass.hasSubClassEq(RC) ||
3378  X86::GR32RegClass.hasSubClassEq(RC) ||
3379  X86::GR64RegClass.hasSubClassEq(RC)) {
3380  // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
3381  // Bridge. Probably Ivy Bridge as well.
3382  CondCycles = 2;
3383  TrueCycles = 2;
3384  FalseCycles = 2;
3385  return true;
3386  }
3387 
3388  // Can't do vectors.
3389  return false;
3390 }
3391 
3392 void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
3393  MachineBasicBlock::iterator I,
3394  const DebugLoc &DL, Register DstReg,
3395  ArrayRef<MachineOperand> Cond, Register TrueReg,
3396  Register FalseReg) const {
3397  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3398  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
3399  const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
3400  assert(Cond.size() == 1 && "Invalid Cond array");
3401  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
3402  false /*HasMemoryOperand*/);
3403  BuildMI(MBB, I, DL, get(Opc), DstReg)
3404  .addReg(FalseReg)
3405  .addReg(TrueReg)
3406  .addImm(Cond[0].getImm());
3407 }
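
// --- Illustrative sketch (not part of the original file) --------------------
// The opcode selection used above, shown in isolation: a 4-byte GPR select
// with no folded memory operand maps to the register-register form CMOV32rr.
LLVM_ATTRIBUTE_UNUSED static unsigned exampleCMovOpcodeForGR32() {
  unsigned Opc = X86::getCMovOpcode(/*RegBytes=*/4, /*HasMemoryOperand=*/false);
  assert(Opc == X86::CMOV32rr && "expected the register-register CMOV form");
  return Opc;
}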
3408 
3409 /// Test if the given register is a physical h register.
3410 static bool isHReg(unsigned Reg) {
3411  return X86::GR8_ABCD_HRegClass.contains(Reg);
3412 }
3413 
3414 // Try and copy between VR128/VR64 and GR64 registers.
3415 static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
3416  const X86Subtarget &Subtarget) {
3417  bool HasAVX = Subtarget.hasAVX();
3418  bool HasAVX512 = Subtarget.hasAVX512();
3419 
3420  // SrcReg(MaskReg) -> DestReg(GR64)
3421  // SrcReg(MaskReg) -> DestReg(GR32)
3422 
3423  // All KMASK RegClasses hold the same k registers, so we can test against any one of them.
3424  if (X86::VK16RegClass.contains(SrcReg)) {
3425  if (X86::GR64RegClass.contains(DestReg)) {
3426  assert(Subtarget.hasBWI());
3427  return X86::KMOVQrk;
3428  }
3429  if (X86::GR32RegClass.contains(DestReg))
3430  return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
3431  }
3432 
3433  // SrcReg(GR64) -> DestReg(MaskReg)
3434  // SrcReg(GR32) -> DestReg(MaskReg)
3435 
3436  // All KMASK RegClasses hold the same k registers, so we can test against any one of them.
3437  if (X86::VK16RegClass.contains(DestReg)) {
3438  if (X86::GR64RegClass.contains(SrcReg)) {
3439  assert(Subtarget.hasBWI());
3440  return X86::KMOVQkr;
3441  }
3442  if (X86::GR32RegClass.contains(SrcReg))
3443  return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
3444  }
3445 
3446 
3447  // SrcReg(VR128) -> DestReg(GR64)
3448  // SrcReg(VR64) -> DestReg(GR64)
3449  // SrcReg(GR64) -> DestReg(VR128)
3450  // SrcReg(GR64) -> DestReg(VR64)
3451 
3452  if (X86::GR64RegClass.contains(DestReg)) {
3453  if (X86::VR128XRegClass.contains(SrcReg))
3454  // Copy from a VR128 register to a GR64 register.
3455  return HasAVX512 ? X86::VMOVPQIto64Zrr :
3456  HasAVX ? X86::VMOVPQIto64rr :
3457  X86::MOVPQIto64rr;
3458  if (X86::VR64RegClass.contains(SrcReg))
3459  // Copy from a VR64 register to a GR64 register.
3460  return X86::MMX_MOVD64from64rr;
3461  } else if (X86::GR64RegClass.contains(SrcReg)) {
3462  // Copy from a GR64 register to a VR128 register.
3463  if (X86::VR128XRegClass.contains(DestReg))
3464  return HasAVX512 ? X86::VMOV64toPQIZrr :
3465  HasAVX ? X86::VMOV64toPQIrr :
3466  X86::MOV64toPQIrr;
3467  // Copy from a GR64 register to a VR64 register.
3468  if (X86::VR64RegClass.contains(DestReg))
3469  return X86::MMX_MOVD64to64rr;
3470  }
3471 
3472  // SrcReg(VR128) -> DestReg(GR32)
3473  // SrcReg(GR32) -> DestReg(VR128)
3474 
3475  if (X86::GR32RegClass.contains(DestReg) &&
3476  X86::VR128XRegClass.contains(SrcReg))
3477  // Copy from a VR128 register to a GR32 register.
3478  return HasAVX512 ? X86::VMOVPDI2DIZrr :
3479  HasAVX ? X86::VMOVPDI2DIrr :
3480  X86::MOVPDI2DIrr;
3481 
3482  if (X86::VR128XRegClass.contains(DestReg) &&
3483  X86::GR32RegClass.contains(SrcReg))
3484  // Copy from a GR32 register to a VR128 register.
3485  return HasAVX512 ? X86::VMOVDI2PDIZrr :
3486  HasAVX ? X86::VMOVDI2PDIrr :
3487  X86::MOVDI2PDIrr;
3488  return 0;
3489 }
3490 
3491 void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
3492  MachineBasicBlock::iterator MI,
3493  const DebugLoc &DL, MCRegister DestReg,
3494  MCRegister SrcReg, bool KillSrc) const {
3495  // First deal with the normal symmetric copies.
3496  bool HasAVX = Subtarget.hasAVX();
3497  bool HasVLX = Subtarget.hasVLX();
3498  unsigned Opc = 0;
3499  if (X86::GR64RegClass.contains(DestReg, SrcReg))
3500  Opc = X86::MOV64rr;
3501  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
3502  Opc = X86::MOV32rr;
3503  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
3504  Opc = X86::MOV16rr;
3505  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
3506  // Copying to or from a physical H register on x86-64 requires a NOREX
3507  // move. Otherwise use a normal move.
3508  if ((isHReg(DestReg) || isHReg(SrcReg)) &&
3509  Subtarget.is64Bit()) {
3510  Opc = X86::MOV8rr_NOREX;
3511  // Both operands must be encodable without a REX prefix.
3512  assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
3513  "8-bit H register can not be copied outside GR8_NOREX");
3514  } else
3515  Opc = X86::MOV8rr;
3516  }
3517  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
3518  Opc = X86::MMX_MOVQ64rr;
3519  else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
3520  if (HasVLX)
3521  Opc = X86::VMOVAPSZ128rr;
3522  else if (X86::VR128RegClass.contains(DestReg, SrcReg))
3523  Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
3524  else {
3525  // If this is an extended register and we don't have VLX we need to use a
3526  // 512-bit move.
3527  Opc = X86::VMOVAPSZrr;
3528  const TargetRegisterInfo *TRI = &getRegisterInfo();
3529  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
3530  &X86::VR512RegClass);
3531  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
3532  &X86::VR512RegClass);
3533  }
3534  } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
3535  if (HasVLX)
3536  Opc = X86::VMOVAPSZ256rr;
3537  else if (X86::VR256RegClass.contains(DestReg, SrcReg))
3538  Opc = X86::VMOVAPSYrr;
3539  else {
3540  // If this is an extended register and we don't have VLX we need to use a
3541  // 512-bit move.
3542  Opc = X86::VMOVAPSZrr;
3543  const TargetRegisterInfo *TRI = &getRegisterInfo();
3544  DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
3545  &X86::VR512RegClass);
3546  SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
3547  &X86::VR512RegClass);
3548  }
3549  } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
3550  Opc = X86::VMOVAPSZrr;
3551  // All KMASK RegClasses hold the same k registers, so we can test against any one of them.
3552  else if (X86::VK16RegClass.contains(DestReg, SrcReg))
3553  Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
3554  if (!Opc)
3555  Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
3556 
3557  if (Opc) {
3558  BuildMI(MBB, MI, DL, get(Opc), DestReg)
3559  .addReg(SrcReg, getKillRegState(KillSrc));
3560  return;
3561  }
3562 
3563  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
3564  // FIXME: We use a fatal error here because historically LLVM has tried
3565  // lower some of these physreg copies and we want to ensure we get
3566  // reasonable bug reports if someone encounters a case no other testing
3567  // found. This path should be removed after the LLVM 7 release.
3568  report_fatal_error("Unable to copy EFLAGS physical register!");
3569  }
3570 
3571  LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
3572  << RI.getName(DestReg) << '\n');
3573  report_fatal_error("Cannot emit physreg copy instruction");
3574 }
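
// --- Illustrative sketch (not part of the original file) --------------------
// Typical client of copyPhysReg: emit an EAX -> ECX copy before MI. With the
// tables above this lowers to a single MOV32rr. The helper name is
// hypothetical.
LLVM_ATTRIBUTE_UNUSED static void
emitExampleGR32Copy(const X86InstrInfo &TII, MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MI) {
  TII.copyPhysReg(MBB, MI, DebugLoc(), X86::ECX, X86::EAX, /*KillSrc=*/true);
}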
3575 
3576 Optional<DestSourcePair>
3577 X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
3578  if (MI.isMoveReg())
3579  return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
3580  return std::nullopt;
3581 }
3582 
3583 static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
3584  if (STI.hasFP16())
3585  return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
3586  if (Load)
3587  return STI.hasAVX512() ? X86::VMOVSSZrm
3588  : STI.hasAVX() ? X86::VMOVSSrm
3589  : X86::MOVSSrm;
3590  else
3591  return STI.hasAVX512() ? X86::VMOVSSZmr
3592  : STI.hasAVX() ? X86::VMOVSSmr
3593  : X86::MOVSSmr;
3594 }
3595 
3596 static unsigned getLoadStoreRegOpcode(Register Reg,
3597  const TargetRegisterClass *RC,
3598  bool IsStackAligned,
3599  const X86Subtarget &STI, bool Load) {
3600  bool HasAVX = STI.hasAVX();
3601  bool HasAVX512 = STI.hasAVX512();
3602  bool HasVLX = STI.hasVLX();
3603 
3604  switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
3605  default:
3606  llvm_unreachable("Unknown spill size");
3607  case 1:
3608  assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
3609  if (STI.is64Bit())
3610  // Copying to or from a physical H register on x86-64 requires a NOREX
3611  // move. Otherwise use a normal move.
3612  if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
3613  return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
3614  return Load ? X86::MOV8rm : X86::MOV8mr;
3615  case 2:
3616  if (X86::VK16RegClass.hasSubClassEq(RC))
3617  return Load ? X86::KMOVWkm : X86::KMOVWmk;
3618  assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
3619  return Load ? X86::MOV16rm : X86::MOV16mr;
3620  case 4:
3621  if (X86::GR32RegClass.hasSubClassEq(RC))
3622  return Load ? X86::MOV32rm : X86::MOV32mr;
3623  if (X86::FR32XRegClass.hasSubClassEq(RC))
3624  return Load ?
3625  (HasAVX512 ? X86::VMOVSSZrm_alt :
3626  HasAVX ? X86::VMOVSSrm_alt :
3627  X86::MOVSSrm_alt) :
3628  (HasAVX512 ? X86::VMOVSSZmr :
3629  HasAVX ? X86::VMOVSSmr :
3630  X86::MOVSSmr);
3631  if (X86::RFP32RegClass.hasSubClassEq(RC))
3632  return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
3633  if (X86::VK32RegClass.hasSubClassEq(RC)) {
3634  assert(STI.hasBWI() && "KMOVD requires BWI");
3635  return Load ? X86::KMOVDkm : X86::KMOVDmk;
3636  }
3637  // All of these mask pair classes have the same spill size, so the same kind
3638  // of kmov instructions can be used with all of them.
3639  if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
3640  X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
3641  X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
3642  X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
3643  X86::VK16PAIRRegClass.hasSubClassEq(RC))
3644  return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
3645  if (X86::FR16RegClass.hasSubClassEq(RC) ||
3646  X86::FR16XRegClass.hasSubClassEq(RC))
3647  return getLoadStoreOpcodeForFP16(Load, STI);
3648  llvm_unreachable("Unknown 4-byte regclass");
3649  case 8:
3650  if (X86::GR64RegClass.hasSubClassEq(RC))
3651  return Load ? X86::MOV64rm : X86::MOV64mr;
3652  if (X86::FR64XRegClass.hasSubClassEq(RC))
3653  return Load ?
3654  (HasAVX512 ? X86::VMOVSDZrm_alt :
3655  HasAVX ? X86::VMOVSDrm_alt :
3656  X86::MOVSDrm_alt) :
3657  (HasAVX512 ? X86::VMOVSDZmr :
3658  HasAVX ? X86::VMOVSDmr :
3659  X86::MOVSDmr);
3660  if (X86::VR64RegClass.hasSubClassEq(RC))
3661  return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
3662  if (X86::RFP64RegClass.hasSubClassEq(RC))
3663  return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
3664  if (X86::VK64RegClass.hasSubClassEq(RC)) {
3665  assert(STI.hasBWI() && "KMOVQ requires BWI");
3666  return Load ? X86::KMOVQkm : X86::KMOVQmk;
3667  }
3668  llvm_unreachable("Unknown 8-byte regclass");
3669  case 10:
3670  assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
3671  return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
3672  case 16: {
3673  if (X86::VR128XRegClass.hasSubClassEq(RC)) {
3674  // If stack is realigned we can use aligned stores.
3675  if (IsStackAligned)
3676  return Load ?
3677  (HasVLX ? X86::VMOVAPSZ128rm :
3678  HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
3679  HasAVX ? X86::VMOVAPSrm :
3680  X86::MOVAPSrm):
3681  (HasVLX ? X86::VMOVAPSZ128mr :
3682  HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
3683  HasAVX ? X86::VMOVAPSmr :
3684  X86::MOVAPSmr);
3685  else
3686  return Load ?
3687  (HasVLX ? X86::VMOVUPSZ128rm :
3688  HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
3689  HasAVX ? X86::VMOVUPSrm :
3690  X86::MOVUPSrm):
3691  (HasVLX ? X86::VMOVUPSZ128mr :
3692  HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
3693  HasAVX ? X86::VMOVUPSmr :
3694  X86::MOVUPSmr);
3695  }
3696  llvm_unreachable("Unknown 16-byte regclass");
3697  }
3698  case 32:
3699  assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
3700  // If stack is realigned we can use aligned stores.
3701  if (IsStackAligned)
3702  return Load ?
3703  (HasVLX ? X86::VMOVAPSZ256rm :
3704  HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
3705  X86::VMOVAPSYrm) :
3706  (HasVLX ? X86::VMOVAPSZ256mr :
3707  HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
3708  X86::VMOVAPSYmr);
3709  else
3710  return Load ?
3711  (HasVLX ? X86::VMOVUPSZ256rm :
3712  HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
3713  X86::VMOVUPSYrm) :
3714  (HasVLX ? X86::VMOVUPSZ256mr :
3715  HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
3716  X86::VMOVUPSYmr);
3717  case 64:
3718  assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
3719  assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
3720  if (IsStackAligned)
3721  return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
3722  else
3723  return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
3724  case 1024:
3725  assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
3726  assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
3727  return Load ? X86::TILELOADD : X86::TILESTORED;
3728  }
3729 }
3730 
3731 Optional<ExtAddrMode>
3732 X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3733  const TargetRegisterInfo *TRI) const {
3734  const MCInstrDesc &Desc = MemI.getDesc();
3735  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3736  if (MemRefBegin < 0)
3737  return std::nullopt;
3738 
3739  MemRefBegin += X86II::getOperandBias(Desc);
3740 
3741  auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
3742  if (!BaseOp.isReg()) // Can be an MO_FrameIndex
3743  return std::nullopt;
3744 
3745  const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
3746  // Displacement can be symbolic
3747  if (!DispMO.isImm())
3748  return std::nullopt;
3749 
3750  ExtAddrMode AM;
3751  AM.BaseReg = BaseOp.getReg();
3752  AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
3753  AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
3754  AM.Displacement = DispMO.getImm();
3755  return AM;
3756 }
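
// --- Illustrative sketch (not part of the original file) --------------------
// The ExtAddrMode filled in above describes the usual x86 address
// BaseReg + Scale * ScaledReg + Displacement. A hypothetical helper that folds
// a known index value back into a flat byte offset:
LLVM_ATTRIBUTE_UNUSED static int64_t
exampleEffectiveOffset(const ExtAddrMode &AM, int64_t IndexValue) {
  return AM.Displacement + AM.Scale * IndexValue;
}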
3757 
3758 bool X86InstrInfo::verifyInstruction(const MachineInstr &MI,
3759  StringRef &ErrInfo) const {
3760  Optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
3761  if (!AMOrNone)
3762  return true;
3763 
3764  ExtAddrMode AM = *AMOrNone;
3765 
3766  if (AM.ScaledReg != X86::NoRegister) {
3767  switch (AM.Scale) {
3768  case 1:
3769  case 2:
3770  case 4:
3771  case 8:
3772  break;
3773  default:
3774  ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
3775  return false;
3776  }
3777  }
3778  if (!isInt<32>(AM.Displacement)) {
3779  ErrInfo = "Displacement in address must fit into 32-bit signed "
3780  "integer";
3781  return false;
3782  }
3783 
3784  return true;
3785 }
3786 
3787 bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
3788  const Register Reg,
3789  int64_t &ImmVal) const {
3790  if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
3791  return false;
3792  // Mov Src can be a global address.
3793  if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
3794  return false;
3795  ImmVal = MI.getOperand(1).getImm();
3796  return true;
3797 }
3798 
3799 bool X86InstrInfo::preservesZeroValueInReg(
3800  const MachineInstr *MI, const Register NullValueReg,
3801  const TargetRegisterInfo *TRI) const {
3802  if (!MI->modifiesRegister(NullValueReg, TRI))
3803  return true;
3804  switch (MI->getOpcode()) {
3805  // Shifting a null register right/left by an immediate leaves it null, i.e.
3806  // rax = shl rax, X.
3807  case X86::SHR64ri:
3808  case X86::SHR32ri:
3809  case X86::SHL64ri:
3810  case X86::SHL32ri:
3811  assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
3812  "expected for shift opcode!");
3813  return MI->getOperand(0).getReg() == NullValueReg &&
3814  MI->getOperand(1).getReg() == NullValueReg;
3815  // Zero extend of a sub-reg of NullValueReg into itself does not change the
3816  // null value.
3817  case X86::MOV32rr:
3818  return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
3819  return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
3820  });
3821  default:
3822  return false;
3823  }
3824  llvm_unreachable("Should be handled above!");
3825 }
3826 
3827 bool X86InstrInfo::getMemOperandsWithOffsetWidth(
3828  const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
3829  int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
3830  const TargetRegisterInfo *TRI) const {
3831  const MCInstrDesc &Desc = MemOp.getDesc();
3832  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3833  if (MemRefBegin < 0)
3834  return false;
3835 
3836  MemRefBegin += X86II::getOperandBias(Desc);
3837 
3838  const MachineOperand *BaseOp =
3839  &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
3840  if (!BaseOp->isReg()) // Can be an MO_FrameIndex
3841  return false;
3842 
3843  if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
3844  return false;
3845 
3846  if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
3847  X86::NoRegister)
3848  return false;
3849 
3850  const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
3851 
3852  // Displacement can be symbolic
3853  if (!DispMO.isImm())
3854  return false;
3855 
3856  Offset = DispMO.getImm();
3857 
3858  if (!BaseOp->isReg())
3859  return false;
3860 
3861  OffsetIsScalable = false;
3862  // FIXME: Relying on memoperands() may not be the right thing to do here. Check
3863  // with X86 maintainers, and fix it accordingly. For now, it is ok, since
3864  // there is no use of `Width` for X86 back-end at the moment.
3865  Width =
3866  !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
3867  BaseOps.push_back(BaseOp);
3868  return true;
3869 }
3870 
3871 static unsigned getStoreRegOpcode(Register SrcReg,
3872  const TargetRegisterClass *RC,
3873  bool IsStackAligned,
3874  const X86Subtarget &STI) {
3875  return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
3876 }
3877 
3878 static unsigned getLoadRegOpcode(Register DestReg,
3879  const TargetRegisterClass *RC,
3880  bool IsStackAligned, const X86Subtarget &STI) {
3881  return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
3882 }
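
// --- Illustrative sketch (not part of the original file) --------------------
// Example of the opcode selection above: spilling a GR32 register to an
// aligned slot uses MOV32mr and the matching reload uses MOV32rm. The helper
// name is hypothetical.
LLVM_ATTRIBUTE_UNUSED static bool
exampleGR32SpillOpcodes(const X86Subtarget &STI) {
  unsigned StoreOpc = getStoreRegOpcode(X86::EAX, &X86::GR32RegClass,
                                        /*IsStackAligned=*/true, STI);
  unsigned LoadOpc = getLoadRegOpcode(X86::EAX, &X86::GR32RegClass,
                                      /*IsStackAligned=*/true, STI);
  return StoreOpc == X86::MOV32mr && LoadOpc == X86::MOV32rm;
}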
3883 
3884 static bool isAMXOpcode(unsigned Opc) {
3885  switch (Opc) {
3886  default:
3887  return false;
3888  case X86::TILELOADD:
3889  case X86::TILESTORED:
3890  return true;
3891  }
3892 }
3893 
3894 void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
3895  MachineBasicBlock::iterator MI,
3896  unsigned Opc, Register Reg, int FrameIdx,
3897  bool isKill) const {
3898  switch (Opc) {
3899  default:
3900  llvm_unreachable("Unexpected special opcode!");
3901  case X86::TILESTORED: {
3902  // tilestored %tmm, (%sp, %idx)
3903  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3904  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3905  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3906  MachineInstr *NewMI =
3907  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3908  .addReg(Reg, getKillRegState(isKill));
3909  MachineOperand &MO = NewMI->getOperand(X86::AddrIndexReg);
3910  MO.setReg(VirtReg);
3911  MO.setIsKill(true);
3912  break;
3913  }
3914  case X86::TILELOADD: {
3915  // tileloadd (%sp, %idx), %tmm
3916  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3917  Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3918  BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3919  MachineInstr *NewMI = addFrameReference(
3920  BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
3921  MachineOperand &MO = NewMI->getOperand(1 + X86::AddrIndexReg);
3922  MO.setReg(VirtReg);
3923  MO.setIsKill(true);
3924  break;
3925  }
3926  }
3927 }
3928 
3929 void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
3930  MachineBasicBlock::iterator MI,
3931  Register SrcReg, bool isKill,
3932  int FrameIdx,
3933  const TargetRegisterClass *RC,
3934  const TargetRegisterInfo *TRI) const {
3935  const MachineFunction &MF = *MBB.getParent();
3936  const MachineFrameInfo &MFI = MF.getFrameInfo();
3937  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
3938  "Stack slot too small for store");
3939 
3940  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3941  bool isAligned =
3942  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3943  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3944 
3945  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
3946  if (isAMXOpcode(Opc))
3947  loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
3948  else
3949  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3950  .addReg(SrcReg, getKillRegState(isKill));
3951 }
3952 
3953 void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
3954  MachineBasicBlock::iterator MI,
3955  Register DestReg, int FrameIdx,
3956  const TargetRegisterClass *RC,
3957  const TargetRegisterInfo *TRI) const {
3958  const MachineFunction &MF = *MBB.getParent();
3959  const MachineFrameInfo &MFI = MF.getFrameInfo();
3960  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
3961  "Load size exceeds stack slot");
3962  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3963  bool isAligned =
3964  (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3965  (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3966 
3967  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
3968  if (isAMXOpcode(Opc))
3969  loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
3970  else
3971  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3972  FrameIdx);
3973 }
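
// --- Illustrative sketch (not part of the original file) --------------------
// Pairing the two hooks above: create a spill slot sized for GR64 and emit a
// store followed by a reload at the insertion point. The helper name is
// hypothetical and no error handling is shown.
LLVM_ATTRIBUTE_UNUSED static void
exampleSpillReloadGR64(const X86InstrInfo &TII, const TargetRegisterInfo *TRI,
                       MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                       Register Reg) {
  MachineFrameInfo &MFI = MBB.getParent()->getFrameInfo();
  int FrameIdx =
      MFI.CreateSpillStackObject(TRI->getSpillSize(X86::GR64RegClass),
                                 TRI->getSpillAlign(X86::GR64RegClass));
  TII.storeRegToStackSlot(MBB, MI, Reg, /*isKill=*/true, FrameIdx,
                          &X86::GR64RegClass, TRI);
  TII.loadRegFromStackSlot(MBB, MI, Reg, FrameIdx, &X86::GR64RegClass, TRI);
}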
3974 
3975 bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
3976  Register &SrcReg2, int64_t &CmpMask,
3977  int64_t &CmpValue) const {
3978  switch (MI.getOpcode()) {
3979  default: break;
3980  case X86::CMP64ri32:
3981  case X86::CMP64ri8:
3982  case X86::CMP32ri:
3983  case X86::CMP32ri8:
3984  case X86::CMP16ri:
3985  case X86::CMP16ri8:
3986  case X86::CMP8ri:
3987  SrcReg = MI.getOperand(0).getReg();
3988  SrcReg2 = 0;
3989  if (MI.getOperand(1).isImm()) {
3990  CmpMask = ~0;
3991  CmpValue = MI.getOperand(1).getImm();
3992  } else {
3993  CmpMask = CmpValue = 0;
3994  }
3995  return true;
3996  // A SUB can be used to perform comparison.
3997  case X86::SUB64rm:
3998  case X86::SUB32rm:
3999  case X86::SUB16rm:
4000  case X86::SUB8rm:
4001  SrcReg = MI.getOperand(1).getReg();
4002  SrcReg2 = 0;
4003  CmpMask = 0;
4004  CmpValue = 0;
4005  return true;
4006  case X86::SUB64rr:
4007  case X86::SUB32rr:
4008  case X86::SUB16rr:
4009  case X86::SUB8rr:
4010  SrcReg = MI.getOperand(1).getReg();
4011  SrcReg2 = MI.getOperand(2).getReg();
4012  CmpMask = 0;
4013  CmpValue = 0;
4014  return true;
4015  case X86::SUB64ri32:
4016  case X86::SUB64ri8:
4017  case X86::SUB32ri:
4018  case X86::SUB32ri8:
4019  case X86::SUB16ri:
4020  case X86::SUB16ri8:
4021  case X86::SUB8ri:
4022  SrcReg = MI.getOperand(1).getReg();
4023  SrcReg2 = 0;
4024  if (MI.getOperand(2).isImm()) {
4025  CmpMask = ~0;
4026  CmpValue = MI.getOperand(2).getImm();
4027  } else {
4028  CmpMask = CmpValue = 0;
4029  }
4030  return true;
4031  case X86::CMP64rr:
4032  case X86::CMP32rr:
4033  case X86::CMP16rr:
4034  case X86::CMP8rr:
4035  SrcReg = MI.getOperand(0).getReg();
4036  SrcReg2 = MI.getOperand(1).getReg();
4037  CmpMask = 0;
4038  CmpValue = 0;
4039  return true;
4040  case X86::TEST8rr:
4041  case X86::TEST16rr:
4042  case X86::TEST32rr:
4043  case X86::TEST64rr:
4044  SrcReg = MI.getOperand(0).getReg();
4045  if (MI.getOperand(1).getReg() != SrcReg)
4046  return false;
4047  // Compare against zero.
4048  SrcReg2 = 0;
4049  CmpMask = ~0;
4050  CmpValue = 0;
4051  return true;
4052  }
4053  return false;
4054 }
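
// --- Illustrative sketch (not part of the original file) --------------------
// Minimal consumer of analyzeCompare: detect a compare of a register against
// the constant zero, which covers both the CMPri-with-0 form and the
// TESTrr reg,reg idiom handled above. The helper name is hypothetical.
LLVM_ATTRIBUTE_UNUSED static bool
exampleIsCompareWithZero(const X86InstrInfo &TII, const MachineInstr &MI) {
  Register SrcReg, SrcReg2;
  int64_t CmpMask = 0, CmpValue = 0;
  return TII.analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) &&
         CmpMask != 0 && CmpValue == 0;
}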
4055 
4056 bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4057  Register SrcReg, Register SrcReg2,
4058  int64_t ImmMask, int64_t ImmValue,
4059  const MachineInstr &OI, bool *IsSwapped,
4060  int64_t *ImmDelta) const {
4061  switch (OI.getOpcode()) {
4062  case X86::CMP64rr:
4063  case X86::CMP32rr:
4064  case X86::CMP16rr:
4065  case X86::CMP8rr:
4066  case X86::SUB64rr:
4067  case X86::SUB32rr:
4068  case X86::SUB16rr:
4069  case X86::SUB8rr: {
4070  Register OISrcReg;
4071  Register OISrcReg2;
4072  int64_t OIMask;
4073  int64_t OIValue;
4074  if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4075  OIMask != ImmMask || OIValue != ImmValue)
4076  return false;
4077  if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4078  *IsSwapped = false;
4079  return true;
4080  }
4081  if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4082  *IsSwapped = true;
4083  return true;
4084  }
4085  return false;
4086  }
4087  case X86::CMP64ri32:
4088  case X86::CMP64ri8:
4089  case X86::CMP32ri:
4090  case X86::CMP32ri8:
4091  case X86::CMP16ri:
4092  case X86::CMP16ri8:
4093  case X86::CMP8ri:
4094  case X86::SUB64ri32:
4095  case X86::SUB64ri8:
4096  case X86::SUB32ri:
4097  case X86::SUB32ri8:
4098  case X86::SUB16ri:
4099  case X86::SUB16ri8:
4100  case X86::SUB8ri:
4101  case X86::TEST64rr:
4102  case X86::TEST32rr:
4103  case X86::TEST16rr:
4104  case X86::TEST8rr: {
4105  if (ImmMask != 0) {
4106  Register OISrcReg;
4107  Register OISrcReg2;
4108  int64_t OIMask;
4109  int64_t OIValue;
4110  if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4111  SrcReg == OISrcReg && ImmMask == OIMask) {
4112  if (OIValue == ImmValue) {
4113  *ImmDelta = 0;
4114  return true;
4115  } else if (static_cast<uint64_t>(ImmValue) ==
4116  static_cast<uint64_t>(OIValue) - 1) {
4117  *ImmDelta = -1;
4118  return true;
4119  } else if (static_cast<uint64_t>(ImmValue) ==
4120  static_cast<uint64_t>(OIValue) + 1) {
4121  *ImmDelta = 1;
4122  return true;
4123  } else {
4124  return false;
4125  }
4126  }
4127  }
4128  return FlagI.isIdenticalTo(OI);
4129  }
4130  default:
4131  return false;
4132  }
4133 }
4134 
4135 /// Check whether the definition can be converted
4136 /// to remove a comparison against zero.
4137 inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4138  bool &ClearsOverflowFlag) {
4139  NoSignFlag = false;
4140  ClearsOverflowFlag = false;
4141 
4142  // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and
4143  // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
4144  // Initial Exec to Local Exec relaxation. In these cases, we must not depend
4145  // on the EFLAGS modification of ADD actually happening in the final binary.
4146  if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
4147  unsigned Flags = MI.getOperand(5).getTargetFlags();
4148  if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
4149  Flags == X86II::MO_GOTNTPOFF)
4150  return false;
4151  }
4152 
4153  switch (MI.getOpcode()) {
4154  default: return false;
4155 
4156  // The shift instructions only modify ZF if their shift count is non-zero.
4157  // N.B.: The processor truncates the shift count depending on the encoding.
4158  case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
4159  case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
4160  return getTruncatedShiftCount(MI, 2) != 0;
4161 
4162  // Some left shift instructions can be turned into LEA instructions but only
4163  // if their flags aren't used. Avoid transforming such instructions.
4164  case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
4165  unsigned ShAmt = getTruncatedShiftCount(MI, 2);
4166  if (isTruncatedShiftCountForLEA(ShAmt)) return false;
4167  return ShAmt != 0;
4168  }
4169 
4170  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
4171  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
4172  return getTruncatedShiftCount(MI, 3) != 0;
4173 
4174  case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
4175  case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
4176  case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
4177  case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
4178  case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
4179  case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
4180  case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
4181  case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
4182  case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
4183  case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
4184  case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
4185  case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
4186  case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
4187  case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8:
4188  case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr:
4189  case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm:
4190  case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm:
4191  case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri:
4192  case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8:
4193  case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr:
4194  case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm:
4195  case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm:
4196  case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
4197  case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
4198  case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
4199  case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
4200  case X86::LZCNT16rr: case X86::LZCNT16rm:
4201  case X86::LZCNT32rr: case X86::LZCNT32rm:
4202  case X86::LZCNT64rr: case X86::LZCNT64rm:
4203  case X86::POPCNT16rr:case X86::POPCNT16rm:
4204  case X86::POPCNT32rr:case X86::POPCNT32rm:
4205  case X86::POPCNT64rr:case X86::POPCNT64rm:
4206  case X86::TZCNT16rr: case X86::TZCNT16rm:
4207  case X86::TZCNT32rr: case X86::TZCNT32rm:
4208  case X86::TZCNT64rr: case X86::TZCNT64rm:
4209  return true;
4210  case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
4211  case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
4212  case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
4213  case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
4214  case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
4215  case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
4216  case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
4217  case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
4218  case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
4219  case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
4220  case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
4221  case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
4222  case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
4223  case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
4224  case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
4225  case X86::ANDN32rr: case X86::ANDN32rm:
4226  case X86::ANDN64rr: case X86::ANDN64rm:
4227  case X86::BLSI32rr: case X86::BLSI32rm:
4228  case X86::BLSI64rr: case X86::BLSI64rm:
4229  case X86::BLSMSK32rr: case X86::BLSMSK32rm:
4230  case X86::BLSMSK64rr: case X86::BLSMSK64rm:
4231  case X86::BLSR32rr: case X86::BLSR32rm:
4232  case X86::BLSR64rr: case X86::BLSR64rm:
4233  case X86::BLCFILL32rr: case X86::BLCFILL32rm:
4234  case X86::BLCFILL64rr: case X86::BLCFILL64rm:
4235  case X86::BLCI32rr: case X86::BLCI32rm:
4236  case X86::BLCI64rr: case X86::BLCI64rm:
4237  case X86::BLCIC32rr: case X86::BLCIC32rm:
4238  case X86::BLCIC64rr: case X86::BLCIC64rm:
4239  case X86::BLCMSK32rr: case X86::BLCMSK32rm:
4240  case X86::BLCMSK64rr: case X86::BLCMSK64rm:
4241  case X86::BLCS32rr: case X86::BLCS32rm:
4242  case X86::BLCS64rr: case X86::BLCS64rm:
4243  case X86::BLSFILL32rr: case X86::BLSFILL32rm:
4244  case X86::BLSFILL64rr: case X86::BLSFILL64rm:
4245  case X86::BLSIC32rr: case X86::BLSIC32rm:
4246  case X86::BLSIC64rr: case X86::BLSIC64rm:
4247  case X86::BZHI32rr: case X86::BZHI32rm:
4248  case X86::BZHI64rr: case X86::BZHI64rm:
4249  case X86::T1MSKC32rr: case X86::T1MSKC32rm:
4250  case X86::T1MSKC64rr: case X86::T1MSKC64rm:
4251  case X86::TZMSK32rr: case X86::TZMSK32rm:
4252  case X86::TZMSK64rr: case X86::TZMSK64rm:
4253  // These instructions clear the overflow flag just like TEST.
4254  // FIXME: These are not the only instructions in this switch that clear the
4255  // overflow flag.
4256  ClearsOverflowFlag = true;
4257  return true;
4258  case X86::BEXTR32rr: case X86::BEXTR64rr:
4259  case X86::BEXTR32rm: case X86::BEXTR64rm:
4260  case X86::BEXTRI32ri: case X86::BEXTRI32mi:
4261  case X86::BEXTRI64ri: case X86::BEXTRI64mi:
4262  // BEXTR doesn't update the sign flag so we can't use it. It does clear
4263  // the overflow flag, but that's not useful without the sign flag.
4264  NoSignFlag = true;
4265  return true;
4266  }
4267 }
4268 
4269 /// Check whether the use can be converted to remove a comparison against zero.
4270 static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
4271  switch (MI.getOpcode()) {
4272  default: return X86::COND_INVALID;
4273  case X86::NEG8r:
4274  case X86::NEG16r:
4275  case X86::NEG32r:
4276  case X86::NEG64r:
4277  return X86::COND_AE;
4278  case X86::LZCNT16rr:
4279  case X86::LZCNT32rr:
4280  case X86::LZCNT64rr:
4281  return X86::COND_B;
4282  case X86::POPCNT16rr:
4283  case X86::POPCNT32rr:
4284  case X86::POPCNT64rr:
4285  return X86::COND_E;
4286  case X86::TZCNT16rr:
4287  case X86::TZCNT32rr:
4288  case X86::TZCNT64rr:
4289  return X86::COND_B;
4290  case X86::BSF16rr:
4291  case X86::BSF32rr:
4292  case X86::BSF64rr:
4293  case X86::BSR16rr:
4294  case X86::BSR32rr:
4295  case X86::BSR64rr:
4296  return X86::COND_E;
4297  case X86::BLSI32rr:
4298  case X86::BLSI64rr:
4299  return X86::COND_AE;
4300  case X86::BLSR32rr:
4301  case X86::BLSR64rr:
4302  case X86::BLSMSK32rr:
4303  case X86::BLSMSK64rr:
4304  return X86::COND_B;
4305  // TODO: TBM instructions.
4306  }
4307 }
4308 
4309 /// Check if there exists an earlier instruction that
4310 /// operates on the same source operands and sets flags in the same way as
4311 /// Compare; remove Compare if possible.
4312 bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
4313  Register SrcReg2, int64_t CmpMask,
4314  int64_t CmpValue,
4315  const MachineRegisterInfo *MRI) const {
4316  // Check whether we can replace SUB with CMP.
4317  switch (CmpInstr.getOpcode()) {
4318  default: break;
4319  case X86::SUB64ri32:
4320  case X86::SUB64ri8:
4321  case X86::SUB32ri:
4322  case X86::SUB32ri8:
4323  case X86::SUB16ri:
4324  case X86::SUB16ri8:
4325  case X86::SUB8ri:
4326  case X86::SUB64rm:
4327  case X86::SUB32rm:
4328  case X86::SUB16rm:
4329  case X86::SUB8rm:
4330  case X86::SUB64rr:
4331  case X86::SUB32rr:
4332  case X86::SUB16rr:
4333  case X86::SUB8rr: {
4334  if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
4335  return false;
4336  // There is no use of the destination register, we can replace SUB with CMP.
4337  unsigned NewOpcode = 0;
4338  switch (CmpInstr.getOpcode()) {
4339  default: llvm_unreachable("Unreachable!");
4340  case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
4341  case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
4342  case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
4343  case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
4344  case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
4345  case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
4346  case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
4347  case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
4348  case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
4349  case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
4350  case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
4351  case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
4352  case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
4353  case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
4354  case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
4355  }
4356  CmpInstr.setDesc(get(NewOpcode));
4357  CmpInstr.removeOperand(0);
4358  // Mutating this instruction invalidates any debug data associated with it.
4359  CmpInstr.dropDebugNumber();
4360  // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
4361  if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
4362  NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
4363  return false;
4364  }
4365  }
4366 
4367  // The following code tries to remove the comparison by re-using EFLAGS
4368  // from earlier instructions.
4369 
4370  bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
4371 
4372  // Transformation currently requires SSA values.
4373  if (SrcReg2.isPhysical())
4374  return false;
4375  MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
4376  assert(SrcRegDef && "Must have a definition (SSA)");
4377 
4378  MachineInstr *MI = nullptr;
4379  MachineInstr *Sub = nullptr;
4380  MachineInstr *Movr0Inst = nullptr;
4381  bool NoSignFlag = false;
4382  bool ClearsOverflowFlag = false;
4383  bool ShouldUpdateCC = false;
4384  bool IsSwapped = false;
4385  X86::CondCode NewCC = X86::COND_INVALID;
4386  int64_t ImmDelta = 0;
4387 
4388  // Search backward from CmpInstr for the next instruction defining EFLAGS.
4389  const TargetRegisterInfo *TRI = &getRegisterInfo();
4390  MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
4391  MachineBasicBlock::reverse_iterator From =
4392  std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
4393  for (MachineBasicBlock *MBB = &CmpMBB;;) {
4394  for (MachineInstr &Inst : make_range(From, MBB->rend())) {
4395  // Try to use EFLAGS from the instruction defining %SrcReg. Example:
4396  // %eax = addl ...
4397  // ... // EFLAGS not changed
4398  // testl %eax, %eax // <-- can be removed
4399  if (&Inst == SrcRegDef) {
4400  if (IsCmpZero &&
4401  isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
4402  MI = &Inst;
4403  break;
4404  }
4405 
4406  // Look back for the following pattern, in which case the test64rr
4407  // instruction could be erased.
4408  //
4409  // Example:
4410  // %reg = and32ri %in_reg, 5
4411  // ... // EFLAGS not changed.
4412  // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index
4413  // test64rr %src_reg, %src_reg, implicit-def $eflags
4414  MachineInstr *AndInstr = nullptr;
4415  if (IsCmpZero &&
4416  findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
4417  NoSignFlag, ClearsOverflowFlag)) {
4418  assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
4419  MI = AndInstr;
4420  break;
4421  }
4422  // Cannot find other candidates before definition of SrcReg.
4423  return false;
4424  }
4425 
4426  if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
4427  // Try to use EFLAGS produced by an instruction reading %SrcReg.
4428  // Example:
4429  // %eax = ...
4430  // ...
4431  // popcntl %eax
4432  // ... // EFLAGS not changed
4433  // testl %eax, %eax // <-- can be removed
4434  if (IsCmpZero) {
4435  NewCC = isUseDefConvertible(Inst);
4436  if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() &&
4437  Inst.getOperand(1).getReg() == SrcReg) {
4438  ShouldUpdateCC = true;
4439  MI = &Inst;
4440  break;
4441  }
4442  }
4443 
4444  // Try to use EFLAGS from an instruction with similar flag results.
4445  // Example:
4446  // sub x, y or cmp x, y
4447  // ... // EFLAGS not changed
4448  // cmp x, y // <-- can be removed
4449  if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
4450  Inst, &IsSwapped, &ImmDelta)) {
4451  Sub = &Inst;
4452  break;
4453  }
4454 
4455  // MOV32r0 is implemented with xor which clobbers condition code. It is
4456  // safe to move up, if the definition to EFLAGS is dead and earlier
4457  // instructions do not read or write EFLAGS.
4458  if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
4459  Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
4460  Movr0Inst = &Inst;
4461  continue;
4462  }
4463 
4464  // Cannot do anything for any other EFLAG changes.
4465  return false;
4466  }
4467  }
4468 
4469  if (MI || Sub)
4470  break;
4471 
4472  // Reached begin of basic block. Continue in predecessor if there is
4473  // exactly one.
4474  if (MBB->pred_size() != 1)
4475  return false;
4476  MBB = *MBB->pred_begin();
4477  From = MBB->rbegin();
4478  }
4479 
4480  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
4481  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
4482  // If we are done with the basic block, we need to check whether EFLAGS is
4483  // live-out.
4484  bool FlagsMayLiveOut = true;
4485  SmallVector<std::pair<MachineInstr *, X86::CondCode>, 4> OpsToUpdate;
4486  MachineBasicBlock::iterator AfterCmpInstr =
4487  std::next(MachineBasicBlock::iterator(CmpInstr));
4488  for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
4489  bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
4490  bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
4491  // We should check the usage if this instruction uses and updates EFLAGS.
4492  if (!UseEFLAGS && ModifyEFLAGS) {
4493  // It is safe to remove CmpInstr if EFLAGS is updated again.
4494  FlagsMayLiveOut = false;
4495  break;
4496  }
4497  if (!UseEFLAGS && !ModifyEFLAGS)
4498  continue;
4499 
4500  // EFLAGS is used by this instruction.
4501  X86::CondCode OldCC = X86::getCondFromMI(Instr);
4502  if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
4503  return false;
4504 
4505  X86::CondCode ReplacementCC = X86::COND_INVALID;
4506  if (MI) {
4507  switch (OldCC) {
4508  default: break;
4509  case X86::COND_A: case X86::COND_AE:
4510  case X86::COND_B: case X86::COND_BE:
4511  // CF is used, we can't perform this optimization.
4512  return false;
4513  case X86::COND_G: case X86::COND_GE:
4514  case X86::COND_L: case X86::COND_LE:
4515  // If SF is used, but the instruction doesn't update the SF, then we
4516  // can't do the optimization.
4517  if (NoSignFlag)
4518  return false;
4519  [[fallthrough]];
4520  case X86::COND_O: case X86::COND_NO:
4521  // If OF is used, the instruction needs to clear it like CmpZero does.
4522  if (!ClearsOverflowFlag)
4523  return false;
4524  break;
4525  case X86::COND_S: case X86::COND_NS:
4526  // If SF is used, but the instruction doesn't update the SF, then we
4527  // can't do the optimization.
4528  if (NoSignFlag)
4529  return false;
4530  break;
4531  }
4532 
4533  // If we're updating the condition code check if we have to reverse the
4534  // condition.
4535  if (ShouldUpdateCC)
4536  switch (OldCC) {
4537  default:
4538  return false;
4539  case X86::COND_E:
4540  ReplacementCC = NewCC;
4541  break;
4542  case X86::COND_NE:
4543  ReplacementCC = GetOppositeBranchCondition(NewCC);
4544  break;
4545  }
4546  } else if (IsSwapped) {
4547  // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
4548  // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
4549  // We swap the condition code and synthesize the new opcode.
4550  ReplacementCC = getSwappedCondition(OldCC);
4551  if (ReplacementCC == X86::COND_INVALID)
4552  return false;
4553  ShouldUpdateCC = true;
4554  } else if (ImmDelta != 0) {
4555  unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
4556  // Shift amount for min/max constants to adjust for 8/16/32 instruction
4557  // sizes.
4558  switch (OldCC) {
4559  case X86::COND_L: // x <s (C + 1) --> x <=s C
4560  if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
4561  return false;
4562  ReplacementCC = X86::COND_LE;
4563  break;
4564  case X86::COND_B: // x <u (C + 1) --> x <=u C
4565  if (ImmDelta != 1 || CmpValue == 0)
4566  return false;
4567  ReplacementCC = X86::COND_BE;
4568  break;
4569  case X86::COND_GE: // x >=s (C + 1) --> x >s C
4570  if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
4571  return false;
4572  ReplacementCC = X86::COND_G;
4573  break;
4574  case X86::COND_AE: // x >=u (C + 1) --> x >u C
4575  if (ImmDelta != 1 || CmpValue == 0)
4576  return false;
4577  ReplacementCC = X86::COND_A;
4578  break;
4579  case X86::COND_G: // x >s (C - 1) --> x >=s C
4580  if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
4581  return false;
4582  ReplacementCC = X86::COND_GE;
4583  break;
4584  case X86::COND_A: // x >u (C - 1) --> x >=u C
4585  if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
4586  return false;
4587  ReplacementCC = X86::COND_AE;
4588  break;
4589  case X86::COND_LE: // x <=s (C - 1) --> x <s C
4590  if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
4591  return false;
4592  ReplacementCC = X86::COND_L;
4593  break;
4594  case X86::COND_BE: // x <=u (C - 1) --> x <u C
4595  if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
4596  return false;
4597  ReplacementCC = X86::COND_B;
4598  break;
4599  default:
4600  return false;
4601  }
4602  ShouldUpdateCC = true;
4603  }
4604 
4605  if (ShouldUpdateCC && ReplacementCC != OldCC) {
4606  // Push the MachineInstr to OpsToUpdate.
4607  // If it is safe to remove CmpInstr, the condition code of these
4608  // instructions will be modified.
4609  OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
4610  }
4611  if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
4612  // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
4613  FlagsMayLiveOut = false;
4614  break;
4615  }
4616  }
4617 
4618  // If we have to update users but EFLAGS is live-out abort, since we cannot
4619  // easily find all of the users.
4620  if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
4621  for (MachineBasicBlock *Successor : CmpMBB.successors())
4622  if (Successor->isLiveIn(X86::EFLAGS))
4623  return false;
4624  }
4625 
4626  // The instruction to be updated is either Sub or MI.
4627  assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
4628  Sub = MI != nullptr ? MI : Sub;
4629  MachineBasicBlock *SubBB = Sub->getParent();
4630  // Move Movr0Inst to the appropriate place before Sub.
4631  if (Movr0Inst) {
4632  // Only move within the same block so we don't accidentally move to a
4633  // block with higher execution frequency.
4634  if (&CmpMBB != SubBB)
4635  return false;
4636  // Look backwards until we find a def that doesn't use the current EFLAGS.
4637  MachineBasicBlock::reverse_iterator InsertI = Sub,
4638  InsertE = Sub->getParent()->rend();
4639  for (; InsertI != InsertE; ++InsertI) {
4640  MachineInstr *Instr = &*InsertI;
4641  if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
4642  Instr->modifiesRegister(X86::EFLAGS, TRI)) {
4643  Movr0Inst->getParent()->remove(Movr0Inst);
4644  Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
4645  Movr0Inst);
4646  break;
4647  }
4648  }
4649  if (InsertI == InsertE)
4650  return false;
4651  }
4652 
4653  // Make sure Sub instruction defines EFLAGS and mark the def live.
4654  MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
4655  assert(FlagDef && "Unable to locate a def EFLAGS operand");
4656  FlagDef->setIsDead(false);
4657 
4658  CmpInstr.eraseFromParent();
4659 
4660  // Modify the condition code of instructions in OpsToUpdate.
4661  for (auto &Op : OpsToUpdate) {
4662  Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
4663  .setImm(Op.second);
4664  }
4665  // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
4666  for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
4667  MBB = *MBB->pred_begin()) {
4668  assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
4669  if (!MBB->isLiveIn(X86::EFLAGS))
4670  MBB->addLiveIn(X86::EFLAGS);
4671  }
4672  return true;
4673 }
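
// --- Illustrative note (not part of the original file) ----------------------
// Schematic before/after for the EFLAGS-reuse rewrite above, assuming the
// flags are not live out of the block:
//
//   %2:gr32 = SUB32rr %0, %1, implicit-def $eflags
//   CMP32rr %0, %1, implicit-def $eflags      ; redundant with the SUB
//   JCC_1 %bb.then, 12 /* COND_L */, implicit $eflags
//
// becomes
//
//   %2:gr32 = SUB32rr %0, %1, implicit-def $eflags
//   JCC_1 %bb.then, 12 /* COND_L */, implicit $eflags
//
// with the SUB's EFLAGS def no longer marked dead. In the swapped-operand and
// immediate-delta cases the surviving users additionally get a rewritten
// condition code, as computed in the switches above.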
4674 
4675 /// Try to remove the load by folding it to a register
4676 /// operand at the use. We fold the load instructions if the load defines a virtual
4677 /// register, the virtual register is used once in the same BB, and the
4678 /// instructions in-between do not load or store, and have no side effects.
4679 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
4680  const MachineRegisterInfo *MRI,
4681  Register &FoldAsLoadDefReg,
4682  MachineInstr *&DefMI) const {
4683  // Check whether we can move DefMI here.
4684  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
4685  assert(DefMI);
4686  bool SawStore = false;
4687  if (!DefMI->isSafeToMove(nullptr, SawStore))
4688  return nullptr;
4689 
4690  // Collect information about virtual register operands of MI.
4691  SmallVector<unsigned, 1> SrcOperandIds;
4692  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4693  MachineOperand &MO = MI.getOperand(i);
4694  if (!MO.isReg())
4695  continue;
4696  Register Reg = MO.getReg();
4697  if (Reg != FoldAsLoadDefReg)
4698  continue;
4699  // Do not fold if we have a subreg use or a def.
4700  if (MO.getSubReg() || MO.isDef())
4701  return nullptr;
4702  SrcOperandIds.push_back(i);
4703  }
4704  if (SrcOperandIds.empty())
4705  return nullptr;
4706 
4707  // Check whether we can fold the def into SrcOperandId.
4708  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
4709  FoldAsLoadDefReg = 0;
4710  return FoldMI;
4711  }
4712 
4713  return nullptr;
4714 }
4715 
4716 /// Expand a single-def pseudo instruction to a two-addr
4717 /// instruction with two undef reads of the register being defined.
4718 /// This is used for mapping:
4719 /// %xmm4 = V_SET0
4720 /// to:
4721 /// %xmm4 = PXORrr undef %xmm4, undef %xmm4
4722 ///
4723 static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
4724  const MCInstrDesc &Desc) {
4725  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4726  Register Reg = MIB.getReg(0);
4727  MIB->setDesc(Desc);
4728 
4729  // MachineInstr::addOperand() will insert explicit operands before any
4730  // implicit operands.
4731  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4732  // But we don't trust that.
4733  assert(MIB.getReg(1) == Reg &&
4734  MIB.getReg(2) == Reg && "Misplaced operand");
4735  return true;
4736 }
4737 
4738 /// Expand a single-def pseudo instruction to a two-addr
4739 /// instruction with two %k0 reads.
4740 /// This is used for mapping:
4741 /// %k4 = K_SET1
4742 /// to:
4743 /// %k4 = KXNORrr %k0, %k0
4744 static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
4745  Register Reg) {
4746  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4747  MIB->setDesc(Desc);
4748  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4749  return true;
4750 }
4751 
4752 static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
4753  bool MinusOne) {
4754  MachineBasicBlock &MBB = *MIB->getParent();
4755  const DebugLoc &DL = MIB->getDebugLoc();
4756  Register Reg = MIB.getReg(0);
4757 
4758  // Insert the XOR.
4759  BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
4760      .addReg(Reg, RegState::Undef)
4761      .addReg(Reg, RegState::Undef);
4762 
4763  // Turn the pseudo into an INC or DEC.
4764  MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
4765  MIB.addReg(Reg);
4766 
4767  return true;
4768 }
4769 
4770 static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
4771  const TargetInstrInfo &TII,
4772  const X86Subtarget &Subtarget) {
4773  MachineBasicBlock &MBB = *MIB->getParent();
4774  const DebugLoc &DL = MIB->getDebugLoc();
4775  int64_t Imm = MIB->getOperand(1).getImm();
4776  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
4777  MachineBasicBlock::iterator I = MIB.getInstr();
4778 
4779  int StackAdjustment;
4780 
4781  if (Subtarget.is64Bit()) {
4782  assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
4783  MIB->getOpcode() == X86::MOV32ImmSExti8);
4784 
4785  // Can't use push/pop lowering if the function might write to the red zone.
4786  X86MachineFunctionInfo *X86FI =
4787  MBB.getParent()->getInfo<X86MachineFunctionInfo>();
4788  if (X86FI->getUsesRedZone()) {
4789  MIB->setDesc(TII.get(MIB->getOpcode() ==
4790  X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
4791  return true;
4792  }
4793 
4794  // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
4795  // widen the register if necessary.
4796  StackAdjustment = 8;
4797  BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
4798  MIB->setDesc(TII.get(X86::POP64r));
4799  MIB->getOperand(0)
4800  .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
4801  } else {
4802  assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
4803  StackAdjustment = 4;
4804  BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
4805  MIB->setDesc(TII.get(X86::POP32r));
4806  }
4807  MIB->removeOperand(1);
4808  MIB->addImplicitDefUseOperands(*MBB.getParent());
4809 
4810  // Build CFI if necessary.
4811  MachineFunction &MF = *MBB.getParent();
4812  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
4813  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
4814  bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
4815  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
4816  if (EmitCFI) {
4817  TFL->BuildCFI(MBB, I, DL,
4818  MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
4819  TFL->BuildCFI(MBB, std::next(I), DL,
4820  MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
4821  }
4822 
4823  return true;
4824 }
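// Illustrative result (sketch; %rax is chosen only for illustration): when the
// red zone is not in use, a 64-bit MOV64ImmSExti8 of 7 into $rax becomes roughly
//   pushq $7
//   popq  %rax
// bracketed by .cfi_adjust_cfa_offset directives when DWARF CFI is required;
// if the red zone is in use it falls back to a plain mov-immediate instead.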
4825 
4826 // LoadStackGuard has so far only been implemented for 64-bit MachO. A
4827 // different code sequence is needed for other targets.
4828 static void expandLoadStackGuard(MachineInstrBuilder &MIB,
4829  const TargetInstrInfo &TII) {
4830  MachineBasicBlock &MBB = *MIB->getParent();
4831  const DebugLoc &DL = MIB->getDebugLoc();
4832  Register Reg = MIB.getReg(0);
4833  const GlobalValue *GV =
4834  cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
4835  auto Flags = MachineMemOperand::MOLoad |
4836  MachineMemOperand::MODereferenceable |
4837  MachineMemOperand::MOInvariant;
4838  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
4839  MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
4840  MachineBasicBlock::iterator I = MIB.getInstr();
4841 
4842  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
4843  .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
4844  .addMemOperand(MMO);
4845  MIB->setDebugLoc(DL);
4846  MIB->setDesc(TII.get(X86::MOV64rm));
4847  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
4848 }
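// Illustrative result (sketch; the guard symbol depends on the target,
// __stack_chk_guard is typical for 64-bit MachO): LOAD_STACK_GUARD $rcx
// becomes roughly
//   movq __stack_chk_guard@GOTPCREL(%rip), %rcx
//   movq (%rcx), %rcx
// i.e. a GOT load of the guard's address followed by a load of its value.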
4849 
4850 static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
4851  MachineBasicBlock &MBB = *MIB->getParent();
4852  MachineFunction &MF = *MBB.getParent();
4853  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
4854  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4855  unsigned XorOp =
4856  MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
4857  MIB->setDesc(TII.get(XorOp));
4858  MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
4859  return true;
4860 }
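// Illustrative result (sketch; register names are examples): a 64-bit pseudo
//   $rax = XOR64_FP $rax
// is rewritten to a plain xorq of $rax with whatever getFrameRegister()
// returns for the function (RBP here is only an assumption; it may be RSP
// depending on frame lowering).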
4861 
4862 // This is used to handle spills of 128/256-bit registers when we have AVX512
4863 // but not VLX. If an extended register (XMM16-31/YMM16-31) is used, we need an
4864 // instruction that loads the lower 128/256 bits but is available with only AVX512F.
4865 static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
4866  const TargetRegisterInfo *TRI,
4867  const MCInstrDesc &LoadDesc,
4868  const MCInstrDesc &BroadcastDesc,
4869  unsigned SubIdx) {
4870  Register DestReg = MIB.getReg(0);
4871  // Check if DestReg is XMM16-31 or YMM16-31.
4872  if (TRI->getEncodingValue(DestReg) < 16) {
4873  // We can use a normal VEX encoded load.
4874  MIB->setDesc(LoadDesc);
4875  } else {
4876  // Use a 128/256-bit VBROADCAST instruction.
4877  MIB->setDesc(BroadcastDesc);
4878  // Change the destination to a 512-bit register.
4879  DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
4880  MIB->getOperand(0).setReg(DestReg);
4881  }
4882  return true;
4883 }
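// Illustrative example (sketch, not from the upstream source): without VLX, a
// 128-bit aligned load into an extended register such as
//   $xmm16 = VMOVAPSZ128rm_NOVLX <mem>
// is rewritten as
//   $zmm16 = VBROADCASTF32X4rm <mem>
// while the same pseudo targeting $xmm0-$xmm15 simply becomes a VEX VMOVAPSrm;
// per the comment above, only the lower 128 bits of the widened result matter.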
4884 
4885 // This is used to handle spills of 128/256-bit registers when we have AVX512
4886 // but not VLX. If an extended register (XMM16-31/YMM16-31) is used, we need an
4887 // instruction that stores the lower 128/256 bits but is available with only AVX512F.
4888 static bool expandNOVLXStore(MachineInstrBuilder &MIB,
4889  const TargetRegisterInfo *TRI,
4890  const MCInstrDesc &StoreDesc,
4891  const MCInstrDesc &ExtractDesc,
4892  unsigned SubIdx) {
4893  Register SrcReg = MIB.getReg(X86::AddrNumOperands);
4894  // Check if SrcReg is XMM16-31 or YMM16-31.
4895  if (TRI->getEncodingValue(SrcReg) < 16) {
4896  // We can use a normal VEX encoded store.
4897  MIB->setDesc(StoreDesc);
4898  } else {
4899  // Use a VEXTRACTF instruction.
4900  MIB->setDesc(ExtractDesc);
4901  // Change the source to a 512-bit register.
4902  SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
4903  MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
4904  MIB.addImm(0x0); // Append immediate to extract from the lower bits.
4905  }
4906 
4907  return true;
4908 }
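// Illustrative example (sketch): the store direction mirrors the load case,
// e.g. a pseudo storing $xmm16,
//   VMOVAPSZ128mr_NOVLX <mem>, $xmm16
// becomes
//   VEXTRACTF32x4Zmr <mem>, $zmm16, 0
// extracting lane 0 of the widened zmm source, while $xmm0-$xmm15 keep the
// plain VEX VMOVAPSmr encoding.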
4909 
4910 static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
4911  MIB->setDesc(Desc);
4912  int64_t ShiftAmt = MIB->getOperand(2).getImm();
4913  // Temporarily remove the immediate so we can add another source register.
4914  MIB->removeOperand(2);
4915  // Add the register. Don't copy the kill flag if there is one.
4916  MIB.addReg(MIB.getReg(1),
4917  getUndefRegState(MIB->getOperand(1).isUndef()));
4918  // Add back the immediate.
4919  MIB.addImm(ShiftAmt);
4920  return true;
4921 }
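// Illustrative example (sketch; %eax and the shift amount are examples): a
// rotate pseudo such as SHLDROT32ri $eax, 5 ends up as roughly
//   shldl $5, %eax, %eax
// i.e. a double-shift whose two sources are the same register, which is how
// these pseudos implement a rotate.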
4922 
4923 bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
4924  bool HasAVX = Subtarget.hasAVX();
4925  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
4926  switch (MI.getOpcode()) {
4927  case X86::MOV32r0:
4928  return Expand2AddrUndef(MIB, get(X86::XOR32rr));
4929  case X86::MOV32r1:
4930  return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
4931  case X86::MOV32r_1:
4932  return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
4933  case X86::MOV32ImmSExti8:
4934  case X86::MOV64ImmSExti8:
4935  return ExpandMOVImmSExti8(MIB, *this, Subtarget);
4936  case X86::SETB_C32r:
4937  return Expand2AddrUndef(MIB, get(X86::SBB32rr));
4938  case X86::SETB_C64r:
4939  return Expand2AddrUndef(MIB, get(X86::SBB64rr));
4940  case X86::MMX_SET0:
4941  return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
4942  case X86::V_SET0:
4943  case X86::FsFLD0SS:
4944  case X86::FsFLD0SD:
4945  case X86::FsFLD0SH:
4946  case X86::FsFLD0F128:
4947  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
4948  case X86::AVX_SET0: {
4949  assert(HasAVX && "AVX not supported");
4950  const TargetRegisterInfo *TRI = &getRegisterInfo();
4951  Register SrcReg = MIB.getReg(0);
4952  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4953  MIB->getOperand(0).setReg(XReg);
4954  Expand2AddrUndef(MIB, get(X86::VXORPSrr));
4955  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4956  return true;
4957  }
4958  case X86::AVX512_128_SET0:
4959  case X86::AVX512_FsFLD0SH:
4960  case X86::AVX512_FsFLD0SS:
4961  case X86::AVX512_FsFLD0SD:
4962  case X86::AVX512_FsFLD0F128: {
4963  bool HasVLX = Subtarget.hasVLX();
4964  Register SrcReg = MIB.getReg(0);
4965  const TargetRegisterInfo *TRI = &getRegisterInfo();
4966  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
4967  return Expand2AddrUndef(MIB,
4968  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4969  // Extended register without VLX. Use a larger XOR.
4970  SrcReg =
4971  TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4972  MIB->getOperand(0).setReg(SrcReg);
4973  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4974  }
4975  case X86::AVX512_256_SET0:
4976  case X86::AVX512_512_SET0: {
4977  bool HasVLX = Subtarget.hasVLX();
4978  Register SrcReg = MIB.getReg(0);
4979  const TargetRegisterInfo *TRI = &getRegisterInfo();
4980  if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
4981  Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4982  MIB->getOperand(0).setReg(XReg);
4983  Expand2AddrUndef(MIB,
4984  get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4985  MIB.addReg(SrcReg, RegState::ImplicitDefine);
4986  return true;
4987  }
4988  if (MI.getOpcode() == X86::AVX512_256_SET0) {
4989  // No VLX so we must reference a zmm.
4990  unsigned ZReg =
4991  TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4992  MIB->getOperand(0).setReg(ZReg);
4993  }
4994  return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4995  }
4996  case X86::V_SETALLONES:
4997  return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
4998  case X86::AVX2_SETALLONES:
4999  return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
5000  case X86::AVX1_SETALLONES: {
5001  Register Reg = MIB.getReg(0);
5002  // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
5003  MIB->setDesc(get(X86::VCMPPSYrri));
5004  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
5005  return true;
5006  }
5007  case X86::AVX512_512_SETALLONES: {
5008  Register Reg = MIB.getReg(0);
5009  MIB->setDesc(get(X86::VPTERNLOGDZrri));
5010  // VPTERNLOGD needs 3 register inputs and an immediate.
5011  // 0xff will return 1s for any input.
5012  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
5013  .addReg(Reg, RegState::Undef).addImm(0xff);
5014  return true;
5015  }
5016  case X86::AVX512_512_SEXT_MASK_32:
5017  case X86::AVX512_512_SEXT_MASK_64: {
5018  Register Reg = MIB.getReg(0);
5019  Register MaskReg = MIB.getReg(1);
5020  unsigned MaskState = getRegState(MIB->getOperand(1));
5021  unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
5022  X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
5023  MI.removeOperand(1);
5024  MIB->setDesc(get(Opc));
5025  // VPTERNLOG needs 3 register inputs and an immediate.
5026  // 0xff will return 1s for any input.
5027  MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
5028  .addReg(Reg, RegState::Undef).addImm(0xff);
5029  return true;
5030  }
5031  case X86::VMOVAPSZ128rm_NOVLX:
5032  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
5033  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
5034  case X86::VMOVUPSZ128rm_NOVLX:
5035  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
5036  get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
5037  case X86::VMOVAPSZ256rm_NOVLX:
5038  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
5039  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
5040  case X86::VMOVUPSZ256rm_NOVLX:
5041  return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
5042  get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
5043  case X86::VMOVAPSZ128mr_NOVLX:
5044  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
5045  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
5046  case X86::VMOVUPSZ128mr_NOVLX:
5047  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
5048  get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
5049  case X86::VMOVAPSZ256mr_NOVLX:
5050  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
5051  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
5052  case X86::VMOVUPSZ256mr_NOVLX:
5053  return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
5054  get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
5055  case X86::MOV32ri64: {
5056  Register Reg = MIB.getReg(0);
5057  Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
5058  MI.setDesc(get(X86::MOV32ri));
5059  MIB->getOperand(0).setReg(Reg32);
5060  MIB.addReg(Reg, RegState::ImplicitDefine);
5061  return true;
5062  }
5063 
5064  // KNL does not recognize dependency-breaking idioms for mask registers,
5065  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
5066  // Using %k0 as the undef input register is a performance heuristic based
5067  // on the assumption that %k0 is used less frequently than the other mask
5068  // registers, since it is not usable as a write mask.
5069  // FIXME: A more advanced approach would be to choose the best input mask
5070  // register based on context.
5071  case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
5072  case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
5073  case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
5074  case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
5075  case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
5076  case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
5077  case TargetOpcode::LOAD_STACK_GUARD:
5078  expandLoadStackGuard(MIB, *this);
5079  return true;
5080  case X86::XOR64_FP:
5081  case X86::XOR32_FP:
5082  return expandXorFP(MIB, *this);
5083  case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
5084  case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
5085  case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
5086  case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
5087  case X86::ADD8rr_DB: MIB->setDesc(get(X86::OR8rr)); break;
5088  case X86::ADD16rr_DB: MIB->setDesc(get(X86::OR16rr)); break;
5089  case X86::ADD32rr_DB: MIB->setDesc(get(X86::OR32rr)); break;
5090  case X86::ADD64rr_DB: MIB->setDesc(get(X86::OR64rr)); break;
5091  case X86::ADD8ri_DB: MIB->setDesc(get(X86::OR8ri)); break;
5092  case X86::ADD16ri_DB: MIB->setDesc(get(X86::OR16ri)); break;
5093  case X86::ADD32ri_DB: MIB->setDesc(get(X86::OR32ri)); break;
5094  case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
5095  case X86::ADD16ri8_DB: MIB->setDesc(get(X86::OR16ri8)); break;
5096  case X86::ADD32ri8_DB: MIB->setDesc(get(X86::OR32ri8)); break;
5097  case X86::ADD64ri8_DB: MIB->setDesc(get(X86::OR64ri8)); break;
5098  }
5099  return false;
5100 }
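// Illustrative note (sketch; grounded only in the switch above, the rest is an
// assumption about the _DB pseudos): the ADD*_DB cases just swap the opcode for
// the matching OR, e.g.
//   ADD32rr_DB $eax, $ecx   -->   orl %ecx, %eax
// which is presumably safe because these pseudos are formed for adds whose
// operands are known to have no common set bits, so OR and ADD agree.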
5101 
5102 /// Return true for all instructions that only update
5103 /// the first 32 or 64 bits of the destination register and leave the rest
5104 /// unmodified. This can be used to avoid folding loads if the instructions
5105 /// only update part of the destination register, and the non-updated part is
5106 /// not needed, e.g. cvtss2sd, sqrtss. Unfolding the load from these
5107 /// instructions breaks the partial register dependency and can improve
5108 /// performance, e.g.:
5109 ///
5110 /// movss (%rdi), %xmm0
5111 /// cvtss2sd %xmm0, %xmm0
5112 ///
5113 /// Instead of
5114 /// cvtss2sd (%rdi), %xmm0
5115 ///
5116 /// FIXME: This should be turned into a TSFlags.
5117 ///
5118 static bool hasPartialRegUpdate(unsigned Opcode,
5119  const X86Subtarget &Subtarget,
5120  bool ForLoadFold = false) {
5121  switch (Opcode) {
5122  case X86::CVTSI2SSrr:
5123  case X86::CVTSI2SSrm:
5124  case X86::CVTSI642SSrr:
5125  case X86::CVTSI642SSrm:
5126  case X86::CVTSI2SDrr:
5127  case X86::CVTSI2SDrm:
5128  case X86::CVTSI