1//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the X86 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "X86InstrInfo.h"
14#include "X86.h"
15#include "X86InstrBuilder.h"
16#include "X86InstrFoldTables.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/Sequence.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/MC/MCAsmInfo.h"
39#include "llvm/MC/MCExpr.h"
40#include "llvm/MC/MCInst.h"
42#include "llvm/Support/Debug.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "x86-instr-info"
51
52#define GET_INSTRINFO_CTOR_DTOR
53#include "X86GenInstrInfo.inc"
54
55static cl::opt<bool>
56 NoFusing("disable-spill-fusing",
57 cl::desc("Disable fusing of spill code into instructions"),
58 cl::Hidden);
59static cl::opt<bool>
60PrintFailedFusing("print-failed-fuse-candidates",
61 cl::desc("Print instructions that the allocator wants to"
62 " fuse, but the X86 backend currently can't"),
63 cl::Hidden);
64static cl::opt<bool>
65ReMatPICStubLoad("remat-pic-stub-load",
66 cl::desc("Re-materialize load from stub in PIC mode"),
67 cl::init(false), cl::Hidden);
68static cl::opt<unsigned>
69PartialRegUpdateClearance("partial-reg-update-clearance",
70 cl::desc("Clearance between two register writes "
71 "for inserting XOR to avoid partial "
72 "register update"),
73 cl::init(64), cl::Hidden);
74static cl::opt<unsigned>
75UndefRegClearance("undef-reg-clearance",
76 cl::desc("How many idle instructions we would like before "
77 "certain undef register reads"),
78 cl::init(128), cl::Hidden);
79
80
81// Pin the vtable to this file.
82void X86InstrInfo::anchor() {}
83
84X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
85 : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
86 : X86::ADJCALLSTACKDOWN32),
87 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
88 : X86::ADJCALLSTACKUP32),
89 X86::CATCHRET,
90 (STI.is64Bit() ? X86::RET64 : X86::RET32)),
91 Subtarget(STI), RI(STI.getTargetTriple()) {
92}
93
94bool
95X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
96 Register &SrcReg, Register &DstReg,
97 unsigned &SubIdx) const {
98 switch (MI.getOpcode()) {
99 default: break;
100 case X86::MOVSX16rr8:
101 case X86::MOVZX16rr8:
102 case X86::MOVSX32rr8:
103 case X86::MOVZX32rr8:
104 case X86::MOVSX64rr8:
105 if (!Subtarget.is64Bit())
106 // It's not always legal to reference the low 8-bit of the larger
107 // register in 32-bit mode.
108 return false;
109 [[fallthrough]];
110 case X86::MOVSX32rr16:
111 case X86::MOVZX32rr16:
112 case X86::MOVSX64rr16:
113 case X86::MOVSX64rr32: {
114 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
115 // Be conservative.
116 return false;
117 SrcReg = MI.getOperand(1).getReg();
118 DstReg = MI.getOperand(0).getReg();
119 switch (MI.getOpcode()) {
120 default: llvm_unreachable("Unreachable!");
121 case X86::MOVSX16rr8:
122 case X86::MOVZX16rr8:
123 case X86::MOVSX32rr8:
124 case X86::MOVZX32rr8:
125 case X86::MOVSX64rr8:
126 SubIdx = X86::sub_8bit;
127 break;
128 case X86::MOVSX32rr16:
129 case X86::MOVZX32rr16:
130 case X86::MOVSX64rr16:
131 SubIdx = X86::sub_16bit;
132 break;
133 case X86::MOVSX64rr32:
134 SubIdx = X86::sub_32bit;
135 break;
136 }
137 return true;
138 }
139 }
140 return false;
141}
142
143bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
144 if (MI.mayLoad() || MI.mayStore())
145 return false;
146
147 // Some target-independent operations that trivially lower to data-invariant
148 // instructions.
149 if (MI.isCopyLike() || MI.isInsertSubreg())
150 return true;
151
152 unsigned Opcode = MI.getOpcode();
153 using namespace X86;
154 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
155 // However, they set flags and are perhaps the most surprisingly constant
156 // time operations so we call them out here separately.
157 if (isIMUL(Opcode))
158 return true;
159 // Bit scanning and counting instructions that are somewhat surprisingly
160 // constant time as they scan across bits and do other fairly complex
161 // operations like popcnt, but are believed to be constant time on x86.
162 // However, these set flags.
163 if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
164 isTZCNT(Opcode))
165 return true;
166 // Bit manipulation instructions are effectively combinations of basic
167 // arithmetic ops, and should still execute in constant time. These also
168 // set flags.
169 if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
170 isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
171 isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
172 isTZMSK(Opcode))
173 return true;
174 // Bit extracting and clearing instructions should execute in constant time,
175 // and set flags.
176 if (isBEXTR(Opcode) || isBZHI(Opcode))
177 return true;
178 // Shift and rotate.
179 if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
180 isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
181 return true;
182 // Basic arithmetic is constant time on the input but does set flags.
183 if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
184 isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
185 return true;
186 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
187 if (isADCX(Opcode) || isADOX(Opcode) || isANDN(Opcode))
188 return true;
189 // Unary arithmetic operations.
190 if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
191 return true;
192 // Unlike other arithmetic, NOT doesn't set EFLAGS.
193 if (isNOT(Opcode))
194 return true;
195 // Various move instructions used to zero or sign extend things. Note that we
196 // intentionally don't support the _NOREX variants as we can't handle that
197 // register constraint anyways.
198 if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
199 return true;
200 // Arithmetic instructions that are both constant time and don't set flags.
201 if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
202 return true;
203 // LEA doesn't actually access memory, and its arithmetic is constant time.
204 if (isLEA(Opcode))
205 return true;
206 // By default, assume that the instruction is not data invariant.
207 return false;
208}
209
210bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
211 switch (MI.getOpcode()) {
212 default:
213 // By default, assume that the load will immediately leak.
214 return false;
215
216 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
217 // However, they set flags and are perhaps the most surprisingly constant
218 // time operations so we call them out here separately.
219 case X86::IMUL16rm:
220 case X86::IMUL16rmi8:
221 case X86::IMUL16rmi:
222 case X86::IMUL32rm:
223 case X86::IMUL32rmi8:
224 case X86::IMUL32rmi:
225 case X86::IMUL64rm:
226 case X86::IMUL64rmi32:
227 case X86::IMUL64rmi8:
228
229 // Bit scanning and counting instructions that are somewhat surprisingly
230 // constant time as they scan across bits and do other fairly complex
231 // operations like popcnt, but are believed to be constant time on x86.
232 // However, these set flags.
233 case X86::BSF16rm:
234 case X86::BSF32rm:
235 case X86::BSF64rm:
236 case X86::BSR16rm:
237 case X86::BSR32rm:
238 case X86::BSR64rm:
239 case X86::LZCNT16rm:
240 case X86::LZCNT32rm:
241 case X86::LZCNT64rm:
242 case X86::POPCNT16rm:
243 case X86::POPCNT32rm:
244 case X86::POPCNT64rm:
245 case X86::TZCNT16rm:
246 case X86::TZCNT32rm:
247 case X86::TZCNT64rm:
248
249 // Bit manipulation instructions are effectively combinations of basic
250 // arithmetic ops, and should still execute in constant time. These also
251 // set flags.
252 case X86::BLCFILL32rm:
253 case X86::BLCFILL64rm:
254 case X86::BLCI32rm:
255 case X86::BLCI64rm:
256 case X86::BLCIC32rm:
257 case X86::BLCIC64rm:
258 case X86::BLCMSK32rm:
259 case X86::BLCMSK64rm:
260 case X86::BLCS32rm:
261 case X86::BLCS64rm:
262 case X86::BLSFILL32rm:
263 case X86::BLSFILL64rm:
264 case X86::BLSI32rm:
265 case X86::BLSI64rm:
266 case X86::BLSIC32rm:
267 case X86::BLSIC64rm:
268 case X86::BLSMSK32rm:
269 case X86::BLSMSK64rm:
270 case X86::BLSR32rm:
271 case X86::BLSR64rm:
272 case X86::TZMSK32rm:
273 case X86::TZMSK64rm:
274
275 // Bit extracting and clearing instructions should execute in constant time,
276 // and set flags.
277 case X86::BEXTR32rm:
278 case X86::BEXTR64rm:
279 case X86::BEXTRI32mi:
280 case X86::BEXTRI64mi:
281 case X86::BZHI32rm:
282 case X86::BZHI64rm:
283
284 // Basic arithmetic is constant time on the input but does set flags.
285 case X86::ADC8rm:
286 case X86::ADC16rm:
287 case X86::ADC32rm:
288 case X86::ADC64rm:
289 case X86::ADCX32rm:
290 case X86::ADCX64rm:
291 case X86::ADD8rm:
292 case X86::ADD16rm:
293 case X86::ADD32rm:
294 case X86::ADD64rm:
295 case X86::ADOX32rm:
296 case X86::ADOX64rm:
297 case X86::AND8rm:
298 case X86::AND16rm:
299 case X86::AND32rm:
300 case X86::AND64rm:
301 case X86::ANDN32rm:
302 case X86::ANDN64rm:
303 case X86::OR8rm:
304 case X86::OR16rm:
305 case X86::OR32rm:
306 case X86::OR64rm:
307 case X86::SBB8rm:
308 case X86::SBB16rm:
309 case X86::SBB32rm:
310 case X86::SBB64rm:
311 case X86::SUB8rm:
312 case X86::SUB16rm:
313 case X86::SUB32rm:
314 case X86::SUB64rm:
315 case X86::XOR8rm:
316 case X86::XOR16rm:
317 case X86::XOR32rm:
318 case X86::XOR64rm:
319
320 // Integer multiply w/o affecting flags is still believed to be constant
321 // time on x86. Called out separately as this is among the most surprising
322 // instructions to exhibit that behavior.
323 case X86::MULX32rm:
324 case X86::MULX64rm:
325
326 // Arithmetic instructions that are both constant time and don't set flags.
327 case X86::RORX32mi:
328 case X86::RORX64mi:
329 case X86::SARX32rm:
330 case X86::SARX64rm:
331 case X86::SHLX32rm:
332 case X86::SHLX64rm:
333 case X86::SHRX32rm:
334 case X86::SHRX64rm:
335
336 // Conversions are believed to be constant time and don't set flags.
337 case X86::CVTTSD2SI64rm:
338 case X86::VCVTTSD2SI64rm:
339 case X86::VCVTTSD2SI64Zrm:
340 case X86::CVTTSD2SIrm:
341 case X86::VCVTTSD2SIrm:
342 case X86::VCVTTSD2SIZrm:
343 case X86::CVTTSS2SI64rm:
344 case X86::VCVTTSS2SI64rm:
345 case X86::VCVTTSS2SI64Zrm:
346 case X86::CVTTSS2SIrm:
347 case X86::VCVTTSS2SIrm:
348 case X86::VCVTTSS2SIZrm:
349 case X86::CVTSI2SDrm:
350 case X86::VCVTSI2SDrm:
351 case X86::VCVTSI2SDZrm:
352 case X86::CVTSI2SSrm:
353 case X86::VCVTSI2SSrm:
354 case X86::VCVTSI2SSZrm:
355 case X86::CVTSI642SDrm:
356 case X86::VCVTSI642SDrm:
357 case X86::VCVTSI642SDZrm:
358 case X86::CVTSI642SSrm:
359 case X86::VCVTSI642SSrm:
360 case X86::VCVTSI642SSZrm:
361 case X86::CVTSS2SDrm:
362 case X86::VCVTSS2SDrm:
363 case X86::VCVTSS2SDZrm:
364 case X86::CVTSD2SSrm:
365 case X86::VCVTSD2SSrm:
366 case X86::VCVTSD2SSZrm:
367 // AVX512 added unsigned integer conversions.
368 case X86::VCVTTSD2USI64Zrm:
369 case X86::VCVTTSD2USIZrm:
370 case X86::VCVTTSS2USI64Zrm:
371 case X86::VCVTTSS2USIZrm:
372 case X86::VCVTUSI2SDZrm:
373 case X86::VCVTUSI642SDZrm:
374 case X86::VCVTUSI2SSZrm:
375 case X86::VCVTUSI642SSZrm:
376
377 // Loads to register don't set flags.
378 case X86::MOV8rm:
379 case X86::MOV8rm_NOREX:
380 case X86::MOV16rm:
381 case X86::MOV32rm:
382 case X86::MOV64rm:
383 case X86::MOVSX16rm8:
384 case X86::MOVSX32rm16:
385 case X86::MOVSX32rm8:
386 case X86::MOVSX32rm8_NOREX:
387 case X86::MOVSX64rm16:
388 case X86::MOVSX64rm32:
389 case X86::MOVSX64rm8:
390 case X86::MOVZX16rm8:
391 case X86::MOVZX32rm16:
392 case X86::MOVZX32rm8:
393 case X86::MOVZX32rm8_NOREX:
394 case X86::MOVZX64rm16:
395 case X86::MOVZX64rm8:
396 return true;
397 }
398}
399
400int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
401 const MachineFunction *MF = MI.getParent()->getParent();
402 const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
403
404 if (isFrameInstr(MI)) {
405 int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
406 SPAdj -= getFrameAdjustment(MI);
407 if (!isFrameSetup(MI))
408 SPAdj = -SPAdj;
409 return SPAdj;
410 }
411
412 // To know whether a call adjusts the stack, we need information
413 // that is bound to the following ADJCALLSTACKUP pseudo.
414 // Look for the next ADJCALLSTACKUP that follows the call.
415 if (MI.isCall()) {
416 const MachineBasicBlock *MBB = MI.getParent();
417 auto I = ++MachineBasicBlock::const_iterator(MI);
418 for (auto E = MBB->end(); I != E; ++I) {
419 if (I->getOpcode() == getCallFrameDestroyOpcode() ||
420 I->isCall())
421 break;
422 }
423
424 // If we could not find a frame destroy opcode, then it has already
425 // been simplified, so we don't care.
426 if (I->getOpcode() != getCallFrameDestroyOpcode())
427 return 0;
428
429 return -(I->getOperand(1).getImm());
430 }
431
432 // Currently handle only PUSHes we can reasonably expect to see
433 // in call sequences
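 // (e.g. a 'pushl %eax' used to pass an argument adjusts the stack by 4
 // bytes and a 'pushq %rdi' by 8, which is what the values below reflect).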
434 switch (MI.getOpcode()) {
435 default:
436 return 0;
437 case X86::PUSH32i8:
438 case X86::PUSH32r:
439 case X86::PUSH32rmm:
440 case X86::PUSH32rmr:
441 case X86::PUSHi32:
442 return 4;
443 case X86::PUSH64i8:
444 case X86::PUSH64r:
445 case X86::PUSH64rmm:
446 case X86::PUSH64rmr:
447 case X86::PUSH64i32:
448 return 8;
449 }
450}
451
452/// Return true and the FrameIndex if the specified
453/// operand and the operands that follow it form a reference to the stack frame.
454bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
455 int &FrameIndex) const {
456 if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
457 MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
458 MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
459 MI.getOperand(Op + X86::AddrDisp).isImm() &&
460 MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
461 MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
462 MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
463 FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
464 return true;
465 }
466 return false;
467}
468
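/// Return true if Opcode is a plain register load that isLoadFromStackSlot
/// can recognize, and report the size of the memory access in MemBytes.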
469static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
470 switch (Opcode) {
471 default:
472 return false;
473 case X86::MOV8rm:
474 case X86::KMOVBkm:
475 MemBytes = 1;
476 return true;
477 case X86::MOV16rm:
478 case X86::KMOVWkm:
479 case X86::VMOVSHZrm:
480 case X86::VMOVSHZrm_alt:
481 MemBytes = 2;
482 return true;
483 case X86::MOV32rm:
484 case X86::MOVSSrm:
485 case X86::MOVSSrm_alt:
486 case X86::VMOVSSrm:
487 case X86::VMOVSSrm_alt:
488 case X86::VMOVSSZrm:
489 case X86::VMOVSSZrm_alt:
490 case X86::KMOVDkm:
491 MemBytes = 4;
492 return true;
493 case X86::MOV64rm:
494 case X86::LD_Fp64m:
495 case X86::MOVSDrm:
496 case X86::MOVSDrm_alt:
497 case X86::VMOVSDrm:
498 case X86::VMOVSDrm_alt:
499 case X86::VMOVSDZrm:
500 case X86::VMOVSDZrm_alt:
501 case X86::MMX_MOVD64rm:
502 case X86::MMX_MOVQ64rm:
503 case X86::KMOVQkm:
504 MemBytes = 8;
505 return true;
506 case X86::MOVAPSrm:
507 case X86::MOVUPSrm:
508 case X86::MOVAPDrm:
509 case X86::MOVUPDrm:
510 case X86::MOVDQArm:
511 case X86::MOVDQUrm:
512 case X86::VMOVAPSrm:
513 case X86::VMOVUPSrm:
514 case X86::VMOVAPDrm:
515 case X86::VMOVUPDrm:
516 case X86::VMOVDQArm:
517 case X86::VMOVDQUrm:
518 case X86::VMOVAPSZ128rm:
519 case X86::VMOVUPSZ128rm:
520 case X86::VMOVAPSZ128rm_NOVLX:
521 case X86::VMOVUPSZ128rm_NOVLX:
522 case X86::VMOVAPDZ128rm:
523 case X86::VMOVUPDZ128rm:
524 case X86::VMOVDQU8Z128rm:
525 case X86::VMOVDQU16Z128rm:
526 case X86::VMOVDQA32Z128rm:
527 case X86::VMOVDQU32Z128rm:
528 case X86::VMOVDQA64Z128rm:
529 case X86::VMOVDQU64Z128rm:
530 MemBytes = 16;
531 return true;
532 case X86::VMOVAPSYrm:
533 case X86::VMOVUPSYrm:
534 case X86::VMOVAPDYrm:
535 case X86::VMOVUPDYrm:
536 case X86::VMOVDQAYrm:
537 case X86::VMOVDQUYrm:
538 case X86::VMOVAPSZ256rm:
539 case X86::VMOVUPSZ256rm:
540 case X86::VMOVAPSZ256rm_NOVLX:
541 case X86::VMOVUPSZ256rm_NOVLX:
542 case X86::VMOVAPDZ256rm:
543 case X86::VMOVUPDZ256rm:
544 case X86::VMOVDQU8Z256rm:
545 case X86::VMOVDQU16Z256rm:
546 case X86::VMOVDQA32Z256rm:
547 case X86::VMOVDQU32Z256rm:
548 case X86::VMOVDQA64Z256rm:
549 case X86::VMOVDQU64Z256rm:
550 MemBytes = 32;
551 return true;
552 case X86::VMOVAPSZrm:
553 case X86::VMOVUPSZrm:
554 case X86::VMOVAPDZrm:
555 case X86::VMOVUPDZrm:
556 case X86::VMOVDQU8Zrm:
557 case X86::VMOVDQU16Zrm:
558 case X86::VMOVDQA32Zrm:
559 case X86::VMOVDQU32Zrm:
560 case X86::VMOVDQA64Zrm:
561 case X86::VMOVDQU64Zrm:
562 MemBytes = 64;
563 return true;
564 }
565}
566
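/// Return true if Opcode is a plain register store that isStoreToStackSlot
/// can recognize, and report the size of the memory access in MemBytes.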
567static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
568 switch (Opcode) {
569 default:
570 return false;
571 case X86::MOV8mr:
572 case X86::KMOVBmk:
573 MemBytes = 1;
574 return true;
575 case X86::MOV16mr:
576 case X86::KMOVWmk:
577 case X86::VMOVSHZmr:
578 MemBytes = 2;
579 return true;
580 case X86::MOV32mr:
581 case X86::MOVSSmr:
582 case X86::VMOVSSmr:
583 case X86::VMOVSSZmr:
584 case X86::KMOVDmk:
585 MemBytes = 4;
586 return true;
587 case X86::MOV64mr:
588 case X86::ST_FpP64m:
589 case X86::MOVSDmr:
590 case X86::VMOVSDmr:
591 case X86::VMOVSDZmr:
592 case X86::MMX_MOVD64mr:
593 case X86::MMX_MOVQ64mr:
594 case X86::MMX_MOVNTQmr:
595 case X86::KMOVQmk:
596 MemBytes = 8;
597 return true;
598 case X86::MOVAPSmr:
599 case X86::MOVUPSmr:
600 case X86::MOVAPDmr:
601 case X86::MOVUPDmr:
602 case X86::MOVDQAmr:
603 case X86::MOVDQUmr:
604 case X86::VMOVAPSmr:
605 case X86::VMOVUPSmr:
606 case X86::VMOVAPDmr:
607 case X86::VMOVUPDmr:
608 case X86::VMOVDQAmr:
609 case X86::VMOVDQUmr:
610 case X86::VMOVUPSZ128mr:
611 case X86::VMOVAPSZ128mr:
612 case X86::VMOVUPSZ128mr_NOVLX:
613 case X86::VMOVAPSZ128mr_NOVLX:
614 case X86::VMOVUPDZ128mr:
615 case X86::VMOVAPDZ128mr:
616 case X86::VMOVDQA32Z128mr:
617 case X86::VMOVDQU32Z128mr:
618 case X86::VMOVDQA64Z128mr:
619 case X86::VMOVDQU64Z128mr:
620 case X86::VMOVDQU8Z128mr:
621 case X86::VMOVDQU16Z128mr:
622 MemBytes = 16;
623 return true;
624 case X86::VMOVUPSYmr:
625 case X86::VMOVAPSYmr:
626 case X86::VMOVUPDYmr:
627 case X86::VMOVAPDYmr:
628 case X86::VMOVDQUYmr:
629 case X86::VMOVDQAYmr:
630 case X86::VMOVUPSZ256mr:
631 case X86::VMOVAPSZ256mr:
632 case X86::VMOVUPSZ256mr_NOVLX:
633 case X86::VMOVAPSZ256mr_NOVLX:
634 case X86::VMOVUPDZ256mr:
635 case X86::VMOVAPDZ256mr:
636 case X86::VMOVDQU8Z256mr:
637 case X86::VMOVDQU16Z256mr:
638 case X86::VMOVDQA32Z256mr:
639 case X86::VMOVDQU32Z256mr:
640 case X86::VMOVDQA64Z256mr:
641 case X86::VMOVDQU64Z256mr:
642 MemBytes = 32;
643 return true;
644 case X86::VMOVUPSZmr:
645 case X86::VMOVAPSZmr:
646 case X86::VMOVUPDZmr:
647 case X86::VMOVAPDZmr:
648 case X86::VMOVDQU8Zmr:
649 case X86::VMOVDQU16Zmr:
650 case X86::VMOVDQA32Zmr:
651 case X86::VMOVDQU32Zmr:
652 case X86::VMOVDQA64Zmr:
653 case X86::VMOVDQU64Zmr:
654 MemBytes = 64;
655 return true;
656 }
657 return false;
658}
659
660unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
661 int &FrameIndex) const {
662 unsigned Dummy;
663 return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
664}
665
666unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
667 int &FrameIndex,
668 unsigned &MemBytes) const {
669 if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
670 if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
671 return MI.getOperand(0).getReg();
672 return 0;
673}
674
675unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
676 int &FrameIndex) const {
677 unsigned Dummy;
678 if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
679 unsigned Reg;
680 if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
681 return Reg;
682 // Check for post-frame index elimination operations
683 SmallVector<const MachineMemOperand *, 1> Accesses;
684 if (hasLoadFromStackSlot(MI, Accesses)) {
685 FrameIndex =
686 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
687 ->getFrameIndex();
688 return MI.getOperand(0).getReg();
689 }
690 }
691 return 0;
692}
693
694unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
695 int &FrameIndex) const {
696 unsigned Dummy;
697 return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
698}
699
700unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
701 int &FrameIndex,
702 unsigned &MemBytes) const {
703 if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
704 if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
705 isFrameOperand(MI, 0, FrameIndex))
706 return MI.getOperand(X86::AddrNumOperands).getReg();
707 return 0;
708}
709
710unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
711 int &FrameIndex) const {
712 unsigned Dummy;
713 if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
714 unsigned Reg;
715 if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
716 return Reg;
717 // Check for post-frame index elimination operations
718 SmallVector<const MachineMemOperand *, 1> Accesses;
719 if (hasStoreToStackSlot(MI, Accesses)) {
720 FrameIndex =
721 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
722 ->getFrameIndex();
723 return MI.getOperand(X86::AddrNumOperands).getReg();
724 }
725 }
726 return 0;
727}
728
729/// Return true if register is a PIC base, i.e. defined by X86::MOVPC32r.
730static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
731 // Don't waste compile time scanning use-def chains of physregs.
732 if (!BaseReg.isVirtual())
733 return false;
734 bool isPICBase = false;
735 for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
736 E = MRI.def_instr_end(); I != E; ++I) {
737 MachineInstr *DefMI = &*I;
738 if (DefMI->getOpcode() != X86::MOVPC32r)
739 return false;
740 assert(!isPICBase && "More than one PIC base?");
741 isPICBase = true;
742 }
743 return isPICBase;
744}
745
746bool X86InstrInfo::isReallyTriviallyReMaterializable(
747 const MachineInstr &MI) const {
748 switch (MI.getOpcode()) {
749 default:
750 // This function should only be called for opcodes with the ReMaterializable
751 // flag set.
752 llvm_unreachable("Unknown rematerializable operation!");
753 break;
754
755 case X86::LOAD_STACK_GUARD:
756 case X86::AVX1_SETALLONES:
757 case X86::AVX2_SETALLONES:
758 case X86::AVX512_128_SET0:
759 case X86::AVX512_256_SET0:
760 case X86::AVX512_512_SET0:
761 case X86::AVX512_512_SETALLONES:
762 case X86::AVX512_FsFLD0SD:
763 case X86::AVX512_FsFLD0SH:
764 case X86::AVX512_FsFLD0SS:
765 case X86::AVX512_FsFLD0F128:
766 case X86::AVX_SET0:
767 case X86::FsFLD0SD:
768 case X86::FsFLD0SS:
769 case X86::FsFLD0SH:
770 case X86::FsFLD0F128:
771 case X86::KSET0D:
772 case X86::KSET0Q:
773 case X86::KSET0W:
774 case X86::KSET1D:
775 case X86::KSET1Q:
776 case X86::KSET1W:
777 case X86::MMX_SET0:
778 case X86::MOV32ImmSExti8:
779 case X86::MOV32r0:
780 case X86::MOV32r1:
781 case X86::MOV32r_1:
782 case X86::MOV32ri64:
783 case X86::MOV64ImmSExti8:
784 case X86::V_SET0:
785 case X86::V_SETALLONES:
786 case X86::MOV16ri:
787 case X86::MOV32ri:
788 case X86::MOV64ri:
789 case X86::MOV64ri32:
790 case X86::MOV8ri:
791 case X86::PTILEZEROV:
792 return true;
793
794 case X86::MOV8rm:
795 case X86::MOV8rm_NOREX:
796 case X86::MOV16rm:
797 case X86::MOV32rm:
798 case X86::MOV64rm:
799 case X86::MOVSSrm:
800 case X86::MOVSSrm_alt:
801 case X86::MOVSDrm:
802 case X86::MOVSDrm_alt:
803 case X86::MOVAPSrm:
804 case X86::MOVUPSrm:
805 case X86::MOVAPDrm:
806 case X86::MOVUPDrm:
807 case X86::MOVDQArm:
808 case X86::MOVDQUrm:
809 case X86::VMOVSSrm:
810 case X86::VMOVSSrm_alt:
811 case X86::VMOVSDrm:
812 case X86::VMOVSDrm_alt:
813 case X86::VMOVAPSrm:
814 case X86::VMOVUPSrm:
815 case X86::VMOVAPDrm:
816 case X86::VMOVUPDrm:
817 case X86::VMOVDQArm:
818 case X86::VMOVDQUrm:
819 case X86::VMOVAPSYrm:
820 case X86::VMOVUPSYrm:
821 case X86::VMOVAPDYrm:
822 case X86::VMOVUPDYrm:
823 case X86::VMOVDQAYrm:
824 case X86::VMOVDQUYrm:
825 case X86::MMX_MOVD64rm:
826 case X86::MMX_MOVQ64rm:
827 // AVX-512
828 case X86::VMOVSSZrm:
829 case X86::VMOVSSZrm_alt:
830 case X86::VMOVSDZrm:
831 case X86::VMOVSDZrm_alt:
832 case X86::VMOVSHZrm:
833 case X86::VMOVSHZrm_alt:
834 case X86::VMOVAPDZ128rm:
835 case X86::VMOVAPDZ256rm:
836 case X86::VMOVAPDZrm:
837 case X86::VMOVAPSZ128rm:
838 case X86::VMOVAPSZ256rm:
839 case X86::VMOVAPSZ128rm_NOVLX:
840 case X86::VMOVAPSZ256rm_NOVLX:
841 case X86::VMOVAPSZrm:
842 case X86::VMOVDQA32Z128rm:
843 case X86::VMOVDQA32Z256rm:
844 case X86::VMOVDQA32Zrm:
845 case X86::VMOVDQA64Z128rm:
846 case X86::VMOVDQA64Z256rm:
847 case X86::VMOVDQA64Zrm:
848 case X86::VMOVDQU16Z128rm:
849 case X86::VMOVDQU16Z256rm:
850 case X86::VMOVDQU16Zrm:
851 case X86::VMOVDQU32Z128rm:
852 case X86::VMOVDQU32Z256rm:
853 case X86::VMOVDQU32Zrm:
854 case X86::VMOVDQU64Z128rm:
855 case X86::VMOVDQU64Z256rm:
856 case X86::VMOVDQU64Zrm:
857 case X86::VMOVDQU8Z128rm:
858 case X86::VMOVDQU8Z256rm:
859 case X86::VMOVDQU8Zrm:
860 case X86::VMOVUPDZ128rm:
861 case X86::VMOVUPDZ256rm:
862 case X86::VMOVUPDZrm:
863 case X86::VMOVUPSZ128rm:
864 case X86::VMOVUPSZ256rm:
865 case X86::VMOVUPSZ128rm_NOVLX:
866 case X86::VMOVUPSZ256rm_NOVLX:
867 case X86::VMOVUPSZrm: {
868 // Loads from constant pools are trivially rematerializable.
869 if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
870 MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
871 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
872 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
873 MI.isDereferenceableInvariantLoad()) {
874 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
875 if (BaseReg == 0 || BaseReg == X86::RIP)
876 return true;
877 // Allow re-materialization of PIC load.
878 if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
879 return false;
880 const MachineFunction &MF = *MI.getParent()->getParent();
881 const MachineRegisterInfo &MRI = MF.getRegInfo();
882 return regIsPICBase(BaseReg, MRI);
883 }
884 return false;
885 }
886
887 case X86::LEA32r:
888 case X86::LEA64r: {
889 if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
890 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
891 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
892 !MI.getOperand(1 + X86::AddrDisp).isReg()) {
893 // lea fi#, lea GV, etc. are all rematerializable.
894 if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
895 return true;
896 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
897 if (BaseReg == 0)
898 return true;
899 // Allow re-materialization of lea PICBase + x.
900 const MachineFunction &MF = *MI.getParent()->getParent();
901 const MachineRegisterInfo &MRI = MF.getRegInfo();
902 return regIsPICBase(BaseReg, MRI);
903 }
904 return false;
905 }
906 }
907}
908
909void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
910 MachineBasicBlock::iterator I,
911 Register DestReg, unsigned SubIdx,
912 const MachineInstr &Orig,
913 const TargetRegisterInfo &TRI) const {
914 bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
915 if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
916 MachineBasicBlock::LQR_Dead) {
917 // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
918 // effects.
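 // (MOV32r0, for instance, is normally expanded to 'xor reg, reg', which
 // writes EFLAGS; 'mov $0, reg' is longer but leaves the flags untouched.)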
919 int Value;
920 switch (Orig.getOpcode()) {
921 case X86::MOV32r0: Value = 0; break;
922 case X86::MOV32r1: Value = 1; break;
923 case X86::MOV32r_1: Value = -1; break;
924 default:
925 llvm_unreachable("Unexpected instruction!");
926 }
927
928 const DebugLoc &DL = Orig.getDebugLoc();
929 BuildMI(MBB, I, DL, get(X86::MOV32ri))
930 .add(Orig.getOperand(0))
931 .addImm(Value);
932 } else {
933 MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
934 MBB.insert(I, MI);
935 }
936
937 MachineInstr &NewMI = *std::prev(I);
938 NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
939}
940
941/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
942static bool hasLiveCondCodeDef(MachineInstr &MI) {
943 for (const MachineOperand &MO : MI.operands()) {
944 if (MO.isReg() && MO.isDef() &&
945 MO.getReg() == X86::EFLAGS && !MO.isDead()) {
946 return true;
947 }
948 }
949 return false;
950}
951
952/// Return the shift count of a machine operand, truncated to the number of
952/// bits the hardware actually uses (6 with REX.W, 5 without).
953inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
954 unsigned ShiftAmtOperandIdx) {
955 // The shift count is six bits with the REX.W prefix and five bits without.
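 // (This mirrors the hardware: a 64-bit shift by 65 actually shifts by 1.)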
956 unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
957 unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
958 return Imm & ShiftCountMask;
959}
960
961/// Check whether the given shift count can be encoded as the scale
962/// of a LEA instruction (i.e. 1 << ShAmt is 2, 4 or 8).
963inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
964 // Left shift instructions can be transformed into load-effective-address
965 // instructions if we can encode them appropriately.
966 // A LEA instruction utilizes a SIB byte to encode its scale factor.
967 // The SIB.scale field is two bits wide which means that we can encode any
968 // shift amount less than 4.
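 // For example, 'shl $3, %reg' can become 'lea (,%reg,8), %dst' (scale 8),
 // but 'shl $4, %reg' cannot, because SIB scales are limited to 1, 2, 4 and 8.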
969 return ShAmt < 4 && ShAmt > 0;
970}
971
972static bool findRedundantFlagInstr(MachineInstr &CmpInstr,
973 MachineInstr &CmpValDefInstr,
974 const MachineRegisterInfo *MRI,
975 MachineInstr **AndInstr,
976 const TargetRegisterInfo *TRI,
977 bool &NoSignFlag, bool &ClearsOverflowFlag) {
978 if (CmpValDefInstr.getOpcode() != X86::SUBREG_TO_REG)
979 return false;
980
981 if (CmpInstr.getOpcode() != X86::TEST64rr)
982 return false;
983
984 // CmpInstr is a TEST64rr instruction, and `X86InstrInfo::analyzeCompare`
985 // guarantees that it's analyzable only if two registers are identical.
986 assert(
987 (CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
988 "CmpInstr is an analyzable TEST64rr, and `X86InstrInfo::analyzeCompare` "
989 "requires two reg operands are the same.");
990
991 // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
992 // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
993 // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
994 // redundant.
995 assert(
996 (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
997 "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG.");
998
999 // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is typically
1000 // 0.
1001 if (CmpValDefInstr.getOperand(1).getImm() != 0)
1002 return false;
1003
1004 // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
1005 // sub_32bit or sub_xmm.
1006 if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
1007 return false;
1008
1009 MachineInstr *VregDefInstr =
1010 MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
1011
1012 assert(VregDefInstr && "Must have a definition (SSA)");
1013
1014 // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
1015 // to simplify the subsequent analysis.
1016 //
1017 // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
1018 // `CmpValDefInstr.getParent()`, this could be handled.
1019 if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
1020 return false;
1021
1022 if (X86::isAND(VregDefInstr->getOpcode())) {
1023 // Get a sequence of instructions like
1024 // %reg = and* ... // Set EFLAGS
1025 // ... // EFLAGS not changed
1026 // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
1027 // test64rr %extended_reg, %extended_reg, implicit-def $eflags
1028 //
1029 // If subsequent readers use a subset of bits that don't change
1030 // after `and*` instructions, it's likely that the test64rr could
1031 // be optimized away.
1032 for (const MachineInstr &Instr :
1033 make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
1034 MachineBasicBlock::iterator(CmpValDefInstr))) {
1035 // There are instructions between 'VregDefInstr' and
1036 // 'CmpValDefInstr' that modifies EFLAGS.
1037 if (Instr.modifiesRegister(X86::EFLAGS, TRI))
1038 return false;
1039 }
1040
1041 *AndInstr = VregDefInstr;
1042
1043 // AND instruction will essentially update SF and clear OF, so
1044 // NoSignFlag should be false in the sense that SF is modified by `AND`.
1045 //
1046 // However, the implementation artificially sets `NoSignFlag` to true
1047 // to poison the SF bit; that is to say, if SF is looked at later, the
1048 // optimization (to erase TEST64rr) will be disabled.
1049 //
1050 // The reason to poison SF bit is that SF bit value could be different
1051 // in the `AND` and `TEST` operation; signed bit is not known for `AND`,
1052 // and is known to be 0 as a result of `TEST64rr`.
1053 //
1054 // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
1055 // the AND instruction and using the static information to guide peephole
1056 // optimization if possible. For example, it's possible to fold a
1057 // conditional move into a copy if the relevant EFLAG bits could be deduced
1058 // from an immediate operand of and operation.
1059 //
1060 NoSignFlag = true;
1061 // ClearsOverflowFlag is true for AND operation (no surprise).
1062 ClearsOverflowFlag = true;
1063 return true;
1064 }
1065 return false;
1066}
1067
1068bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
1069 unsigned Opc, bool AllowSP, Register &NewSrc,
1070 bool &isKill, MachineOperand &ImplicitOp,
1071 LiveVariables *LV, LiveIntervals *LIS) const {
1072 MachineFunction &MF = *MI.getParent()->getParent();
1073 const TargetRegisterClass *RC;
1074 if (AllowSP) {
1075 RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1076 } else {
1077 RC = Opc != X86::LEA32r ?
1078 &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1079 }
1080 Register SrcReg = Src.getReg();
1081 isKill = MI.killsRegister(SrcReg);
1082
1083 // For both LEA64 and LEA32 the register already has essentially the right
1084 // type (32-bit or 64-bit) we may just need to forbid SP.
1085 if (Opc != X86::LEA64_32r) {
1086 NewSrc = SrcReg;
1087 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1088
1089 if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1090 return false;
1091
1092 return true;
1093 }
1094
1095 // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1096 // another we need to add 64-bit registers to the final MI.
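 // For example, a physical 32-bit source such as EAX is widened to RAX here,
 // because LEA64_32r reads 64-bit address operands while writing a 32-bit
 // result.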
1097 if (SrcReg.isPhysical()) {
1098 ImplicitOp = Src;
1099 ImplicitOp.setImplicit();
1100
1101 NewSrc = getX86SubSuperRegister(SrcReg, 64);
1102 assert(NewSrc.isValid() && "Invalid Operand");
1103 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1104 } else {
1105 // Virtual register of the wrong class; we have to create a temporary
1106 // 64-bit vreg to feed into the LEA.
1107 NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1108 MachineInstr *Copy =
1109 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1110 .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1111 .addReg(SrcReg, getKillRegState(isKill));
1112
1113 // Which is obviously going to be dead after we're done with it.
1114 isKill = true;
1115
1116 if (LV)
1117 LV->replaceKillInstruction(SrcReg, MI, *Copy);
1118
1119 if (LIS) {
1120 SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1121 SlotIndex Idx = LIS->getInstructionIndex(MI);
1122 LiveInterval &LI = LIS->getInterval(SrcReg);
1123 LiveRange::Segment *S = LI.getSegmentContaining(Idx);
1124 if (S->end.getBaseIndex() == Idx)
1125 S->end = CopyIdx.getRegSlot();
1126 }
1127 }
1128
1129 // We've set all the parameters without issue.
1130 return true;
1131}
1132
1133MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1134 MachineInstr &MI,
1135 LiveVariables *LV,
1136 LiveIntervals *LIS,
1137 bool Is8BitOp) const {
1138 // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1139 MachineBasicBlock &MBB = *MI.getParent();
1140 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
1141 assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1142 *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1143 "Unexpected type for LEA transform");
1144
1145 // TODO: For a 32-bit target, we need to adjust the LEA variables with
1146 // something like this:
1147 // Opcode = X86::LEA32r;
1148 // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1149 // OutRegLEA =
1150 // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1151 // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1152 if (!Subtarget.is64Bit())
1153 return nullptr;
1154
1155 unsigned Opcode = X86::LEA64_32r;
1156 Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1157 Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1158 Register InRegLEA2;
1159
1160 // Build and insert into an implicit UNDEF value. This is OK because
1161 // we will be shifting and then extracting the lower 8/16-bits.
1162 // This has the potential to cause partial register stall. e.g.
1163 // movw (%rbp,%rcx,2), %dx
1164 // leal -65(%rdx), %esi
1165 // But testing has shown this *does* help performance in 64-bit mode (at
1166 // least on modern x86 machines).
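 // Conceptually the rewrite below is:
 //   %InRegLEA     = IMPLICIT_DEF                 ; 64-bit scratch register
 //   %InRegLEA.sub = COPY %Src                    ; insert the 8/16-bit value
 //   %OutRegLEA    = LEA64_32r <adjusted operands>
 //   %Dest         = COPY %OutRegLEA.sub          ; extract the 8/16-bit result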
1167 MachineBasicBlock::iterator MBBI = MI.getIterator();
1168 Register Dest = MI.getOperand(0).getReg();
1169 Register Src = MI.getOperand(1).getReg();
1170 Register Src2;
1171 bool IsDead = MI.getOperand(0).isDead();
1172 bool IsKill = MI.getOperand(1).isKill();
1173 unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1174 assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1175 MachineInstr *ImpDef =
1176 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1177 MachineInstr *InsMI =
1178 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1179 .addReg(InRegLEA, RegState::Define, SubReg)
1180 .addReg(Src, getKillRegState(IsKill));
1181 MachineInstr *ImpDef2 = nullptr;
1182 MachineInstr *InsMI2 = nullptr;
1183
1184 MachineInstrBuilder MIB =
1185 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1186 switch (MIOpc) {
1187 default: llvm_unreachable("Unreachable!");
1188 case X86::SHL8ri:
1189 case X86::SHL16ri: {
1190 unsigned ShAmt = MI.getOperand(2).getImm();
1191 MIB.addReg(0)
1192 .addImm(1LL << ShAmt)
1193 .addReg(InRegLEA, RegState::Kill)
1194 .addImm(0)
1195 .addReg(0);
1196 break;
1197 }
1198 case X86::INC8r:
1199 case X86::INC16r:
1200 addRegOffset(MIB, InRegLEA, true, 1);
1201 break;
1202 case X86::DEC8r:
1203 case X86::DEC16r:
1204 addRegOffset(MIB, InRegLEA, true, -1);
1205 break;
1206 case X86::ADD8ri:
1207 case X86::ADD8ri_DB:
1208 case X86::ADD16ri:
1209 case X86::ADD16ri8:
1210 case X86::ADD16ri_DB:
1211 case X86::ADD16ri8_DB:
1212 addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1213 break;
1214 case X86::ADD8rr:
1215 case X86::ADD8rr_DB:
1216 case X86::ADD16rr:
1217 case X86::ADD16rr_DB: {
1218 Src2 = MI.getOperand(2).getReg();
1219 bool IsKill2 = MI.getOperand(2).isKill();
1220 assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1221 if (Src == Src2) {
1222 // ADD8rr/ADD16rr killed %reg1028, %reg1028
1223 // just a single insert_subreg.
1224 addRegReg(MIB, InRegLEA, true, InRegLEA, false);
1225 } else {
1226 if (Subtarget.is64Bit())
1227 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1228 else
1229 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1230 // Build and insert into an implicit UNDEF value. This is OK because
1231 // we will be shifting and then extracting the lower 8/16-bits.
1232 ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
1233 InRegLEA2);
1234 InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1235 .addReg(InRegLEA2, RegState::Define, SubReg)
1236 .addReg(Src2, getKillRegState(IsKill2));
1237 addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
1238 }
1239 if (LV && IsKill2 && InsMI2)
1240 LV->replaceKillInstruction(Src2, MI, *InsMI2);
1241 break;
1242 }
1243 }
1244
1245 MachineInstr *NewMI = MIB;
1246 MachineInstr *ExtMI =
1247 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1248 .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
1249 .addReg(OutRegLEA, RegState::Kill, SubReg);
1250
1251 if (LV) {
1252 // Update live variables.
1253 LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1254 if (InRegLEA2)
1255 LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
1256 LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1257 if (IsKill)
1258 LV->replaceKillInstruction(Src, MI, *InsMI);
1259 if (IsDead)
1260 LV->replaceKillInstruction(Dest, MI, *ExtMI);
1261 }
1262
1263 if (LIS) {
1264 LIS->InsertMachineInstrInMaps(*ImpDef);
1265 SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
1266 if (ImpDef2)
1267 LIS->InsertMachineInstrInMaps(*ImpDef2);
1268 SlotIndex Ins2Idx;
1269 if (InsMI2)
1270 Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
1271 SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1272 SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
1273 LIS->getInterval(InRegLEA);
1274 LIS->getInterval(OutRegLEA);
1275 if (InRegLEA2)
1276 LIS->getInterval(InRegLEA2);
1277
1278 // Move the use of Src up to InsMI.
1279 LiveInterval &SrcLI = LIS->getInterval(Src);
1280 LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
1281 if (SrcSeg->end == NewIdx.getRegSlot())
1282 SrcSeg->end = InsIdx.getRegSlot();
1283
1284 if (InsMI2) {
1285 // Move the use of Src2 up to InsMI2.
1286 LiveInterval &Src2LI = LIS->getInterval(Src2);
1287 LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
1288 if (Src2Seg->end == NewIdx.getRegSlot())
1289 Src2Seg->end = Ins2Idx.getRegSlot();
1290 }
1291
1292 // Move the definition of Dest down to ExtMI.
1293 LiveInterval &DestLI = LIS->getInterval(Dest);
1294 LiveRange::Segment *DestSeg =
1295 DestLI.getSegmentContaining(NewIdx.getRegSlot());
1296 assert(DestSeg->start == NewIdx.getRegSlot() &&
1297 DestSeg->valno->def == NewIdx.getRegSlot());
1298 DestSeg->start = ExtIdx.getRegSlot();
1299 DestSeg->valno->def = ExtIdx.getRegSlot();
1300 }
1301
1302 return ExtMI;
1303}
1304
1305/// This method must be implemented by targets that
1306/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1307/// may be able to convert a two-address instruction into a true
1308/// three-address instruction on demand. This allows the X86 target (for
1309/// example) to convert ADD and SHL instructions into LEA instructions if they
1310/// would require register copies due to two-addressness.
1311///
1312/// This method returns a null pointer if the transformation cannot be
1313/// performed, otherwise it returns the new instruction.
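/// For example, a two-address ADD32rr ties its destination to the first
/// source; rewriting it as 'lea (%src1,%src2), %dst' removes that tie and can
/// save a register copy.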
1314///
1315MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
1316 LiveVariables *LV,
1317 LiveIntervals *LIS) const {
1318 // The following opcodes also set the condition code register(s). Only
1319 // convert them to an equivalent LEA if the condition code register defs
1320 // are dead!
1321 if (hasLiveCondCodeDef(MI))
1322 return nullptr;
1323
1324 MachineFunction &MF = *MI.getParent()->getParent();
1325 // All instructions input are two-addr instructions. Get the known operands.
1326 const MachineOperand &Dest = MI.getOperand(0);
1327 const MachineOperand &Src = MI.getOperand(1);
1328
1329 // Ideally, operations with undef should be folded before we get here, but we
1330 // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1331 // Without this, we have to forward undef state to new register operands to
1332 // avoid machine verifier errors.
1333 if (Src.isUndef())
1334 return nullptr;
1335 if (MI.getNumOperands() > 2)
1336 if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1337 return nullptr;
1338
1339 MachineInstr *NewMI = nullptr;
1340 Register SrcReg, SrcReg2;
1341 bool Is64Bit = Subtarget.is64Bit();
1342
1343 bool Is8BitOp = false;
1344 unsigned NumRegOperands = 2;
1345 unsigned MIOpc = MI.getOpcode();
1346 switch (MIOpc) {
1347 default: llvm_unreachable("Unreachable!");
1348 case X86::SHL64ri: {
1349 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1350 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1351 if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1352
1353 // LEA can't handle RSP.
1354 if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1355 Src.getReg(), &X86::GR64_NOSPRegClass))
1356 return nullptr;
1357
1358 NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1359 .add(Dest)
1360 .addReg(0)
1361 .addImm(1LL << ShAmt)
1362 .add(Src)
1363 .addImm(0)
1364 .addReg(0);
1365 break;
1366 }
1367 case X86::SHL32ri: {
1368 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1369 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1370 if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
1371
1372 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1373
1374 // LEA can't handle ESP.
1375 bool isKill;
1376 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1377 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1378 ImplicitOp, LV, LIS))
1379 return nullptr;
1380
1381 MachineInstrBuilder MIB =
1382 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1383 .add(Dest)
1384 .addReg(0)
1385 .addImm(1LL << ShAmt)
1386 .addReg(SrcReg, getKillRegState(isKill))
1387 .addImm(0)
1388 .addReg(0);
1389 if (ImplicitOp.getReg() != 0)
1390 MIB.add(ImplicitOp);
1391 NewMI = MIB;
1392
1393 // Add kills if classifyLEAReg created a new register.
1394 if (LV && SrcReg != Src.getReg())
1395 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1396 break;
1397 }
1398 case X86::SHL8ri:
1399 Is8BitOp = true;
1400 [[fallthrough]];
1401 case X86::SHL16ri: {
1402 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1403 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1404 if (!isTruncatedShiftCountForLEA(ShAmt))
1405 return nullptr;
1406 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1407 }
1408 case X86::INC64r:
1409 case X86::INC32r: {
1410 assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1411 unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
1412 (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1413 bool isKill;
1414 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1415 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1416 ImplicitOp, LV, LIS))
1417 return nullptr;
1418
1419 MachineInstrBuilder MIB =
1420 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1421 .add(Dest)
1422 .addReg(SrcReg, getKillRegState(isKill));
1423 if (ImplicitOp.getReg() != 0)
1424 MIB.add(ImplicitOp);
1425
1426 NewMI = addOffset(MIB, 1);
1427
1428 // Add kills if classifyLEAReg created a new register.
1429 if (LV && SrcReg != Src.getReg())
1430 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1431 break;
1432 }
1433 case X86::DEC64r:
1434 case X86::DEC32r: {
1435 assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1436 unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
1437 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1438
1439 bool isKill;
1440 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1441 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1442 ImplicitOp, LV, LIS))
1443 return nullptr;
1444
1445 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1446 .add(Dest)
1447 .addReg(SrcReg, getKillRegState(isKill));
1448 if (ImplicitOp.getReg() != 0)
1449 MIB.add(ImplicitOp);
1450
1451 NewMI = addOffset(MIB, -1);
1452
1453 // Add kills if classifyLEAReg created a new register.
1454 if (LV && SrcReg != Src.getReg())
1455 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1456 break;
1457 }
1458 case X86::DEC8r:
1459 case X86::INC8r:
1460 Is8BitOp = true;
1461 [[fallthrough]];
1462 case X86::DEC16r:
1463 case X86::INC16r:
1464 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1465 case X86::ADD64rr:
1466 case X86::ADD64rr_DB:
1467 case X86::ADD32rr:
1468 case X86::ADD32rr_DB: {
1469 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1470 unsigned Opc;
1471 if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1472 Opc = X86::LEA64r;
1473 else
1474 Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1475
1476 const MachineOperand &Src2 = MI.getOperand(2);
1477 bool isKill2;
1478 MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1479 if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2,
1480 ImplicitOp2, LV, LIS))
1481 return nullptr;
1482
1483 bool isKill;
1484 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1485 if (Src.getReg() == Src2.getReg()) {
1486 // Don't call classify LEAReg a second time on the same register, in case
1487 // the first call inserted a COPY from Src2 and marked it as killed.
1488 isKill = isKill2;
1489 SrcReg = SrcReg2;
1490 } else {
1491 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1492 ImplicitOp, LV, LIS))
1493 return nullptr;
1494 }
1495
1496 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1497 if (ImplicitOp.getReg() != 0)
1498 MIB.add(ImplicitOp);
1499 if (ImplicitOp2.getReg() != 0)
1500 MIB.add(ImplicitOp2);
1501
1502 NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1503
1504 // Add kills if classifyLEAReg created a new register.
1505 if (LV) {
1506 if (SrcReg2 != Src2.getReg())
1507 LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
1508 if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
1509 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1510 }
1511 NumRegOperands = 3;
1512 break;
1513 }
1514 case X86::ADD8rr:
1515 case X86::ADD8rr_DB:
1516 Is8BitOp = true;
1517 [[fallthrough]];
1518 case X86::ADD16rr:
1519 case X86::ADD16rr_DB:
1520 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1521 case X86::ADD64ri32:
1522 case X86::ADD64ri8:
1523 case X86::ADD64ri32_DB:
1524 case X86::ADD64ri8_DB:
1525 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1526 NewMI = addOffset(
1527 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1528 MI.getOperand(2));
1529 break;
1530 case X86::ADD32ri:
1531 case X86::ADD32ri8:
1532 case X86::ADD32ri_DB:
1533 case X86::ADD32ri8_DB: {
1534 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1535 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1536
1537 bool isKill;
1538 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1539 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1540 ImplicitOp, LV, LIS))
1541 return nullptr;
1542
1543 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1544 .add(Dest)
1545 .addReg(SrcReg, getKillRegState(isKill));
1546 if (ImplicitOp.getReg() != 0)
1547 MIB.add(ImplicitOp);
1548
1549 NewMI = addOffset(MIB, MI.getOperand(2));
1550
1551 // Add kills if classifyLEAReg created a new register.
1552 if (LV && SrcReg != Src.getReg())
1553 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1554 break;
1555 }
1556 case X86::ADD8ri:
1557 case X86::ADD8ri_DB:
1558 Is8BitOp = true;
1559 [[fallthrough]];
1560 case X86::ADD16ri:
1561 case X86::ADD16ri8:
1562 case X86::ADD16ri_DB:
1563 case X86::ADD16ri8_DB:
1564 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1565 case X86::SUB8ri:
1566 case X86::SUB16ri8:
1567 case X86::SUB16ri:
1568 /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1569 return nullptr;
1570 case X86::SUB32ri8:
1571 case X86::SUB32ri: {
1572 if (!MI.getOperand(2).isImm())
1573 return nullptr;
1574 int64_t Imm = MI.getOperand(2).getImm();
1575 if (!isInt<32>(-Imm))
1576 return nullptr;
1577
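 // A subtract of an immediate becomes an LEA with the negated displacement
 // (e.g. 'sub $8, %eax' -> 'lea -8(%eax), %dst'), hence the isInt<32>(-Imm)
 // check above.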
1578 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1579 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1580
1581 bool isKill;
1582 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1583 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1584 ImplicitOp, LV, LIS))
1585 return nullptr;
1586
1587 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1588 .add(Dest)
1589 .addReg(SrcReg, getKillRegState(isKill));
1590 if (ImplicitOp.getReg() != 0)
1591 MIB.add(ImplicitOp);
1592
1593 NewMI = addOffset(MIB, -Imm);
1594
1595 // Add kills if classifyLEAReg created a new register.
1596 if (LV && SrcReg != Src.getReg())
1597 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1598 break;
1599 }
1600
1601 case X86::SUB64ri8:
1602 case X86::SUB64ri32: {
1603 if (!MI.getOperand(2).isImm())
1604 return nullptr;
1605 int64_t Imm = MI.getOperand(2).getImm();
1606 if (!isInt<32>(-Imm))
1607 return nullptr;
1608
1609 assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1610
1611 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
1612 get(X86::LEA64r)).add(Dest).add(Src);
1613 NewMI = addOffset(MIB, -Imm);
1614 break;
1615 }
1616
1617 case X86::VMOVDQU8Z128rmk:
1618 case X86::VMOVDQU8Z256rmk:
1619 case X86::VMOVDQU8Zrmk:
1620 case X86::VMOVDQU16Z128rmk:
1621 case X86::VMOVDQU16Z256rmk:
1622 case X86::VMOVDQU16Zrmk:
1623 case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
1624 case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
1625 case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
1626 case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
1627 case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
1628 case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
1629 case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
1630 case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
1631 case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
1632 case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
1633 case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
1634 case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
1635 case X86::VBROADCASTSDZ256rmk:
1636 case X86::VBROADCASTSDZrmk:
1637 case X86::VBROADCASTSSZ128rmk:
1638 case X86::VBROADCASTSSZ256rmk:
1639 case X86::VBROADCASTSSZrmk:
1640 case X86::VPBROADCASTDZ128rmk:
1641 case X86::VPBROADCASTDZ256rmk:
1642 case X86::VPBROADCASTDZrmk:
1643 case X86::VPBROADCASTQZ128rmk:
1644 case X86::VPBROADCASTQZ256rmk:
1645 case X86::VPBROADCASTQZrmk: {
1646 unsigned Opc;
1647 switch (MIOpc) {
1648 default: llvm_unreachable("Unreachable!");
1649 case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
1650 case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
1651 case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
1652 case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
1653 case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
1654 case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
1655 case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1656 case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1657 case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1658 case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1659 case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1660 case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1661 case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1662 case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1663 case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1664 case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1665 case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1666 case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1667 case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
1668 case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
1669 case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
1670 case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
1671 case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
1672 case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
1673 case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
1674 case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
1675 case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
1676 case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
1677 case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
1678 case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
1679 case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
1680 case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break;
1681 case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
1682 case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
1683 case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break;
1684 case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
1685 case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
1686 case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break;
1687 case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
1688 case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
1689 case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break;
1690 }
1691
1692 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1693 .add(Dest)
1694 .add(MI.getOperand(2))
1695 .add(Src)
1696 .add(MI.getOperand(3))
1697 .add(MI.getOperand(4))
1698 .add(MI.getOperand(5))
1699 .add(MI.getOperand(6))
1700 .add(MI.getOperand(7));
1701 NumRegOperands = 4;
1702 break;
1703 }
1704
1705 case X86::VMOVDQU8Z128rrk:
1706 case X86::VMOVDQU8Z256rrk:
1707 case X86::VMOVDQU8Zrrk:
1708 case X86::VMOVDQU16Z128rrk:
1709 case X86::VMOVDQU16Z256rrk:
1710 case X86::VMOVDQU16Zrrk:
1711 case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
1712 case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
1713 case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
1714 case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
1715 case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
1716 case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
1717 case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
1718 case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
1719 case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
1720 case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
1721 case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
1722 case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
1723 unsigned Opc;
1724 switch (MIOpc) {
1725 default: llvm_unreachable("Unreachable!");
1726 case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
1727 case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
1728 case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
1729 case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
1730 case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
1731 case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
1732 case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1733 case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1734 case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1735 case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1736 case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1737 case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1738 case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1739 case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1740 case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1741 case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1742 case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1743 case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1744 case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
1745 case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
1746 case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
1747 case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
1748 case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
1749 case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
1750 case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
1751 case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
1752 case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
1753 case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
1754 case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
1755 case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
1756 }
1757
1758 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1759 .add(Dest)
1760 .add(MI.getOperand(2))
1761 .add(Src)
1762 .add(MI.getOperand(3));
1763 NumRegOperands = 4;
1764 break;
1765 }
1766 }
1767
1768 if (!NewMI) return nullptr;
1769
1770 if (LV) { // Update live variables
1771 for (unsigned I = 0; I < NumRegOperands; ++I) {
1772 MachineOperand &Op = MI.getOperand(I);
1773 if (Op.isReg() && (Op.isDead() || Op.isKill()))
1774 LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
1775 }
1776 }
1777
1778 MachineBasicBlock &MBB = *MI.getParent();
1779 MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
1780
1781 if (LIS) {
1782 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1783 if (SrcReg)
1784 LIS->getInterval(SrcReg);
1785 if (SrcReg2)
1786 LIS->getInterval(SrcReg2);
1787 }
1788
1789 return NewMI;
1790}
1791
1792/// This determines which of the three possible cases of a three source commute
1793/// the source indexes correspond to, taking into account any mask operands.
1794/// Commuting a pass-through operand is never allowed; the function asserts if
1795/// the given pair of indexes does not match one of the cases below.
1796/// Case 0 - Possible to commute the first and second operands.
1797/// Case 1 - Possible to commute the first and third operands.
1798/// Case 2 - Possible to commute the second and third operands.
1799static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
1800 unsigned SrcOpIdx2) {
1801 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
1802 if (SrcOpIdx1 > SrcOpIdx2)
1803 std::swap(SrcOpIdx1, SrcOpIdx2);
1804
1805 unsigned Op1 = 1, Op2 = 2, Op3 = 3;
1806 if (X86II::isKMasked(TSFlags)) {
1807 Op2++;
1808 Op3++;
1809 }
1810
1811 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
1812 return 0;
1813 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
1814 return 1;
1815 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
1816 return 2;
1817 llvm_unreachable("Unknown three src commute case.");
1818}
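// For illustration: for a masked VPTERNLOG such as VPTERNLOGDZrrikz
// (operands: dst, src1, k-mask, src2, src3, imm) the vector sources sit at
// operand indexes 1, 3 and 4, so commuting indexes 1 and 4 is case 1 and
// commuting indexes 3 and 4 is case 2.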
1819
1820unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
1821 const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
1822 const X86InstrFMA3Group &FMA3Group) const {
1823
1824 unsigned Opc = MI.getOpcode();
1825
1826 // TODO: Commuting the 1st operand of FMA*_Int requires some additional
1827 // analysis. The commute optimization is legal only if all users of FMA*_Int
1828 // use only the lowest element of the FMA*_Int instruction. Such analysis is
1829 // not implemented yet. So, just return 0 in that case.
1830 // When such analysis becomes available, this will be the right place to
1831 // call it.
1832 assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
1833 "Intrinsic instructions can't commute operand 1");
1834
1835 // Determine which case this commute is or if it can't be done.
1836 unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1837 SrcOpIdx2);
1838 assert(Case < 3 && "Unexpected case number!");
1839
1840 // Define the FMA forms mapping array that helps to map input FMA form
1841 // to output FMA form to preserve the operation semantics after
1842 // commuting the operands.
1843 const unsigned Form132Index = 0;
1844 const unsigned Form213Index = 1;
1845 const unsigned Form231Index = 2;
1846 static const unsigned FormMapping[][3] = {
1847 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
1848 // FMA132 A, C, b; ==> FMA231 C, A, b;
1849 // FMA213 B, A, c; ==> FMA213 A, B, c;
1850 // FMA231 C, A, b; ==> FMA132 A, C, b;
1851 { Form231Index, Form213Index, Form132Index },
1852 // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
1853 // FMA132 A, c, B; ==> FMA132 B, c, A;
1854 // FMA213 B, a, C; ==> FMA231 C, a, B;
1855 // FMA231 C, a, B; ==> FMA213 B, a, C;
1856 { Form132Index, Form231Index, Form213Index },
1857 // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
1858 // FMA132 a, C, B; ==> FMA213 a, B, C;
1859 // FMA213 b, A, C; ==> FMA132 b, C, A;
1860 // FMA231 c, A, B; ==> FMA231 c, B, A;
1861 { Form213Index, Form132Index, Form231Index }
1862 };
1863
1864 unsigned FMAForms[3];
1865 FMAForms[0] = FMA3Group.get132Opcode();
1866 FMAForms[1] = FMA3Group.get213Opcode();
1867 FMAForms[2] = FMA3Group.get231Opcode();
1868
1869 // Everything is ready, just adjust the FMA opcode and return it.
1870 for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
1871 if (Opc == FMAForms[FormIndex])
1872 return FMAForms[FormMapping[Case][FormIndex]];
1873
1874 llvm_unreachable("Illegal FMA3 format");
1875}
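// A concrete instance of the mapping above: VFMADD213PS dst, b, c computes
// dst = b*dst + c. Commuting its second and third sources (case 2) selects
// the 132 form, since VFMADD132PS applied to (dst, c, b) computes
// dst = dst*b + c, the same value with b and c exchanged.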
1876
1877static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
1878 unsigned SrcOpIdx2) {
1879 // Determine which case this commute is or if it can't be done.
1880 unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
1881 SrcOpIdx2);
1882 assert(Case < 3 && "Unexpected case value!");
1883
1884 // For each case we need to swap two pairs of bits in the final immediate.
1885 static const uint8_t SwapMasks[3][4] = {
1886 { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
1887 { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
1888 { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
1889 };
1890
1891 uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
1892 // Clear out the bits we are swapping.
1893 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
1894 SwapMasks[Case][2] | SwapMasks[Case][3]);
1895 // If the immediate had a bit of the pair set, then set the opposite bit.
1896 if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
1897 if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
1898 if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
1899 if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
1900 MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
1901}
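// Worked example: imm 0xCA encodes the bitwise select src1 ? src2 : src3.
// Commuting src1 and src2 (case 0) swaps bits 2/4 and 3/5, turning 0xCA into
// 0xE2, the encoding of src2 ? src1 : src3, i.e. the same operation with the
// first two sources exchanged.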
1902
1903// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
1904// commuted.
1905static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
1906#define VPERM_CASES(Suffix) \
1907 case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
1908 case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
1909 case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
1910 case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
1911 case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
1912 case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
1913 case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
1914 case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
1915 case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
1916 case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
1917 case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
1918 case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
1919
1920#define VPERM_CASES_BROADCAST(Suffix) \
1921 VPERM_CASES(Suffix) \
1922 case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
1923 case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
1924 case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
1925 case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
1926 case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
1927 case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
1928
1929 switch (Opcode) {
1930 default: return false;
1931 VPERM_CASES(B)
1932 VPERM_CASES_BROADCAST(D)
1933 VPERM_CASES_BROADCAST(PD)
1934 VPERM_CASES_BROADCAST(PS)
1935 VPERM_CASES_BROADCAST(Q)
1936 VPERM_CASES(W)
1937 return true;
1938 }
1939#undef VPERM_CASES_BROADCAST
1940#undef VPERM_CASES
1941}
1942
1943// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
1944// from the I opcode to the T opcode and vice versa.
1945static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
1946#define VPERM_CASES(Orig, New) \
1947 case X86::Orig##128rr: return X86::New##128rr; \
1948 case X86::Orig##128rrkz: return X86::New##128rrkz; \
1949 case X86::Orig##128rm: return X86::New##128rm; \
1950 case X86::Orig##128rmkz: return X86::New##128rmkz; \
1951 case X86::Orig##256rr: return X86::New##256rr; \
1952 case X86::Orig##256rrkz: return X86::New##256rrkz; \
1953 case X86::Orig##256rm: return X86::New##256rm; \
1954 case X86::Orig##256rmkz: return X86::New##256rmkz; \
1955 case X86::Orig##rr: return X86::New##rr; \
1956 case X86::Orig##rrkz: return X86::New##rrkz; \
1957 case X86::Orig##rm: return X86::New##rm; \
1958 case X86::Orig##rmkz: return X86::New##rmkz;
1959
1960#define VPERM_CASES_BROADCAST(Orig, New) \
1961 VPERM_CASES(Orig, New) \
1962 case X86::Orig##128rmb: return X86::New##128rmb; \
1963 case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
1964 case X86::Orig##256rmb: return X86::New##256rmb; \
1965 case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
1966 case X86::Orig##rmb: return X86::New##rmb; \
1967 case X86::Orig##rmbkz: return X86::New##rmbkz;
1968
1969 switch (Opcode) {
1970 VPERM_CASES(VPERMI2B, VPERMT2B)
1971 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
1972 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
1973 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
1974 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
1975 VPERM_CASES(VPERMI2W, VPERMT2W)
1976 VPERM_CASES(VPERMT2B, VPERMI2B)
1977 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
1978 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
1979 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
1980 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
1981 VPERM_CASES(VPERMT2W, VPERMI2W)
1982 }
1983
1984 llvm_unreachable("Unreachable!");
1985#undef VPERM_CASES_BROADCAST
1986#undef VPERM_CASES
1987}
1988
1989MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1990 unsigned OpIdx1,
1991 unsigned OpIdx2) const {
1992 auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
1993 if (NewMI)
1994 return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
1995 return MI;
1996 };
1997
1998 switch (MI.getOpcode()) {
1999 case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
2000 case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
2001 case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
2002 case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
2003 case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
2004 case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
2005 unsigned Opc;
2006 unsigned Size;
2007 switch (MI.getOpcode()) {
2008 default: llvm_unreachable("Unreachable!");
2009 case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
2010 case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
2011 case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
2012 case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
2013 case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
2014 case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
2015 }
2016 unsigned Amt = MI.getOperand(3).getImm();
2017 auto &WorkingMI = cloneIfNew(MI);
2018 WorkingMI.setDesc(get(Opc));
2019 WorkingMI.getOperand(3).setImm(Size - Amt);
2020 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2021 OpIdx1, OpIdx2);
2022 }
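  // For example, A = SHLD32rri8 B, C, 5 computes (B << 5) | (C >> 27); the
  // commuted form A = SHRD32rri8 C, B, 27 computes (C >> 27) | (B << 5),
  // which is the same value.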
2023 case X86::PFSUBrr:
2024 case X86::PFSUBRrr: {
2025 // PFSUB x, y: x = x - y
2026 // PFSUBR x, y: x = y - x
2027 unsigned Opc =
2028 (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
2029 auto &WorkingMI = cloneIfNew(MI);
2030 WorkingMI.setDesc(get(Opc));
2031 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2032 OpIdx1, OpIdx2);
2033 }
2034 case X86::BLENDPDrri:
2035 case X86::BLENDPSrri:
2036 case X86::VBLENDPDrri:
2037 case X86::VBLENDPSrri:
2038 // If we're optimizing for size, try to use MOVSD/MOVSS.
2039 if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
2040 unsigned Mask, Opc;
2041 switch (MI.getOpcode()) {
2042 default: llvm_unreachable("Unreachable!");
2043 case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break;
2044 case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break;
2045 case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break;
2046 case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break;
2047 }
2048 if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
2049 auto &WorkingMI = cloneIfNew(MI);
2050 WorkingMI.setDesc(get(Opc));
2051 WorkingMI.removeOperand(3);
2052 return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
2053 /*NewMI=*/false,
2054 OpIdx1, OpIdx2);
2055 }
2056 }
2057 [[fallthrough]];
2058 case X86::PBLENDWrri:
2059 case X86::VBLENDPDYrri:
2060 case X86::VBLENDPSYrri:
2061 case X86::VPBLENDDrri:
2062 case X86::VPBLENDWrri:
2063 case X86::VPBLENDDYrri:
2064 case X86::VPBLENDWYrri:{
2065 int8_t Mask;
2066 switch (MI.getOpcode()) {
2067 default: llvm_unreachable("Unreachable!");
2068 case X86::BLENDPDrri: Mask = (int8_t)0x03; break;
2069 case X86::BLENDPSrri: Mask = (int8_t)0x0F; break;
2070 case X86::PBLENDWrri: Mask = (int8_t)0xFF; break;
2071 case X86::VBLENDPDrri: Mask = (int8_t)0x03; break;
2072 case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break;
2073 case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break;
2074 case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break;
2075 case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break;
2076 case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break;
2077 case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break;
2078 case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break;
2079 }
2080 // Only the least significant bits of Imm are used.
2081 // Using int8_t to ensure it will be sign extended to the int64_t that
2082 // setImm takes in order to match isel behavior.
2083 int8_t Imm = MI.getOperand(3).getImm() & Mask;
2084 auto &WorkingMI = cloneIfNew(MI);
2085 WorkingMI.getOperand(3).setImm(Mask ^ Imm);
2086 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2087 OpIdx1, OpIdx2);
2088 }
2089 case X86::INSERTPSrr:
2090 case X86::VINSERTPSrr:
2091 case X86::VINSERTPSZrr: {
2092 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2093 unsigned ZMask = Imm & 15;
2094 unsigned DstIdx = (Imm >> 4) & 3;
2095 unsigned SrcIdx = (Imm >> 6) & 3;
2096
2097 // We can commute insertps if we zero 2 of the elements, the insertion is
2098 // "inline" and we don't override the insertion with a zero.
2099 if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2100 llvm::popcount(ZMask) == 2) {
2101 unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
2102 assert(AltIdx < 4 && "Illegal insertion index");
2103 unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2104 auto &WorkingMI = cloneIfNew(MI);
2105 WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2106 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2107 OpIdx1, OpIdx2);
2108 }
2109 return nullptr;
2110 }
2111 case X86::MOVSDrr:
2112 case X86::MOVSSrr:
2113 case X86::VMOVSDrr:
2114 case X86::VMOVSSrr:{
2115 // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2116 if (Subtarget.hasSSE41()) {
2117 unsigned Mask, Opc;
2118 switch (MI.getOpcode()) {
2119 default: llvm_unreachable("Unreachable!");
2120 case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
2121 case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
2122 case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
2123 case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
2124 }
2125
2126 auto &WorkingMI = cloneIfNew(MI);
2127 WorkingMI.setDesc(get(Opc));
2128 WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
2129 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2130 OpIdx1, OpIdx2);
2131 }
2132
2133 // Convert to SHUFPD.
2134 assert(MI.getOpcode() == X86::MOVSDrr &&
2135 "Can only commute MOVSDrr without SSE4.1");
2136
2137 auto &WorkingMI = cloneIfNew(MI);
2138 WorkingMI.setDesc(get(X86::SHUFPDrri));
2139 WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
2140 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2141 OpIdx1, OpIdx2);
2142 }
2143 case X86::SHUFPDrri: {
2144 // Commute to MOVSD.
2145 assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2146 auto &WorkingMI = cloneIfNew(MI);
2147 WorkingMI.setDesc(get(X86::MOVSDrr));
2148 WorkingMI.removeOperand(3);
2149 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2150 OpIdx1, OpIdx2);
2151 }
2152 case X86::PCLMULQDQrr:
2153 case X86::VPCLMULQDQrr:
2154 case X86::VPCLMULQDQYrr:
2155 case X86::VPCLMULQDQZrr:
2156 case X86::VPCLMULQDQZ128rr:
2157 case X86::VPCLMULQDQZ256rr: {
2158 // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2159 // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2160 unsigned Imm = MI.getOperand(3).getImm();
2161 unsigned Src1Hi = Imm & 0x01;
2162 unsigned Src2Hi = Imm & 0x10;
2163 auto &WorkingMI = cloneIfNew(MI);
2164 WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2165 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2166 OpIdx1, OpIdx2);
2167 }
2168 case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
2169 case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
2170 case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
2171 case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
2172 case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
2173 case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
2174 case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
2175 case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
2176 case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
2177 case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
2178 case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
2179 case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
2180 case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
2181 case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
2182 case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
2183 case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
2184 case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
2185 case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
2186 case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
2187 case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
2188 case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
2189 case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
2190 case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
2191 case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
2192 // Flip comparison mode immediate (if necessary).
2193 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
2194 Imm = X86::getSwappedVPCMPImm(Imm);
2195 auto &WorkingMI = cloneIfNew(MI);
2196 WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
2197 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2198 OpIdx1, OpIdx2);
2199 }
2200 case X86::VPCOMBri: case X86::VPCOMUBri:
2201 case X86::VPCOMDri: case X86::VPCOMUDri:
2202 case X86::VPCOMQri: case X86::VPCOMUQri:
2203 case X86::VPCOMWri: case X86::VPCOMUWri: {
2204 // Flip comparison mode immediate (if necessary).
2205 unsigned Imm = MI.getOperand(3).getImm() & 0x7;
2206 Imm = X86::getSwappedVPCOMImm(Imm);
2207 auto &WorkingMI = cloneIfNew(MI);
2208 WorkingMI.getOperand(3).setImm(Imm);
2209 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2210 OpIdx1, OpIdx2);
2211 }
2212 case X86::VCMPSDZrr:
2213 case X86::VCMPSSZrr:
2214 case X86::VCMPPDZrri:
2215 case X86::VCMPPSZrri:
2216 case X86::VCMPSHZrr:
2217 case X86::VCMPPHZrri:
2218 case X86::VCMPPHZ128rri:
2219 case X86::VCMPPHZ256rri:
2220 case X86::VCMPPDZ128rri:
2221 case X86::VCMPPSZ128rri:
2222 case X86::VCMPPDZ256rri:
2223 case X86::VCMPPSZ256rri:
2224 case X86::VCMPPDZrrik:
2225 case X86::VCMPPSZrrik:
2226 case X86::VCMPPDZ128rrik:
2227 case X86::VCMPPSZ128rrik:
2228 case X86::VCMPPDZ256rrik:
2229 case X86::VCMPPSZ256rrik: {
2230 unsigned Imm =
2231 MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
2232 Imm = X86::getSwappedVCMPImm(Imm);
2233 auto &WorkingMI = cloneIfNew(MI);
2234 WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
2235 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2236 OpIdx1, OpIdx2);
2237 }
2238 case X86::VPERM2F128rr:
2239 case X86::VPERM2I128rr: {
2240 // Flip permute source immediate.
2241 // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2242 // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2243 int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
2244 auto &WorkingMI = cloneIfNew(MI);
2245 WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
2246 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2247 OpIdx1, OpIdx2);
2248 }
2249 case X86::MOVHLPSrr:
2250 case X86::UNPCKHPDrr:
2251 case X86::VMOVHLPSrr:
2252 case X86::VUNPCKHPDrr:
2253 case X86::VMOVHLPSZrr:
2254 case X86::VUNPCKHPDZ128rr: {
2255 assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2256
2257 unsigned Opc = MI.getOpcode();
2258 switch (Opc) {
2259 default: llvm_unreachable("Unreachable!");
2260 case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
2261 case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
2262 case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break;
2263 case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break;
2264 case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break;
2265 case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break;
2266 }
2267 auto &WorkingMI = cloneIfNew(MI);
2268 WorkingMI.setDesc(get(Opc));
2269 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2270 OpIdx1, OpIdx2);
2271 }
2272 case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: {
2273 auto &WorkingMI = cloneIfNew(MI);
2274 unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2275 X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2276 WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2277 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2278 OpIdx1, OpIdx2);
2279 }
2280 case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2281 case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2282 case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2283 case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2284 case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2285 case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2286 case X86::VPTERNLOGDZrrik:
2287 case X86::VPTERNLOGDZ128rrik:
2288 case X86::VPTERNLOGDZ256rrik:
2289 case X86::VPTERNLOGQZrrik:
2290 case X86::VPTERNLOGQZ128rrik:
2291 case X86::VPTERNLOGQZ256rrik:
2292 case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2293 case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2294 case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2295 case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2296 case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2297 case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2298 case X86::VPTERNLOGDZ128rmbi:
2299 case X86::VPTERNLOGDZ256rmbi:
2300 case X86::VPTERNLOGDZrmbi:
2301 case X86::VPTERNLOGQZ128rmbi:
2302 case X86::VPTERNLOGQZ256rmbi:
2303 case X86::VPTERNLOGQZrmbi:
2304 case X86::VPTERNLOGDZ128rmbikz:
2305 case X86::VPTERNLOGDZ256rmbikz:
2306 case X86::VPTERNLOGDZrmbikz:
2307 case X86::VPTERNLOGQZ128rmbikz:
2308 case X86::VPTERNLOGQZ256rmbikz:
2309 case X86::VPTERNLOGQZrmbikz: {
2310 auto &WorkingMI = cloneIfNew(MI);
2311 commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
2312 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2313 OpIdx1, OpIdx2);
2314 }
2315 default: {
2316 if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
2317 unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
2318 auto &WorkingMI = cloneIfNew(MI);
2319 WorkingMI.setDesc(get(Opc));
2320 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2321 OpIdx1, OpIdx2);
2322 }
2323
2324 const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2325 MI.getDesc().TSFlags);
2326 if (FMA3Group) {
2327 unsigned Opc =
2328 getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
2329 auto &WorkingMI = cloneIfNew(MI);
2330 WorkingMI.setDesc(get(Opc));
2331 return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
2332 OpIdx1, OpIdx2);
2333 }
2334
2335 return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2336 }
2337 }
2338}
2339
2340bool
2341X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2342 unsigned &SrcOpIdx1,
2343 unsigned &SrcOpIdx2,
2344 bool IsIntrinsic) const {
2345 uint64_t TSFlags = MI.getDesc().TSFlags;
2346
2347 unsigned FirstCommutableVecOp = 1;
2348 unsigned LastCommutableVecOp = 3;
2349 unsigned KMaskOp = -1U;
2350 if (X86II::isKMasked(TSFlags)) {
2351 // For k-zero-masked operations it is Ok to commute the first vector
2352 // operand, unless this is an intrinsic instruction.
2353 // For regular k-masked operations a conservative choice is done as the
2354 // elements of the first vector operand, for which the corresponding bit
2355 // in the k-mask operand is set to 0, are copied to the result of the
2356 // instruction.
2357 // TODO/FIXME: The commute still may be legal if it is known that the
2358 // k-mask operand is set to either all ones or all zeroes.
2359 // It is also Ok to commute the 1st operand if all users of MI use only
2360 // the elements enabled by the k-mask operand. For example,
2361 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2362 // : v1[i];
2363 // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2364 // // Ok, to commute v1 in FMADD213PSZrk.
2365
2366 // The k-mask operand has index = 2 for masked and zero-masked operations.
2367 KMaskOp = 2;
2368
2369 // The operand with index = 1 is used as a source for those elements for
2370 // which the corresponding bit in the k-mask is set to 0.
2371 if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2372 FirstCommutableVecOp = 3;
2373
2374 LastCommutableVecOp++;
2375 } else if (IsIntrinsic) {
2376 // Commuting the first operand of an intrinsic instruction isn't possible
2377 // unless we can prove that only the lowest element of the result is used.
2378 FirstCommutableVecOp = 2;
2379 }
2380
2381 if (isMem(MI, LastCommutableVecOp))
2382 LastCommutableVecOp--;
2383
2384 // Only operands between FirstCommutableVecOp and LastCommutableVecOp are
2385 // commutable. Also, the value 'CommuteAnyOperandIndex' is valid here as it
2386 // means that the operand is not specified/fixed.
2387 if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2388 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2389 SrcOpIdx1 == KMaskOp))
2390 return false;
2391 if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2392 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2393 SrcOpIdx2 == KMaskOp))
2394 return false;
2395
2396 // Look for two different register operands assumed to be commutable
2397 // regardless of the FMA opcode. The FMA opcode is adjusted later.
2398 if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2399 SrcOpIdx2 == CommuteAnyOperandIndex) {
2400 unsigned CommutableOpIdx2 = SrcOpIdx2;
2401
2402 // At least one of the operands to be commuted is not specified and
2403 // this method is free to choose appropriate commutable operands.
2404 if (SrcOpIdx1 == SrcOpIdx2)
2405 // Neither operand is fixed. By default set one of the commutable
2406 // operands to the last register operand of the instruction.
2407 CommutableOpIdx2 = LastCommutableVecOp;
2408 else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2409 // Only one of the operands is not fixed.
2410 CommutableOpIdx2 = SrcOpIdx1;
2411
2412 // CommutableOpIdx2 is well defined now. Let's choose another commutable
2413 // operand and assign its index to CommutableOpIdx1.
2414 Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2415
2416 unsigned CommutableOpIdx1;
2417 for (CommutableOpIdx1 = LastCommutableVecOp;
2418 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2419 // Just ignore and skip the k-mask operand.
2420 if (CommutableOpIdx1 == KMaskOp)
2421 continue;
2422
2423 // The commuted operands must have different registers.
2424 // Otherwise, the commute transformation does not change anything and
2425 // is therefore useless.
2426 if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2427 break;
2428 }
2429
2430 // No appropriate commutable operands were found.
2431 if (CommutableOpIdx1 < FirstCommutableVecOp)
2432 return false;
2433
2434 // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
2435 // to return those values.
2436 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2437 CommutableOpIdx1, CommutableOpIdx2))
2438 return false;
2439 }
2440
2441 return true;
2442}
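// For example, for the merge-masked VFMADD213PSZrk mentioned above the
// commutable range ends up being operands [3, 4] (operand 1 is the
// pass-through source and operand 2 is the k-mask), while for a zero-masked
// form such as VFMADD213PSZrkz it is operands 1, 3 and 4.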
2443
2444bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2445 unsigned &SrcOpIdx1,
2446 unsigned &SrcOpIdx2) const {
2447 const MCInstrDesc &Desc = MI.getDesc();
2448 if (!Desc.isCommutable())
2449 return false;
2450
2451 switch (MI.getOpcode()) {
2452 case X86::CMPSDrr:
2453 case X86::CMPSSrr:
2454 case X86::CMPPDrri:
2455 case X86::CMPPSrri:
2456 case X86::VCMPSDrr:
2457 case X86::VCMPSSrr:
2458 case X86::VCMPPDrri:
2459 case X86::VCMPPSrri:
2460 case X86::VCMPPDYrri:
2461 case X86::VCMPPSYrri:
2462 case X86::VCMPSDZrr:
2463 case X86::VCMPSSZrr:
2464 case X86::VCMPPDZrri:
2465 case X86::VCMPPSZrri:
2466 case X86::VCMPSHZrr:
2467 case X86::VCMPPHZrri:
2468 case X86::VCMPPHZ128rri:
2469 case X86::VCMPPHZ256rri:
2470 case X86::VCMPPDZ128rri:
2471 case X86::VCMPPSZ128rri:
2472 case X86::VCMPPDZ256rri:
2473 case X86::VCMPPSZ256rri:
2474 case X86::VCMPPDZrrik:
2475 case X86::VCMPPSZrrik:
2476 case X86::VCMPPDZ128rrik:
2477 case X86::VCMPPSZ128rrik:
2478 case X86::VCMPPDZ256rrik:
2479 case X86::VCMPPSZ256rrik: {
2480 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2481
2482 // Float comparison can be safely commuted for
2483 // Ordered/Unordered/Equal/NotEqual tests
2484 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2485 switch (Imm) {
2486 default:
2487 // EVEX versions can be commuted.
2488 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2489 break;
2490 return false;
2491 case 0x00: // EQUAL
2492 case 0x03: // UNORDERED
2493 case 0x04: // NOT EQUAL
2494 case 0x07: // ORDERED
2495 break;
2496 }
2497
2498 // The indices of the commutable operands are 1 and 2 (or 2 and 3
2499 // when masked).
2500 // Assign them to the returned operand indices here.
2501 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2502 2 + OpOffset);
2503 }
2504 case X86::MOVSSrr:
2505 // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2506 // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2507 // AVX implies sse4.1.
2508 if (Subtarget.hasSSE41())
2509 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2510 return false;
2511 case X86::SHUFPDrri:
2512 // We can commute this to MOVSD.
2513 if (MI.getOperand(3).getImm() == 0x02)
2514 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2515 return false;
2516 case X86::MOVHLPSrr:
2517 case X86::UNPCKHPDrr:
2518 case X86::VMOVHLPSrr:
2519 case X86::VUNPCKHPDrr:
2520 case X86::VMOVHLPSZrr:
2521 case X86::VUNPCKHPDZ128rr:
2522 if (Subtarget.hasSSE2())
2523 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2524 return false;
2525 case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
2526 case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
2527 case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
2528 case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
2529 case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
2530 case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
2531 case X86::VPTERNLOGDZrrik:
2532 case X86::VPTERNLOGDZ128rrik:
2533 case X86::VPTERNLOGDZ256rrik:
2534 case X86::VPTERNLOGQZrrik:
2535 case X86::VPTERNLOGQZ128rrik:
2536 case X86::VPTERNLOGQZ256rrik:
2537 case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
2538 case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
2539 case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
2540 case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
2541 case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
2542 case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
2543 case X86::VPTERNLOGDZ128rmbi:
2544 case X86::VPTERNLOGDZ256rmbi:
2545 case X86::VPTERNLOGDZrmbi:
2546 case X86::VPTERNLOGQZ128rmbi:
2547 case X86::VPTERNLOGQZ256rmbi:
2548 case X86::VPTERNLOGQZrmbi:
2549 case X86::VPTERNLOGDZ128rmbikz:
2550 case X86::VPTERNLOGDZ256rmbikz:
2551 case X86::VPTERNLOGDZrmbikz:
2552 case X86::VPTERNLOGQZ128rmbikz:
2553 case X86::VPTERNLOGQZ256rmbikz:
2554 case X86::VPTERNLOGQZrmbikz:
2555 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2556 case X86::VPDPWSSDYrr:
2557 case X86::VPDPWSSDrr:
2558 case X86::VPDPWSSDSYrr:
2559 case X86::VPDPWSSDSrr:
2560 case X86::VPDPBSSDSrr:
2561 case X86::VPDPBSSDSYrr:
2562 case X86::VPDPBSSDrr:
2563 case X86::VPDPBSSDYrr:
2564 case X86::VPDPBUUDSrr:
2565 case X86::VPDPBUUDSYrr:
2566 case X86::VPDPBUUDrr:
2567 case X86::VPDPBUUDYrr:
2568 case X86::VPDPWSSDZ128r:
2569 case X86::VPDPWSSDZ128rk:
2570 case X86::VPDPWSSDZ128rkz:
2571 case X86::VPDPWSSDZ256r:
2572 case X86::VPDPWSSDZ256rk:
2573 case X86::VPDPWSSDZ256rkz:
2574 case X86::VPDPWSSDZr:
2575 case X86::VPDPWSSDZrk:
2576 case X86::VPDPWSSDZrkz:
2577 case X86::VPDPWSSDSZ128r:
2578 case X86::VPDPWSSDSZ128rk:
2579 case X86::VPDPWSSDSZ128rkz:
2580 case X86::VPDPWSSDSZ256r:
2581 case X86::VPDPWSSDSZ256rk:
2582 case X86::VPDPWSSDSZ256rkz:
2583 case X86::VPDPWSSDSZr:
2584 case X86::VPDPWSSDSZrk:
2585 case X86::VPDPWSSDSZrkz:
2586 case X86::VPMADD52HUQrr:
2587 case X86::VPMADD52HUQYrr:
2588 case X86::VPMADD52HUQZ128r:
2589 case X86::VPMADD52HUQZ128rk:
2590 case X86::VPMADD52HUQZ128rkz:
2591 case X86::VPMADD52HUQZ256r:
2592 case X86::VPMADD52HUQZ256rk:
2593 case X86::VPMADD52HUQZ256rkz:
2594 case X86::VPMADD52HUQZr:
2595 case X86::VPMADD52HUQZrk:
2596 case X86::VPMADD52HUQZrkz:
2597 case X86::VPMADD52LUQrr:
2598 case X86::VPMADD52LUQYrr:
2599 case X86::VPMADD52LUQZ128r:
2600 case X86::VPMADD52LUQZ128rk:
2601 case X86::VPMADD52LUQZ128rkz:
2602 case X86::VPMADD52LUQZ256r:
2603 case X86::VPMADD52LUQZ256rk:
2604 case X86::VPMADD52LUQZ256rkz:
2605 case X86::VPMADD52LUQZr:
2606 case X86::VPMADD52LUQZrk:
2607 case X86::VPMADD52LUQZrkz:
2608 case X86::VFMADDCPHZr:
2609 case X86::VFMADDCPHZrk:
2610 case X86::VFMADDCPHZrkz:
2611 case X86::VFMADDCPHZ128r:
2612 case X86::VFMADDCPHZ128rk:
2613 case X86::VFMADDCPHZ128rkz:
2614 case X86::VFMADDCPHZ256r:
2615 case X86::VFMADDCPHZ256rk:
2616 case X86::VFMADDCPHZ256rkz:
2617 case X86::VFMADDCSHZr:
2618 case X86::VFMADDCSHZrk:
2619 case X86::VFMADDCSHZrkz: {
2620 unsigned CommutableOpIdx1 = 2;
2621 unsigned CommutableOpIdx2 = 3;
2622 if (X86II::isKMasked(Desc.TSFlags)) {
2623 // Skip the mask register.
2624 ++CommutableOpIdx1;
2625 ++CommutableOpIdx2;
2626 }
2627 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2628 CommutableOpIdx1, CommutableOpIdx2))
2629 return false;
2630 if (!MI.getOperand(SrcOpIdx1).isReg() ||
2631 !MI.getOperand(SrcOpIdx2).isReg())
2632 // No idea.
2633 return false;
2634 return true;
2635 }
2636
2637 default:
2638 const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2639 MI.getDesc().TSFlags);
2640 if (FMA3Group)
2641 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
2642 FMA3Group->isIntrinsic());
2643
2644 // Handle masked instructions since we need to skip over the mask input
2645 // and the preserved input.
2646 if (X86II::isKMasked(Desc.TSFlags)) {
2647 // First assume that the first input is the mask operand and skip past it.
2648 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
2649 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
2650 // Check if the first input is tied. If there isn't one then we only
2651 // need to skip the mask operand which we did above.
2652 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
2653 MCOI::TIED_TO) != -1)) {
2654 // If this is zero masking instruction with a tied operand, we need to
2655 // move the first index back to the first input since this must
2656 // be a 3 input instruction and we want the first two non-mask inputs.
2657 // Otherwise this is a 2 input instruction with a preserved input and
2658 // mask, so we need to move the indices to skip one more input.
2659 if (X86II::isKMergeMasked(Desc.TSFlags)) {
2660 ++CommutableOpIdx1;
2661 ++CommutableOpIdx2;
2662 } else {
2663 --CommutableOpIdx1;
2664 }
2665 }
2666
2667 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2668 CommutableOpIdx1, CommutableOpIdx2))
2669 return false;
2670
2671 if (!MI.getOperand(SrcOpIdx1).isReg() ||
2672 !MI.getOperand(SrcOpIdx2).isReg())
2673 // No idea.
2674 return false;
2675 return true;
2676 }
2677
2678 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2679 }
2680 return false;
2681}
2682
2683static bool isConvertibleLEA(MachineInstr *MI) {
2684 unsigned Opcode = MI->getOpcode();
2685 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
2686 Opcode != X86::LEA64_32r)
2687 return false;
2688
2689 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
2690 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
2691 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
2692
2693 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
2694 Scale.getImm() > 1)
2695 return false;
2696
2697 return true;
2698}
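// For example, "lea r3, [r1 + r2]" (scale 1, zero displacement, no segment)
// can later be rewritten as an add, whereas "lea r3, [r1 + 4*r2 + 8]" cannot.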
2699
2700bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
2701 // Currently we're interested in the following sequence only.
2702 // r3 = lea r1, r2
2703 // r5 = add r3, r4
2704 // Both r3 and r4 are killed in the add; we hope the add instruction has the
2705 // operand order
2706 // r5 = add r4, r3
2707 // So later in X86FixupLEAs the lea instruction can be rewritten as add.
2708 unsigned Opcode = MI.getOpcode();
2709 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
2710 return false;
2711
2712 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2713 Register Reg1 = MI.getOperand(1).getReg();
2714 Register Reg2 = MI.getOperand(2).getReg();
2715
2716 // Check if Reg1 comes from LEA in the same MBB.
2717 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
2718 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2719 Commute = true;
2720 return true;
2721 }
2722 }
2723
2724 // Check if Reg2 comes from LEA in the same MBB.
2725 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
2726 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
2727 Commute = false;
2728 return true;
2729 }
2730 }
2731
2732 return false;
2733}
2734
2735int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
2736 unsigned Opcode = MCID.getOpcode();
2737 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode)))
2738 return -1;
2739 // Assume that condition code is always the last use operand.
2740 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
2741 return NumUses - 1;
2742}
2743
2744X86::CondCode X86::getCondFromMI(const MachineInstr &MI) {
2745 const MCInstrDesc &MCID = MI.getDesc();
2746 int CondNo = getCondSrcNoFromDesc(MCID);
2747 if (CondNo < 0)
2748 return X86::COND_INVALID;
2749 CondNo += MCID.getNumDefs();
2750 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
2751}
2752
2753X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
2754 return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
2755 : X86::COND_INVALID;
2756}
2757
2758X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
2759 return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
2760 : X86::COND_INVALID;
2761}
2762
2763X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
2764 return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
2765 : X86::COND_INVALID;
2766}
2767
2768/// Return the inverse of the specified condition,
2769/// e.g. turning COND_E to COND_NE.
2770X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
2771 switch (CC) {
2772 default: llvm_unreachable("Illegal condition code!");
2773 case X86::COND_E: return X86::COND_NE;
2774 case X86::COND_NE: return X86::COND_E;
2775 case X86::COND_L: return X86::COND_GE;
2776 case X86::COND_LE: return X86::COND_G;
2777 case X86::COND_G: return X86::COND_LE;
2778 case X86::COND_GE: return X86::COND_L;
2779 case X86::COND_B: return X86::COND_AE;
2780 case X86::COND_BE: return X86::COND_A;
2781 case X86::COND_A: return X86::COND_BE;
2782 case X86::COND_AE: return X86::COND_B;
2783 case X86::COND_S: return X86::COND_NS;
2784 case X86::COND_NS: return X86::COND_S;
2785 case X86::COND_P: return X86::COND_NP;
2786 case X86::COND_NP: return X86::COND_P;
2787 case X86::COND_O: return X86::COND_NO;
2788 case X86::COND_NO: return X86::COND_O;
2789 case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
2790 case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
2791 }
2792}
2793
2794/// Assuming the flags are set by MI(a,b), return the condition code if we
2795/// modify the instructions such that flags are set by MI(b,a).
2796static X86::CondCode getSwappedCondition(X86::CondCode CC) {
2797 switch (CC) {
2798 default: return X86::COND_INVALID;
2799 case X86::COND_E: return X86::COND_E;
2800 case X86::COND_NE: return X86::COND_NE;
2801 case X86::COND_L: return X86::COND_G;
2802 case X86::COND_LE: return X86::COND_GE;
2803 case X86::COND_G: return X86::COND_L;
2804 case X86::COND_GE: return X86::COND_LE;
2805 case X86::COND_B: return X86::COND_A;
2806 case X86::COND_BE: return X86::COND_AE;
2807 case X86::COND_A: return X86::COND_B;
2808 case X86::COND_AE: return X86::COND_BE;
2809 }
2810}
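// For example, "cmp a, b; jl" and "cmp b, a; jg" take the branch under the
// same conditions, so COND_L maps to COND_G when the compare operands are
// swapped.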
2811
2812std::pair<X86::CondCode, bool>
2813X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
2814 X86::CondCode CC = X86::COND_INVALID;
2815 bool NeedSwap = false;
2816 switch (Predicate) {
2817 default: break;
2818 // Floating-point Predicates
2819 case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
2820 case CmpInst::FCMP_OLT: NeedSwap = true; [[fallthrough]];
2821 case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
2822 case CmpInst::FCMP_OLE: NeedSwap = true; [[fallthrough]];
2823 case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
2824 case CmpInst::FCMP_UGT: NeedSwap = true; [[fallthrough]];
2825 case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
2826 case CmpInst::FCMP_UGE: NeedSwap = true; [[fallthrough]];
2827 case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
2828 case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
2829 case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
2830 case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
2831 case CmpInst::FCMP_OEQ: [[fallthrough]];
2832 case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
2833
2834 // Integer Predicates
2835 case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
2836 case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
2837 case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
2838 case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
2839 case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
2840 case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
2841 case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
2842 case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
2843 case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
2844 case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
2845 }
2846
2847 return std::make_pair(CC, NeedSwap);
2848}
2849
2850/// Return a cmov opcode for the given register size in bytes, and operand type.
2851unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
2852 switch(RegBytes) {
2853 default: llvm_unreachable("Illegal register size!");
2854 case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
2855 case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
2856 case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
2857 }
2858}
2859
2860/// Get the VPCMP immediate for the given condition.
2861unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
2862 switch (CC) {
2863 default: llvm_unreachable("Unexpected SETCC condition");
2864 case ISD::SETNE: return 4;
2865 case ISD::SETEQ: return 0;
2866 case ISD::SETULT:
2867 case ISD::SETLT: return 1;
2868 case ISD::SETUGT:
2869 case ISD::SETGT: return 6;
2870 case ISD::SETUGE:
2871 case ISD::SETGE: return 5;
2872 case ISD::SETULE:
2873 case ISD::SETLE: return 2;
2874 }
2875}
2876
2877/// Get the VPCMP immediate if the operands are swapped.
2878unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
2879 switch (Imm) {
2880 default: llvm_unreachable("Unreachable!");
2881 case 0x01: Imm = 0x06; break; // LT -> NLE
2882 case 0x02: Imm = 0x05; break; // LE -> NLT
2883 case 0x05: Imm = 0x02; break; // NLT -> LE
2884 case 0x06: Imm = 0x01; break; // NLE -> LT
2885 case 0x00: // EQ
2886 case 0x03: // FALSE
2887 case 0x04: // NE
2888 case 0x07: // TRUE
2889 break;
2890 }
2891
2892 return Imm;
2893}
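// For example, VPCMPD with immediate 1 tests a < b; with the operands swapped
// the equivalent test is b > a, i.e. NLE, immediate 6.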
2894
2895/// Get the VPCOM immediate if the operands are swapped.
2896unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
2897 switch (Imm) {
2898 default: llvm_unreachable("Unreachable!");
2899 case 0x00: Imm = 0x02; break; // LT -> GT
2900 case 0x01: Imm = 0x03; break; // LE -> GE
2901 case 0x02: Imm = 0x00; break; // GT -> LT
2902 case 0x03: Imm = 0x01; break; // GE -> LE
2903 case 0x04: // EQ
2904 case 0x05: // NE
2905 case 0x06: // FALSE
2906 case 0x07: // TRUE
2907 break;
2908 }
2909
2910 return Imm;
2911}
2912
2913/// Get the VCMP immediate if the operands are swapped.
2914unsigned X86::getSwappedVCMPImm(unsigned Imm) {
2915 // Only need the lower 2 bits to distinguish.
2916 switch (Imm & 0x3) {
2917 default: llvm_unreachable("Unreachable!");
2918 case 0x00: case 0x03:
2919 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
2920 break;
2921 case 0x01: case 0x02:
2922 // Need to toggle bits 3:0. Bit 4 stays the same.
2923 Imm ^= 0xf;
2924 break;
2925 }
2926
2927 return Imm;
2928}
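// For example, predicate 0x01 (LT_OS) becomes 0x0E (GT_OS) when the operands
// are swapped, and 0x11 (LT_OQ) becomes 0x1E (GT_OQ) since bit 4 is preserved.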
2929
2930/// Return true if Reg is an X87 register.
2931static bool isX87Reg(unsigned Reg) {
2932 return (Reg == X86::FPCW || Reg == X86::FPSW ||
2933 (Reg >= X86::ST0 && Reg <= X86::ST7));
2934}
2935
2936/// Check if the instruction is an X87 instruction.
2937bool X86::isX87Instruction(MachineInstr &MI) {
2938 for (const MachineOperand &MO : MI.operands()) {
2939 if (!MO.isReg())
2940 continue;
2941 if (isX87Reg(MO.getReg()))
2942 return true;
2943 }
2944 return false;
2945}
2946
2947bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
2948 switch (MI.getOpcode()) {
2949 case X86::TCRETURNdi:
2950 case X86::TCRETURNri:
2951 case X86::TCRETURNmi:
2952 case X86::TCRETURNdi64:
2953 case X86::TCRETURNri64:
2954 case X86::TCRETURNmi64:
2955 return true;
2956 default:
2957 return false;
2958 }
2959}
2960
2961bool X86InstrInfo::canMakeTailCallConditional(
2962 SmallVectorImpl<MachineOperand> &BranchCond,
2963 const MachineInstr &TailCall) const {
2964
2965 const MachineFunction *MF = TailCall.getMF();
2966
2967 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
2968 // The kernel patches thunk calls at run time; these should never be conditional.
2969 const MachineOperand &Target = TailCall.getOperand(0);
2970 if (Target.isSymbol()) {
2971 StringRef Symbol(Target.getSymbolName());
2972 // This is currently only relevant to the r11/kernel indirect thunk.
2973 if (Symbol.equals("__x86_indirect_thunk_r11"))
2974 return false;
2975 }
2976 }
2977
2978 if (TailCall.getOpcode() != X86::TCRETURNdi &&
2979 TailCall.getOpcode() != X86::TCRETURNdi64) {
2980 // Only direct calls can be done with a conditional branch.
2981 return false;
2982 }
2983
2984 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
2985 // Conditional tail calls confuse the Win64 unwinder.
2986 return false;
2987 }
2988
2989 assert(BranchCond.size() == 1);
2990 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
2991 // Can't make a conditional tail call with this condition.
2992 return false;
2993 }
2994
2995 const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
2996 if (X86FI->getTCReturnAddrDelta() != 0 ||
2997 TailCall.getOperand(1).getImm() != 0) {
2998 // A conditional tail call cannot do any stack adjustment.
2999 return false;
3000 }
3001
3002 return true;
3003}
3004
3005void X86InstrInfo::replaceBranchWithTailCall(
3006 MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
3007 const MachineInstr &TailCall) const {
3008 assert(canMakeTailCallConditional(BranchCond, TailCall));
3009
3010 MachineBasicBlock::iterator I = MBB.end();
3011 while (I != MBB.begin()) {
3012 --I;
3013 if (I->isDebugInstr())
3014 continue;
3015 if (!I->isBranch())
3016 assert(0 && "Can't find the branch to replace!");
3017
3018 X86::CondCode CC = X86::getCondFromBranch(*I);
3019 assert(BranchCond.size() == 1);
3020 if (CC != BranchCond[0].getImm())
3021 continue;
3022
3023 break;
3024 }
3025
3026 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3027 : X86::TCRETURNdi64cc;
3028
3029 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3030 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3031 MIB.addImm(0); // Stack offset (not used).
3032 MIB->addOperand(BranchCond[0]); // Condition.
3033 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3034
3035 // Add implicit uses and defs of all live regs potentially clobbered by the
3036 // call. This way they still appear live across the call.
3037 LivePhysRegs LiveRegs(getRegisterInfo());
3038 LiveRegs.addLiveOuts(MBB);
3039 SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
3040 LiveRegs.stepForward(*MIB, Clobbers);
3041 for (const auto &C : Clobbers) {
3042 MIB.addReg(C.first, RegState::Implicit);
3043 MIB.addReg(C.first, RegState::Implicit | RegState::Define);
3044 }
3045
3046 I->eraseFromParent();
3047}
3048
3049// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3050// not be a fallthrough MBB now due to layout changes). Return nullptr if the
3051// fallthrough MBB cannot be identified.
3052static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
3053 MachineBasicBlock *TBB) {
3054 // Look for non-EHPad successors other than TBB. If we find exactly one, it
3055 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3056 // and fallthrough MBB. If we find more than one, we cannot identify the
3057 // fallthrough MBB and should return nullptr.
3058 MachineBasicBlock *FallthroughBB = nullptr;
3059 for (MachineBasicBlock *Succ : MBB->successors()) {
3060 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3061 continue;
3062 // Return a nullptr if we found more than one fallthrough successor.
3063 if (FallthroughBB && FallthroughBB != TBB)
3064 return nullptr;
3065 FallthroughBB = Succ;
3066 }
3067 return FallthroughBB;
3068}
3069
3070bool X86InstrInfo::AnalyzeBranchImpl(
3071 MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
3072 SmallVectorImpl<MachineOperand> &Cond,
3073 SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3074
3075 // Start from the bottom of the block and work up, examining the
3076 // terminator instructions.
3077 MachineBasicBlock::iterator I = MBB.end();
3078 MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3079 while (I != MBB.begin()) {
3080 --I;
3081 if (I->isDebugInstr())
3082 continue;
3083
3084 // Working from the bottom, when we see a non-terminator instruction, we're
3085 // done.
3086 if (!isUnpredicatedTerminator(*I))
3087 break;
3088
3089 // A terminator that isn't a branch can't easily be handled by this
3090 // analysis.
3091 if (!I->isBranch())
3092 return true;
3093
3094 // Handle unconditional branches.
3095 if (I->getOpcode() == X86::JMP_1) {
3096 UnCondBrIter = I;
3097
3098 if (!AllowModify) {
3099 TBB = I->getOperand(0).getMBB();
3100 continue;
3101 }
3102
3103 // If the block has any instructions after a JMP, delete them.
3104 MBB.erase(std::next(I), MBB.end());
3105
3106 Cond.clear();
3107 FBB = nullptr;
3108
3109 // Delete the JMP if it's equivalent to a fall-through.
3110 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3111 TBB = nullptr;
3112 I->eraseFromParent();
3113 I = MBB.end();
3114 UnCondBrIter = MBB.end();
3115 continue;
3116 }
3117
3118 // TBB is used to indicate the unconditional destination.
3119 TBB = I->getOperand(0).getMBB();
3120 continue;
3121 }
3122
3123 // Handle conditional branches.
3124 X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3125 if (BranchCode == X86::COND_INVALID)
3126 return true; // Can't handle indirect branch.
3127
3128 // In practice we should never have an undef EFLAGS operand; if we do,
3129 // abort here as we are not prepared to preserve the flag.
3130 if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
3131 return true;
3132
3133 // Working from the bottom, handle the first conditional branch.
3134 if (Cond.empty()) {
3135 FBB = TBB;
3136 TBB = I->getOperand(0).getMBB();
3137 Cond.push_back(MachineOperand::CreateImm(BranchCode));
3138 CondBranches.push_back(&*I);
3139 continue;
3140 }
3141
3142 // Handle subsequent conditional branches. Only handle the case where all
3143 // conditional branches branch to the same destination and their condition
3144 // opcodes fit one of the special multi-branch idioms.
3145 assert(Cond.size() == 1);
3146 assert(TBB);
3147
3148 // If the conditions are the same, we can leave them alone.
3149 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3150 auto NewTBB = I->getOperand(0).getMBB();
3151 if (OldBranchCode == BranchCode && TBB == NewTBB)
3152 continue;
3153
3154 // If they differ, see if they fit one of the known patterns. Theoretically,
3155 // we could handle more patterns here, but we shouldn't expect to see them
3156 // if instruction selection has done a reasonable job.
3157 if (TBB == NewTBB &&
3158 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3159 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3160 BranchCode = X86::COND_NE_OR_P;
3161 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3162 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3163 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3164 return true;
3165
3166 // X86::COND_E_AND_NP usually has two different branch destinations.
3167 //
3168 // JP B1
3169 // JE B2
3170 // JMP B1
3171 // B1:
3172 // B2:
3173 //
3174 // Here this condition branches to B2 only if NP && E. It has another
3175 // equivalent form:
3176 //
3177 // JNE B1
3178 // JNP B2
3179 // JMP B1
3180 // B1:
3181 // B2:
3182 //
3183 // Similarly it branches to B2 only if E && NP. That is why this condition
3184 // is named with COND_E_AND_NP.
3185 BranchCode = X86::COND_E_AND_NP;
3186 } else
3187 return true;
3188
3189 // Update the MachineOperand.
3190 Cond[0].setImm(BranchCode);
3191 CondBranches.push_back(&*I);
3192 }
3193
3194 return false;
3195}
3196
3197bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
3198 MachineBasicBlock *&TBB,
3199 MachineBasicBlock *&FBB,
3200 SmallVectorImpl<MachineOperand> &Cond,
3201 bool AllowModify) const {
3202 SmallVector<MachineInstr *, 4> CondBranches;
3203 return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3204}
3205
3206bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
3207 MachineBranchPredicate &MBP,
3208 bool AllowModify) const {
3209 using namespace std::placeholders;
3210
3211 SmallVector<MachineOperand, 4> Cond;
3212 SmallVector<MachineInstr *, 4> CondBranches;
3213 if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
3214 AllowModify))
3215 return true;
3216
3217 if (Cond.size() != 1)
3218 return true;
3219
3220 assert(MBP.TrueDest && "expected!");
3221
3222 if (!MBP.FalseDest)
3223 MBP.FalseDest = MBB.getNextNode();
3224
3225 const TargetRegisterInfo *TRI = &getRegisterInfo();
3226
3227 MachineInstr *ConditionDef = nullptr;
3228 bool SingleUseCondition = true;
3229
3230 for (MachineInstr &MI : llvm::reverse(MBB)) {
3231 if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
3232 ConditionDef = &MI;
3233 break;
3234 }
3235
3236 if (MI.readsRegister(X86::EFLAGS, TRI))
3237 SingleUseCondition = false;
3238 }
3239
3240 if (!ConditionDef)
3241 return true;
3242
3243 if (SingleUseCondition) {
3244 for (auto *Succ : MBB.successors())
3245 if (Succ->isLiveIn(X86::EFLAGS))
3246 SingleUseCondition = false;
3247 }
3248
3249 MBP.ConditionDef = ConditionDef;
3250 MBP.SingleUseCondition = SingleUseCondition;
3251
3252 // Currently we only recognize the simple pattern:
3253 //
3254 // test %reg, %reg
3255 // je %label
3256 //
3257 const unsigned TestOpcode =
3258 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
3259
3260 if (ConditionDef->getOpcode() == TestOpcode &&
3261 ConditionDef->getNumOperands() == 3 &&
3262 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
3263 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
3264 MBP.LHS = ConditionDef->getOperand(0);
3265 MBP.RHS = MachineOperand::CreateImm(0);
3266 MBP.Predicate = Cond[0].getImm() == X86::COND_NE
3267 ? MachineBranchPredicate::PRED_NE
3268 : MachineBranchPredicate::PRED_EQ;
3269 return false;
3270 }
3271
3272 return true;
3273}
3274
3275unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
3276 int *BytesRemoved) const {
3277 assert(!BytesRemoved && "code size not handled");
3278
3279 MachineBasicBlock::iterator I = MBB.end();
3280 unsigned Count = 0;
3281
3282 while (I != MBB.begin()) {
3283 --I;
3284 if (I->isDebugInstr())
3285 continue;
3286 if (I->getOpcode() != X86::JMP_1 &&
3287 X86::getCondFromBranch(*I) == X86::COND_INVALID)
3288 break;
3289 // Remove the branch.
3290 I->eraseFromParent();
3291 I = MBB.end();
3292 ++Count;
3293 }
3294
3295 return Count;
3296}
3297
3298unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
3299 MachineBasicBlock *TBB,
3300 MachineBasicBlock *FBB,
3301 ArrayRef<MachineOperand> Cond,
3302 const DebugLoc &DL,
3303 int *BytesAdded) const {
3304 // Shouldn't be a fall through.
3305 assert(TBB && "insertBranch must not be told to insert a fallthrough");
3306 assert((Cond.size() == 1 || Cond.size() == 0) &&
3307 "X86 branch conditions have one component!");
3308 assert(!BytesAdded && "code size not handled");
3309
3310 if (Cond.empty()) {
3311 // Unconditional branch?
3312 assert(!FBB && "Unconditional branch with multiple successors!");
3313 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
3314 return 1;
3315 }
3316
3317 // If FBB is null, it is implied to be a fall-through block.
3318 bool FallThru = FBB == nullptr;
3319
3320 // Conditional branch.
3321 unsigned Count = 0;
3322 X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
3323 switch (CC) {
3324 case X86::COND_NE_OR_P:
3325 // Synthesize NE_OR_P with two branches.
3326 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
3327 ++Count;
3328 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
3329 ++Count;
3330 break;
3331 case X86::COND_E_AND_NP:
3332 // Use the next block of MBB as FBB if it is null.
3333 if (FBB == nullptr) {
3334 FBB = getFallThroughMBB(&MBB, TBB);
3335 assert(FBB && "MBB cannot be the last block in function when the false "
3336 "body is a fall-through.");
3337 }
3338 // Synthesize COND_E_AND_NP with two branches.
3339 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
3340 ++Count;
3341 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
3342 ++Count;
3343 break;
3344 default: {
3345 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
3346 ++Count;
3347 }
3348 }
3349 if (!FallThru) {
3350 // Two-way conditional branch. Insert the second branch.
3351 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
3352 ++Count;
3353 }
3354 return Count;
3355}
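// Illustrative sketch (editorial, not part of the LLVM source): for
// Cond = { COND_NE_OR_P }, TBB = %bb.2 and FBB = %bb.3, the code above emits
//   JCC_1 %bb.2, COND_NE
//   JCC_1 %bb.2, COND_P
//   JMP_1 %bb.3
// and returns a Count of 3; the composite condition is synthesized from two
// ordinary conditional jumps plus the unconditional branch to the false block.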
3356
3357bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3358 ArrayRef<MachineOperand> Cond,
3359 Register DstReg, Register TrueReg,
3360 Register FalseReg, int &CondCycles,
3361 int &TrueCycles, int &FalseCycles) const {
3362 // Not all subtargets have cmov instructions.
3363 if (!Subtarget.canUseCMOV())
3364 return false;
3365 if (Cond.size() != 1)
3366 return false;
3367 // We cannot do the composite conditions, at least not in SSA form.
3368 if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
3369 return false;
3370
3371 // Check register classes.
3372 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3373 const TargetRegisterClass *RC =
3374 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
3375 if (!RC)
3376 return false;
3377
3378 // We have cmov instructions for 16-, 32-, and 64-bit general-purpose registers.
3379 if (X86::GR16RegClass.hasSubClassEq(RC) ||
3380 X86::GR32RegClass.hasSubClassEq(RC) ||
3381 X86::GR64RegClass.hasSubClassEq(RC)) {
3382 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
3383 // Bridge. Probably Ivy Bridge as well.
3384 CondCycles = 2;
3385 TrueCycles = 2;
3386 FalseCycles = 2;
3387 return true;
3388 }
3389
3390 // Can't do vectors.
3391 return false;
3392}
3393
3394void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
3395 MachineBasicBlock::iterator I,
3396 const DebugLoc &DL, Register DstReg,
3397 ArrayRef<MachineOperand> Cond, Register TrueReg,
3398 Register FalseReg) const {
3399 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3400 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
3401 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
3402 assert(Cond.size() == 1 && "Invalid Cond array");
3403 unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
3404 false /*HasMemoryOperand*/);
3405 BuildMI(MBB, I, DL, get(Opc), DstReg)
3406 .addReg(FalseReg)
3407 .addReg(TrueReg)
3408 .addImm(Cond[0].getImm());
3409}
3410
3411/// Test if the given register is a physical h register.
3412static bool isHReg(unsigned Reg) {
3413 return X86::GR8_ABCD_HRegClass.contains(Reg);
3414}
3415
3416// Try to copy between VR128/VR64 and GR64 registers.
3417static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
3418 const X86Subtarget &Subtarget) {
3419 bool HasAVX = Subtarget.hasAVX();
3420 bool HasAVX512 = Subtarget.hasAVX512();
3421
3422 // SrcReg(MaskReg) -> DestReg(GR64)
3423 // SrcReg(MaskReg) -> DestReg(GR32)
3424
3425 // All KMASK RegClasses hold the same k registers; they can be tested against any one of them.
3426 if (X86::VK16RegClass.contains(SrcReg)) {
3427 if (X86::GR64RegClass.contains(DestReg)) {
3428 assert(Subtarget.hasBWI());
3429 return X86::KMOVQrk;
3430 }
3431 if (X86::GR32RegClass.contains(DestReg))
3432 return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
3433 }
3434
3435 // SrcReg(GR64) -> DestReg(MaskReg)
3436 // SrcReg(GR32) -> DestReg(MaskReg)
3437
3438 // All KMASK RegClasses hold the same k registers; they can be tested against any one of them.
3439 if (X86::VK16RegClass.contains(DestReg)) {
3440 if (X86::GR64RegClass.contains(SrcReg)) {
3441 assert(Subtarget.hasBWI());
3442 return X86::KMOVQkr;
3443 }
3444 if (X86::GR32RegClass.contains(SrcReg))
3445 return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
3446 }
3447
3448
3449 // SrcReg(VR128) -> DestReg(GR64)
3450 // SrcReg(VR64) -> DestReg(GR64)
3451 // SrcReg(GR64) -> DestReg(VR128)
3452 // SrcReg(GR64) -> DestReg(VR64)
3453
3454 if (X86::GR64RegClass.contains(DestReg)) {
3455 if (X86::VR128XRegClass.contains(SrcReg))
3456 // Copy from a VR128 register to a GR64 register.
3457 return HasAVX512 ? X86::VMOVPQIto64Zrr :
3458 HasAVX ? X86::VMOVPQIto64rr :
3459 X86::MOVPQIto64rr;
3460 if (X86::VR64RegClass.contains(SrcReg))
3461 // Copy from a VR64 register to a GR64 register.
3462 return X86::MMX_MOVD64from64rr;
3463 } else if (X86::GR64RegClass.contains(SrcReg)) {
3464 // Copy from a GR64 register to a VR128 register.
3465 if (X86::VR128XRegClass.contains(DestReg))
3466 return HasAVX512 ? X86::VMOV64toPQIZrr :
3467 HasAVX ? X86::VMOV64toPQIrr :
3468 X86::MOV64toPQIrr;
3469 // Copy from a GR64 register to a VR64 register.
3470 if (X86::VR64RegClass.contains(DestReg))
3471 return X86::MMX_MOVD64to64rr;
3472 }
3473
3474 // SrcReg(VR128) -> DestReg(GR32)
3475 // SrcReg(GR32) -> DestReg(VR128)
3476
3477 if (X86::GR32RegClass.contains(DestReg) &&
3478 X86::VR128XRegClass.contains(SrcReg))
3479 // Copy from a VR128 register to a GR32 register.
3480 return HasAVX512 ? X86::VMOVPDI2DIZrr :
3481 HasAVX ? X86::VMOVPDI2DIrr :
3482 X86::MOVPDI2DIrr;
3483
3484 if (X86::VR128XRegClass.contains(DestReg) &&
3485 X86::GR32RegClass.contains(SrcReg))
3486 // Copy from a GR32 register to a VR128 register.
3487 return HasAVX512 ? X86::VMOVDI2PDIZrr :
3488 HasAVX ? X86::VMOVDI2PDIrr :
3489 X86::MOVDI2PDIrr;
3490 return 0;
3491}
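// Illustrative sketch (editorial, not part of the LLVM source): asking this
// table for a copy from a mask register such as $k1 into $eax returns KMOVDrk
// when BWI is available and KMOVWrk otherwise, while a copy from $rdi into
// $xmm0 returns VMOV64toPQIZrr / VMOV64toPQIrr / MOV64toPQIrr depending on
// AVX512 / AVX / baseline SSE; a register pair the table does not know about
// yields 0 and copyPhysReg below falls through to its error paths.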
3492
3493void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
3494 MachineBasicBlock::iterator MI,
3495 const DebugLoc &DL, MCRegister DestReg,
3496 MCRegister SrcReg, bool KillSrc) const {
3497 // First deal with the normal symmetric copies.
3498 bool HasAVX = Subtarget.hasAVX();
3499 bool HasVLX = Subtarget.hasVLX();
3500 unsigned Opc = 0;
3501 if (X86::GR64RegClass.contains(DestReg, SrcReg))
3502 Opc = X86::MOV64rr;
3503 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
3504 Opc = X86::MOV32rr;
3505 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
3506 Opc = X86::MOV16rr;
3507 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
3508 // Copying to or from a physical H register on x86-64 requires a NOREX
3509 // move. Otherwise use a normal move.
3510 if ((isHReg(DestReg) || isHReg(SrcReg)) &&
3511 Subtarget.is64Bit()) {
3512 Opc = X86::MOV8rr_NOREX;
3513 // Both operands must be encodable without a REX prefix.
3514 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
3515 "8-bit H register can not be copied outside GR8_NOREX");
3516 } else
3517 Opc = X86::MOV8rr;
3518 }
3519 else if (X86::VR64RegClass.contains(DestReg, SrcReg))
3520 Opc = X86::MMX_MOVQ64rr;
3521 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
3522 if (HasVLX)
3523 Opc = X86::VMOVAPSZ128rr;
3524 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
3525 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
3526 else {
3527 // If this is an extended register and we don't have VLX, we need to use a
3528 // 512-bit move.
3529 Opc = X86::VMOVAPSZrr;
3530 const TargetRegisterInfo *TRI = &getRegisterInfo();
3531 DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
3532 &X86::VR512RegClass);
3533 SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
3534 &X86::VR512RegClass);
3535 }
3536 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
3537 if (HasVLX)
3538 Opc = X86::VMOVAPSZ256rr;
3539 else if (X86::VR256RegClass.contains(DestReg, SrcReg))
3540 Opc = X86::VMOVAPSYrr;
3541 else {
3542 // If this is an extended register and we don't have VLX, we need to use a
3543 // 512-bit move.
3544 Opc = X86::VMOVAPSZrr;
3545 const TargetRegisterInfo *TRI = &getRegisterInfo();
3546 DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
3547 &X86::VR512RegClass);
3548 SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
3549 &X86::VR512RegClass);
3550 }
3551 } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
3552 Opc = X86::VMOVAPSZrr;
3553 // All KMASK RegClasses hold the same k registers; they can be tested against any one of them.
3554 else if (X86::VK16RegClass.contains(DestReg, SrcReg))
3555 Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
3556 if (!Opc)
3557 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
3558
3559 if (Opc) {
3560 BuildMI(MBB, MI, DL, get(Opc), DestReg)
3561 .addReg(SrcReg, getKillRegState(KillSrc));
3562 return;
3563 }
3564
3565 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
3566 // FIXME: We use a fatal error here because historically LLVM has tried to
3567 // lower some of these physreg copies and we want to ensure we get
3568 // reasonable bug reports if someone encounters a case no other testing
3569 // found. This path should be removed after the LLVM 7 release.
3570 report_fatal_error("Unable to copy EFLAGS physical register!");
3571 }
3572
3573 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
3574 << RI.getName(DestReg) << '\n');
3575 report_fatal_error("Cannot emit physreg copy instruction");
3576}
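// Illustrative sketch (editorial, not part of the LLVM source): copying
// $xmm17 to $xmm18 on a target with AVX512F but no VLX cannot use a 128-bit
// EVEX move, so the code above rewrites both operands to their ZMM
// super-registers with getMatchingSuperReg and emits
//   $zmm18 = VMOVAPSZrr $zmm17
// which copies the full 512-bit register and therefore the 128-bit value too.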
3577
3578std::optional<DestSourcePair>
3579X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
3580 if (MI.isMoveReg())
3581 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
3582 return std::nullopt;
3583}
3584
3585static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
3586 if (STI.hasFP16())
3587 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
3588 if (Load)
3589 return STI.hasAVX512() ? X86::VMOVSSZrm
3590 : STI.hasAVX() ? X86::VMOVSSrm
3591 : X86::MOVSSrm;
3592 else
3593 return STI.hasAVX512() ? X86::VMOVSSZmr
3594 : STI.hasAVX() ? X86::VMOVSSmr
3595 : X86::MOVSSmr;
3596}
3597
3598static unsigned getLoadStoreRegOpcode(Register Reg,
3599 const TargetRegisterClass *RC,
3600 bool IsStackAligned,
3601 const X86Subtarget &STI, bool Load) {
3602 bool HasAVX = STI.hasAVX();
3603 bool HasAVX512 = STI.hasAVX512();
3604 bool HasVLX = STI.hasVLX();
3605
3606 switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
3607 default:
3608 llvm_unreachable("Unknown spill size");
3609 case 1:
3610 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
3611 if (STI.is64Bit())
3612 // Copying to or from a physical H register on x86-64 requires a NOREX
3613 // move. Otherwise use a normal move.
3614 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
3615 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
3616 return Load ? X86::MOV8rm : X86::MOV8mr;
3617 case 2:
3618 if (X86::VK16RegClass.hasSubClassEq(RC))
3619 return Load ? X86::KMOVWkm : X86::KMOVWmk;
3620 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
3621 return Load ? X86::MOV16rm : X86::MOV16mr;
3622 case 4:
3623 if (X86::GR32RegClass.hasSubClassEq(RC))
3624 return Load ? X86::MOV32rm : X86::MOV32mr;
3625 if (X86::FR32XRegClass.hasSubClassEq(RC))
3626 return Load ?
3627 (HasAVX512 ? X86::VMOVSSZrm_alt :
3628 HasAVX ? X86::VMOVSSrm_alt :
3629 X86::MOVSSrm_alt) :
3630 (HasAVX512 ? X86::VMOVSSZmr :
3631 HasAVX ? X86::VMOVSSmr :
3632 X86::MOVSSmr);
3633 if (X86::RFP32RegClass.hasSubClassEq(RC))
3634 return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
3635 if (X86::VK32RegClass.hasSubClassEq(RC)) {
3636 assert(STI.hasBWI() && "KMOVD requires BWI");
3637 return Load ? X86::KMOVDkm : X86::KMOVDmk;
3638 }
3639 // All of these mask pair classes have the same spill size, so the same kind
3640 // of kmov instructions can be used with all of them.
3641 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
3642 X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
3643 X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
3644 X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
3645 X86::VK16PAIRRegClass.hasSubClassEq(RC))
3646 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
3647 if (X86::FR16RegClass.hasSubClassEq(RC) ||
3648 X86::FR16XRegClass.hasSubClassEq(RC))
3649 return getLoadStoreOpcodeForFP16(Load, STI);
3650 llvm_unreachable("Unknown 4-byte regclass");
3651 case 8:
3652 if (X86::GR64RegClass.hasSubClassEq(RC))
3653 return Load ? X86::MOV64rm : X86::MOV64mr;
3654 if (X86::FR64XRegClass.hasSubClassEq(RC))
3655 return Load ?
3656 (HasAVX512 ? X86::VMOVSDZrm_alt :
3657 HasAVX ? X86::VMOVSDrm_alt :
3658 X86::MOVSDrm_alt) :
3659 (HasAVX512 ? X86::VMOVSDZmr :
3660 HasAVX ? X86::VMOVSDmr :
3661 X86::MOVSDmr);
3662 if (X86::VR64RegClass.hasSubClassEq(RC))
3663 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
3664 if (X86::RFP64RegClass.hasSubClassEq(RC))
3665 return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
3666 if (X86::VK64RegClass.hasSubClassEq(RC)) {
3667 assert(STI.hasBWI() && "KMOVQ requires BWI");
3668 return Load ? X86::KMOVQkm : X86::KMOVQmk;
3669 }
3670 llvm_unreachable("Unknown 8-byte regclass");
3671 case 10:
3672 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
3673 return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
3674 case 16: {
3675 if (X86::VR128XRegClass.hasSubClassEq(RC)) {
3676 // If stack is realigned we can use aligned stores.
3677 if (IsStackAligned)
3678 return Load ?
3679 (HasVLX ? X86::VMOVAPSZ128rm :
3680 HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
3681 HasAVX ? X86::VMOVAPSrm :
3682 X86::MOVAPSrm):
3683 (HasVLX ? X86::VMOVAPSZ128mr :
3684 HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
3685 HasAVX ? X86::VMOVAPSmr :
3686 X86::MOVAPSmr);
3687 else
3688 return Load ?
3689 (HasVLX ? X86::VMOVUPSZ128rm :
3690 HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
3691 HasAVX ? X86::VMOVUPSrm :
3692 X86::MOVUPSrm):
3693 (HasVLX ? X86::VMOVUPSZ128mr :
3694 HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
3695 HasAVX ? X86::VMOVUPSmr :
3696 X86::MOVUPSmr);
3697 }
3698 llvm_unreachable("Unknown 16-byte regclass");
3699 }
3700 case 32:
3701 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
3702 // If stack is realigned we can use aligned stores.
3703 if (IsStackAligned)
3704 return Load ?
3705 (HasVLX ? X86::VMOVAPSZ256rm :
3706 HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
3707 X86::VMOVAPSYrm) :
3708 (HasVLX ? X86::VMOVAPSZ256mr :
3709 HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
3710 X86::VMOVAPSYmr);
3711 else
3712 return Load ?
3713 (HasVLX ? X86::VMOVUPSZ256rm :
3714 HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
3715 X86::VMOVUPSYrm) :
3716 (HasVLX ? X86::VMOVUPSZ256mr :
3717 HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
3718 X86::VMOVUPSYmr);
3719 case 64:
3720 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
3721 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
3722 if (IsStackAligned)
3723 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
3724 else
3725 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
3726 case 1024:
3727 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
3728 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
3729 return Load ? X86::TILELOADD : X86::TILESTORED;
3730 }
3731}
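// Illustrative sketch (editorial, not part of the LLVM source): spilling a
// VR128X register to an aligned slot picks, in order of preference,
// VMOVAPSZ128mr (VLX), VMOVAPSZ128mr_NOVLX (AVX512 without VLX), VMOVAPSmr
// (AVX) or MOVAPSmr (SSE); the unaligned and reload cases follow the same
// pattern with the MOVUPS and *rm variants.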
3732
3733std::optional<ExtAddrMode>
3734X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
3735 const TargetRegisterInfo *TRI) const {
3736 const MCInstrDesc &Desc = MemI.getDesc();
3737 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3738 if (MemRefBegin < 0)
3739 return std::nullopt;
3740
3741 MemRefBegin += X86II::getOperandBias(Desc);
3742
3743 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
3744 if (!BaseOp.isReg()) // Can be an MO_FrameIndex
3745 return std::nullopt;
3746
3747 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
3748 // Displacement can be symbolic
3749 if (!DispMO.isImm())
3750 return std::nullopt;
3751
3752 ExtAddrMode AM;
3753 AM.BaseReg = BaseOp.getReg();
3754 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
3755 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
3756 AM.Displacement = DispMO.getImm();
3757 return AM;
3758}
3759
3760bool X86InstrInfo::verifyInstruction(const MachineInstr &MI,
3761 StringRef &ErrInfo) const {
3762 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
3763 if (!AMOrNone)
3764 return true;
3765
3766 ExtAddrMode AM = *AMOrNone;
3767
3768 if (AM.ScaledReg != X86::NoRegister) {
3769 switch (AM.Scale) {
3770 case 1:
3771 case 2:
3772 case 4:
3773 case 8:
3774 break;
3775 default:
3776 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
3777 return false;
3778 }
3779 }
3780 if (!isInt<32>(AM.Displacement)) {
3781 ErrInfo = "Displacement in address must fit into 32-bit signed "
3782 "integer";
3783 return false;
3784 }
3785
3786 return true;
3787}
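// Illustrative sketch (editorial, not part of the LLVM source): an address
// such as (%rax,%rcx,3) would be rejected with the "Scale factor" message
// above, and an address whose displacement does not fit a signed 32-bit
// immediate with the "Displacement" message; instructions without a
// register-base, immediate-displacement memory operand are not checked,
// because getAddrModeFromMemoryOp returns std::nullopt for them.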
3788
3789bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
3790 const Register Reg,
3791 int64_t &ImmVal) const {
3792 if (MI.getOpcode() != X86::MOV32ri && MI.getOpcode() != X86::MOV64ri)
3793 return false;
3794 // Mov Src can be a global address.
3795 if (!MI.getOperand(1).isImm() || MI.getOperand(0).getReg() != Reg)
3796 return false;
3797 ImmVal = MI.getOperand(1).getImm();
3798 return true;
3799}
3800
3801bool X86InstrInfo::preservesZeroValueInReg(
3802 const MachineInstr *MI, const Register NullValueReg,
3803 const TargetRegisterInfo *TRI) const {
3804 if (!MI->modifiesRegister(NullValueReg, TRI))
3805 return true;
3806 switch (MI->getOpcode()) {
3807 // Shifting a register that holds a null value left/right by an immediate keeps
3808 // it null, e.g. rax = shl rax, X.
3809 case X86::SHR64ri:
3810 case X86::SHR32ri:
3811 case X86::SHL64ri:
3812 case X86::SHL32ri:
3813 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
3814 "expected for shift opcode!");
3815 return MI->getOperand(0).getReg() == NullValueReg &&
3816 MI->getOperand(1).getReg() == NullValueReg;
3817 // Zero extend of a sub-reg of NullValueReg into itself does not change the
3818 // null value.
3819 case X86::MOV32rr:
3820 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
3821 return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
3822 });
3823 default:
3824 return false;
3825 }
3826 llvm_unreachable("Should be handled above!");
3827}
3828
3829bool X86InstrInfo::getMemOperandsWithOffsetWidth(
3830 const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
3831 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
3832 const TargetRegisterInfo *TRI) const {
3833 const MCInstrDesc &Desc = MemOp.getDesc();
3834 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3835 if (MemRefBegin < 0)
3836 return false;
3837
3838 MemRefBegin += X86II::getOperandBias(Desc);
3839
3840 const MachineOperand *BaseOp =
3841 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
3842 if (!BaseOp->isReg()) // Can be an MO_FrameIndex
3843 return false;
3844
3845 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
3846 return false;
3847
3848 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
3849 X86::NoRegister)
3850 return false;
3851
3852 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
3853
3854 // Displacement can be symbolic
3855 if (!DispMO.isImm())
3856 return false;
3857
3858 Offset = DispMO.getImm();
3859
3860 if (!BaseOp->isReg())
3861 return false;
3862
3863 OffsetIsScalable = false;
3864 // FIXME: Relying on memoperands() may not be the right thing to do here. Check
3865 // with X86 maintainers, and fix it accordingly. For now, it is ok, since
3866 // there is no use of `Width` in the X86 back-end at the moment.
3867 Width =
3868 !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
3869 BaseOps.push_back(BaseOp);
3870 return true;
3871}
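// Illustrative sketch (editorial, not part of the LLVM source): for
//   $eax = MOV32rm $rdi, 1, $noreg, 16, $noreg
// (i.e. movl 16(%rdi), %eax) the function fills BaseOps = { $rdi },
// Offset = 16 and OffsetIsScalable = false; a scaled or non-trivial index
// register, a frame-index base, or a symbolic displacement makes it return
// false instead.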
3872
3873static unsigned getStoreRegOpcode(Register SrcReg,
3874 const TargetRegisterClass *RC,
3875 bool IsStackAligned,
3876 const X86Subtarget &STI) {
3877 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
3878}
3879
3880static unsigned getLoadRegOpcode(Register DestReg,
3881 const TargetRegisterClass *RC,
3882 bool IsStackAligned, const X86Subtarget &STI) {
3883 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
3884}
3885
3886static bool isAMXOpcode(unsigned Opc) {
3887 switch (Opc) {
3888 default:
3889 return false;
3890 case X86::TILELOADD:
3891 case X86::TILESTORED:
3892 return true;
3893 }
3894}
3895
3896void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
3897 MachineBasicBlock::iterator MI,
3898 unsigned Opc, Register Reg, int FrameIdx,
3899 bool isKill) const {
3900 switch (Opc) {
3901 default:
3902 llvm_unreachable("Unexpected special opcode!");
3903 case X86::TILESTORED: {
3904 // tilestored %tmm, (%sp, %idx)
3905 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3906 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3907 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3908 MachineInstr *NewMI =
3909 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3910 .addReg(Reg, getKillRegState(isKill));
3912 MO.setReg(VirtReg);
3913 MO.setIsKill(true);
3914 break;
3915 }
3916 case X86::TILELOADD: {
3917 // tileloadd (%sp, %idx), %tmm
3918 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
3919 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
3920 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
3921 MachineInstr *NewMI = addFrameReference(
3922 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
3924 MO.setReg(VirtReg);
3925 MO.setIsKill(true);
3926 break;
3927 }
3928 }
3929}
3930
3931void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
3932 MachineBasicBlock::iterator MI, Register SrcReg,
3933 bool isKill, int FrameIdx, const TargetRegisterClass *RC,
3934 const TargetRegisterInfo *TRI, Register VReg) const {
3935 const MachineFunction &MF = *MBB.getParent();
3936 const MachineFrameInfo &MFI = MF.getFrameInfo();
3937 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
3938 "Stack slot too small for store");
3939
3940 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3941 bool isAligned =
3942 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3943 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3944
3945 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
3946 if (isAMXOpcode(Opc))
3947 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
3948 else
3949 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3950 .addReg(SrcReg, getKillRegState(isKill));
3951}
3952
3953void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
3954 MachineBasicBlock::iterator MI,
3955 Register DestReg, int FrameIdx,
3956 const TargetRegisterClass *RC,
3957 const TargetRegisterInfo *TRI,
3958 Register VReg) const {
3959 const MachineFunction &MF = *MBB.getParent();
3960 const MachineFrameInfo &MFI = MF.getFrameInfo();
3961 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
3962 "Load size exceeds stack slot");
3963 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3964 bool isAligned =
3965 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
3966 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
3967
3968 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
3969 if (isAMXOpcode(Opc))
3970 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
3971 else
3972 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
3973 FrameIdx);
3974}
3975
3976bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
3977 Register &SrcReg2, int64_t &CmpMask,
3978 int64_t &CmpValue) const {
3979 switch (MI.getOpcode()) {
3980 default: break;
3981 case X86::CMP64ri32:
3982 case X86::CMP64ri8:
3983 case X86::CMP32ri:
3984 case X86::CMP32ri8:
3985 case X86::CMP16ri:
3986 case X86::CMP16ri8:
3987 case X86::CMP8ri:
3988 SrcReg = MI.getOperand(0).getReg();
3989 SrcReg2 = 0;
3990 if (MI.getOperand(1).isImm()) {
3991 CmpMask = ~0;
3992 CmpValue = MI.getOperand(1).getImm();
3993 } else {
3994 CmpMask = CmpValue = 0;
3995 }
3996 return true;
3997 // A SUB can be used to perform a comparison.
3998 case X86::SUB64rm:
3999 case X86::SUB32rm:
4000 case X86::SUB16rm:
4001 case X86::SUB8rm:
4002 SrcReg = MI.getOperand(1).getReg();
4003 SrcReg2 = 0;
4004 CmpMask = 0;
4005 CmpValue = 0;
4006 return true;
4007 case X86::SUB64rr:
4008 case X86::SUB32rr:
4009 case X86::SUB16rr:
4010 case X86::SUB8rr:
4011 SrcReg = MI.getOperand(1).getReg();
4012 SrcReg2 = MI.getOperand(2).getReg();
4013 CmpMask = 0;
4014 CmpValue = 0;
4015 return true;
4016 case X86::SUB64ri32:
4017 case X86::SUB64ri8:
4018 case X86::SUB32ri:
4019 case X86::SUB32ri8:
4020 case X86::SUB16ri:
4021 case X86::SUB16ri8:
4022 case X86::SUB8ri:
4023 SrcReg = MI.getOperand(1).getReg();
4024 SrcReg2 = 0;
4025 if (MI.getOperand(2).isImm()) {
4026 CmpMask = ~0;
4027 CmpValue = MI.getOperand(2).getImm();
4028 } else {
4029 CmpMask = CmpValue = 0;
4030 }
4031 return true;
4032 case X86::CMP64rr:
4033 case X86::CMP32rr:
4034 case X86::CMP16rr:
4035 case X86::CMP8rr:
4036 SrcReg = MI.getOperand(0).getReg();
4037 SrcReg2 = MI.getOperand(1).getReg();
4038 CmpMask = 0;
4039 CmpValue = 0;
4040 return true;
4041 case X86::TEST8rr:
4042 case X86::TEST16rr:
4043 case X86::TEST32rr:
4044 case X86::TEST64rr:
4045 SrcReg = MI.getOperand(0).getReg();
4046 if (MI.getOperand(1).getReg() != SrcReg)
4047 return false;
4048 // Compare against zero.
4049 SrcReg2 = 0;
4050 CmpMask = ~0;
4051 CmpValue = 0;
4052 return true;
4053 }
4054 return false;
4055}
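// Illustrative sketch (editorial, not part of the LLVM source):
//   CMP32ri $eax, 42      -> SrcReg = $eax, SrcReg2 = 0,  CmpMask = ~0, CmpValue = 42
//   %d = SUB64rr %a, %b   -> SrcReg = %a,   SrcReg2 = %b, CmpMask = CmpValue = 0
//   TEST32rr %x, %x       -> SrcReg = %x,   SrcReg2 = 0,  CmpMask = ~0, CmpValue = 0
// A TEST with two different registers is rejected, because it is not a plain
// comparison of one value against zero.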
4056
4057bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4058 Register SrcReg, Register SrcReg2,
4059 int64_t ImmMask, int64_t ImmValue,
4060 const MachineInstr &OI, bool *IsSwapped,
4061 int64_t *ImmDelta) const {
4062 switch (OI.getOpcode()) {
4063 case X86::CMP64rr:
4064 case X86::CMP32rr:
4065 case X86::CMP16rr:
4066 case X86::CMP8rr:
4067 case X86::SUB64rr:
4068 case X86::SUB32rr:
4069 case X86::SUB16rr:
4070 case X86::SUB8rr: {
4071 Register OISrcReg;
4072 Register OISrcReg2;
4073 int64_t OIMask;
4074 int64_t OIValue;
4075 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4076 OIMask != ImmMask || OIValue != ImmValue)
4077 return false;
4078 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4079 *IsSwapped = false;
4080 return true;
4081 }
4082 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4083 *IsSwapped = true;
4084 return true;
4085 }
4086 return false;
4087 }
4088 case X86::CMP64ri32:
4089 case X86::CMP64ri8:
4090 case X86::CMP32ri:
4091 case X86::CMP32ri8:
4092 case X86::CMP16ri:
4093 case X86::CMP16ri8:
4094 case X86::CMP8ri:
4095 case X86::SUB64ri32:
4096 case X86::SUB64ri8:
4097 case X86::SUB32ri:
4098 case X86::SUB32ri8:
4099 case X86::SUB16ri:
4100 case X86::SUB16ri8:
4101 case X86::SUB8ri:
4102 case X86::TEST64rr:
4103 case X86::TEST32rr:
4104 case X86::TEST16rr:
4105 case X86::TEST8rr: {
4106 if (ImmMask != 0) {
4107 Register OISrcReg;
4108 Register OISrcReg2;
4109 int64_t OIMask;
4110 int64_t OIValue;
4111 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4112 SrcReg == OISrcReg && ImmMask == OIMask) {
4113 if (OIValue == ImmValue) {
4114 *ImmDelta = 0;
4115 return true;
4116 } else if (static_cast<uint64_t>(ImmValue) ==
4117 static_cast<uint64_t>(OIValue) - 1) {
4118 *ImmDelta = -1;
4119 return true;
4120 } else if (static_cast<uint64_t>(ImmValue) ==
4121 static_cast<uint64_t>(OIValue) + 1) {
4122 *ImmDelta = 1;
4123 return true;
4124 } else {
4125 return false;
4126 }
4127 }
4128 }
4129 return FlagI.isIdenticalTo(OI);
4130 }
4131 default:
4132 return false;
4133 }
4134}
4135
4136/// Check whether the definition can be converted
4137/// to remove a comparison against zero.
4138inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4139 bool &ClearsOverflowFlag) {
4140 NoSignFlag = false;
4141 ClearsOverflowFlag = false;
4142
4143 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and
4144 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
4145 // Initial Exec to Local Exec relaxation. In these cases, we must not depend
4146 // on the EFLAGS modification of ADD actually happening in the final binary.
4147 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
4148 unsigned Flags = MI.getOperand(5).getTargetFlags();
4149 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
4150 Flags == X86II::MO_GOTNTPOFF)
4151 return false;
4152 }
4153
4154 switch (MI.getOpcode()) {
4155 default: return false;
4156
4157 // The shift instructions only modify ZF if their shift count is non-zero.
4158 // N.B.: The processor truncates the shift count depending on the encoding.
4159 case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
4160 case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
4161 return getTruncatedShiftCount(MI, 2) != 0;
4162
4163 // Some left shift instructions can be turned into LEA instructions but only
4164 // if their flags aren't used. Avoid transforming such instructions.
4165 case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
4166 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
4167 if (isTruncatedShiftCountForLEA(ShAmt)) return false;
4168 return ShAmt != 0;
4169 }
4170
4171 case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
4172 case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
4173 return getTruncatedShiftCount(MI, 3) != 0;
4174
4175 case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
4176 case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
4177 case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
4178 case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
4179 case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
4180 case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
4181 case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
4182 case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
4183 case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
4184 case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
4185 case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
4186 case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
4187 case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
4188 case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8:
4189 case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr:
4190 case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm:
4191 case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm:
4192 case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri:
4193 case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8:
4194 case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr:
4195 case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm:
4196 case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm:
4197 case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
4198 case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
4199 case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
4200 case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
4201 case X86::LZCNT16rr: case X86::LZCNT16rm:
4202 case X86::LZCNT32rr: case X86::LZCNT32rm:
4203 case X86::LZCNT64rr: case X86::LZCNT64rm:
4204 case X86::POPCNT16rr:case X86::POPCNT16rm:
4205 case X86::POPCNT32rr:case X86::POPCNT32rm:
4206 case X86::POPCNT64rr:case X86::POPCNT64rm:
4207 case X86::TZCNT16rr: case X86::TZCNT16rm:
4208 case X86::TZCNT32rr: case X86::TZCNT32rm:
4209 case X86::TZCNT64rr: case X86::TZCNT64rm:
4210 return true;
4211 case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
4212 case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
4213 case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
4214 case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
4215 case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
4216 case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
4217 case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
4218 case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
4219 case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
4220 case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
4221 case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
4222 case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
4223 case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
4224 case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
4225 case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
4226 case X86::ANDN32rr: case X86::ANDN32rm:
4227 case X86::ANDN64rr: case X86::ANDN64rm:
4228 case X86::BLSI32rr: case X86::BLSI32rm:
4229 case X86::BLSI64rr: case X86::BLSI64rm:
4230 case X86::BLSMSK32rr: case X86::BLSMSK32rm:
4231 case X86::BLSMSK64rr: case X86::BLSMSK64rm:
4232 case X86::BLSR32rr: case X86::BLSR32rm:
4233 case X86::BLSR64rr: case X86::BLSR64rm:
4234 case X86::BLCFILL32rr: case X86::BLCFILL32rm:
4235 case X86::BLCFILL64rr: case X86::BLCFILL64rm:
4236 case X86::BLCI32rr: case X86::BLCI32rm:
4237 case X86::BLCI64rr: case X86::BLCI64rm:
4238 case X86::BLCIC32rr: case X86::BLCIC32rm:
4239 case X86::BLCIC64rr: case X86::BLCIC64rm:
4240 case X86::BLCMSK32rr: case X86::BLCMSK32rm:
4241 case X86::BLCMSK64rr: case X86::BLCMSK64rm:
4242 case X86::BLCS32rr: case X86::BLCS32rm:
4243 case X86::BLCS64rr: case X86::BLCS64rm:
4244 case X86::BLSFILL32rr: case X86::BLSFILL32rm:
4245 case X86::BLSFILL64rr: case X86::BLSFILL64rm:
4246 case X86::BLSIC32rr: case X86::BLSIC32rm:
4247 case X86::BLSIC64rr: case X86::BLSIC64rm:
4248 case X86::BZHI32rr: case X86::BZHI32rm:
4249 case X86::BZHI64rr: case X86::BZHI64rm:
4250 case X86::T1MSKC32rr: case X86::T1MSKC32rm:
4251 case X86::T1MSKC64rr: case X86::T1MSKC64rm:
4252 case X86::TZMSK32rr: case X86::TZMSK32rm:
4253 case X86::TZMSK64rr: case X86::TZMSK64rm:
4254 // These instructions clear the overflow flag just like TEST.
4255 // FIXME: These are not the only instructions in this switch that clear the
4256 // overflow flag.
4257 ClearsOverflowFlag = true;
4258 return true;
4259 case X86::BEXTR32rr: case X86::BEXTR64rr:
4260 case X86::BEXTR32rm: case X86::BEXTR64rm:
4261 case X86::BEXTRI32ri: case X86::BEXTRI32mi:
4262 case X86::BEXTRI64ri: case X86::BEXTRI64mi:
4263 // BEXTR doesn't update the sign flag so we can't use it. It does clear
4264 // the overflow flag, but that's not useful without the sign flag.
4265 NoSignFlag = true;
4266 return true;
4267 }
4268}
4269
4270/// Check whether the use can be converted to remove a comparison against zero.
4271static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
4272 switch (MI.getOpcode()) {
4273 default: return X86::COND_INVALID;
4274 case X86::NEG8r:
4275 case X86::NEG16r:
4276 case X86::NEG32r:
4277 case X86::NEG64r:
4278 return X86::COND_AE;
4279 case X86::LZCNT16rr:
4280 case X86::LZCNT32rr:
4281 case X86::LZCNT64rr:
4282 return X86::COND_B;
4283 case X86::POPCNT16rr:
4284 case X86::POPCNT32rr:
4285 case X86::POPCNT64rr:
4286 return X86::COND_E;
4287 case X86::TZCNT16rr:
4288 case X86::TZCNT32rr:
4289 case X86::TZCNT64rr:
4290 return X86::COND_B;
4291 case X86::BSF16rr:
4292 case X86::BSF32rr:
4293 case X86::BSF64rr:
4294 case X86::BSR16rr:
4295 case X86::BSR32rr:
4296 case X86::BSR64rr:
4297 return X86::COND_E;
4298 case X86::BLSI32rr:
4299 case X86::BLSI64rr:
4300 return X86::COND_AE;
4301 case X86::BLSR32rr:
4302 case X86::BLSR64rr:
4303 case X86::BLSMSK32rr:
4304 case X86::BLSMSK64rr:
4305 return X86::COND_B;
4306 // TODO: TBM instructions.
4307 }
4308}
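// Illustrative sketch (editorial, not part of the LLVM source): in a sequence
// such as
//   %x = POPCNT32rr %y
//   TEST32rr %x, %x, implicit-def $eflags
//   JCC_1 %bb.zero, COND_E
// the popcnt already sets ZF exactly when its result is zero, so the TEST can
// be removed and the je keeps COND_E (the value returned above); for NEG the
// returned condition is COND_AE, so a je after "test; neg" becomes a jae.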
4309
4310/// Check if there exists an earlier instruction that
4311/// operates on the same source operands and sets flags in the same way as
4312/// Compare; remove Compare if possible.
4313bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
4314 Register SrcReg2, int64_t CmpMask,
4315 int64_t CmpValue,
4316 const MachineRegisterInfo *MRI) const {
4317 // Check whether we can replace SUB with CMP.
4318 switch (CmpInstr.getOpcode()) {
4319 default: break;
4320 case X86::SUB64ri32:
4321 case X86::SUB64ri8:
4322 case X86::SUB32ri:
4323 case X86::SUB32ri8:
4324 case X86::SUB16ri:
4325 case X86::SUB16ri8:
4326 case X86::SUB8ri:
4327 case X86::SUB64rm:
4328 case X86::SUB32rm:
4329 case X86::SUB16rm:
4330 case X86::SUB8rm:
4331 case X86::SUB64rr:
4332 case X86::SUB32rr:
4333 case X86::SUB16rr:
4334 case X86::SUB8rr: {
4335 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
4336 return false;
4337 // There is no use of the destination register, so we can replace SUB with CMP.
4338 unsigned NewOpcode = 0;
4339 switch (CmpInstr.getOpcode()) {
4340 default: llvm_unreachable("Unreachable!");
4341 case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
4342 case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
4343 case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
4344 case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
4345 case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
4346 case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
4347 case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
4348 case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
4349 case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
4350 case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
4351 case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
4352 case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
4353 case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
4354 case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
4355 case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
4356 }
4357 CmpInstr.setDesc(get(NewOpcode));
4358 CmpInstr.removeOperand(0);
4359 // Mutating this instruction invalidates any debug data associated with it.
4360 CmpInstr.dropDebugNumber();
4361 // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
4362 if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
4363 NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
4364 return false;
4365 }
4366 }
4367
4368 // The following code tries to remove the comparison by re-using EFLAGS
4369 // from earlier instructions.
4370
4371 bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
4372
4373 // Transformation currently requires SSA values.
4374 if (SrcReg2.isPhysical())
4375 return false;
4376 MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
4377 assert(SrcRegDef && "Must have a definition (SSA)");
4378
4379 MachineInstr *MI = nullptr;
4380 MachineInstr *Sub = nullptr;
4381 MachineInstr *Movr0Inst = nullptr;
4382 bool NoSignFlag = false;
4383 bool ClearsOverflowFlag = false;
4384 bool ShouldUpdateCC = false;
4385 bool IsSwapped = false;
4386 X86::CondCode NewCC = X86::COND_INVALID;
4387 int64_t ImmDelta = 0;
4388
4389 // Search backward from CmpInstr for the next instruction defining EFLAGS.
4390 const TargetRegisterInfo *TRI = &getRegisterInfo();
4391 MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
4392 MachineBasicBlock::reverse_iterator From =
4393 std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
4394 for (MachineBasicBlock *MBB = &CmpMBB;;) {
4395 for (MachineInstr &Inst : make_range(From, MBB->rend())) {
4396 // Try to use EFLAGS from the instruction defining %SrcReg. Example:
4397 // %eax = addl ...
4398 // ... // EFLAGS not changed
4399 // testl %eax, %eax // <-- can be removed
4400 if (&Inst == SrcRegDef) {
4401 if (IsCmpZero &&
4402 isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
4403 MI = &Inst;
4404 break;
4405 }
4406
4407 // Look back for the following pattern, in which case the test64rr
4408 // instruction could be erased.
4409 //
4410 // Example:
4411 // %reg = and32ri %in_reg, 5
4412 // ... // EFLAGS not changed.
4413 // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index
4414 // test64rr %src_reg, %src_reg, implicit-def $eflags
4415 MachineInstr *AndInstr = nullptr;
4416 if (IsCmpZero &&
4417 findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
4418 NoSignFlag, ClearsOverflowFlag)) {
4419 assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
4420 MI = AndInstr;
4421 break;
4422 }
4423 // Cannot find other candidates before definition of SrcReg.
4424 return false;
4425 }
4426
4427 if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
4428 // Try to use EFLAGS produced by an instruction reading %SrcReg.
4429 // Example:
4430 // %eax = ...
4431 // ...
4432 // popcntl %eax
4433 // ... // EFLAGS not changed
4434 // testl %eax, %eax // <-- can be removed
4435 if (IsCmpZero) {
4436 NewCC = isUseDefConvertible(Inst);
4437 if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() &&
4438 Inst.getOperand(1).getReg() == SrcReg) {
4439 ShouldUpdateCC = true;
4440 MI = &Inst;
4441 break;
4442 }
4443 }
4444
4445 // Try to use EFLAGS from an instruction with similar flag results.
4446 // Example:
4447 // sub x, y or cmp x, y
4448 // ... // EFLAGS not changed
4449 // cmp x, y // <-- can be removed
4450 if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
4451 Inst, &IsSwapped, &ImmDelta)) {
4452 Sub = &Inst;
4453 break;
4454 }
4455
4456 // MOV32r0 is implemented with an xor, which clobbers the condition code. It
4457 // is safe to move it up if the definition of EFLAGS is dead and earlier
4458 // instructions do not read or write EFLAGS.
4459 if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
4460 Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
4461 Movr0Inst = &Inst;
4462 continue;
4463 }
4464
4465 // Cannot do anything for any other EFLAGS changes.
4466 return false;
4467 }
4468 }
4469
4470 if (MI || Sub)
4471 break;
4472
4473 // Reached begin of basic block. Continue in predecessor if there is
4474 // exactly one.
4475 if (MBB->pred_size() != 1)
4476 return false;
4477 MBB = *MBB->pred_begin();
4478 From = MBB->rbegin();
4479 }
4480
4481 // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
4482 // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
4483 // If we are done with the basic block, we need to check whether EFLAGS is
4484 // live-out.
4485 bool FlagsMayLiveOut = true;
4486 SmallVector<std::pair<MachineInstr *, X86::CondCode>, 4> OpsToUpdate;
4487 MachineBasicBlock::iterator AfterCmpInstr =
4488 std::next(MachineBasicBlock::iterator(CmpInstr));
4489 for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
4490 bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
4491 bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
4492 // We should check the usage if this instruction uses and updates EFLAGS.
4493 if (!UseEFLAGS && ModifyEFLAGS) {
4494 // It is safe to remove CmpInstr if EFLAGS is updated again.
4495 FlagsMayLiveOut = false;
4496 break;
4497 }
4498 if (!UseEFLAGS && !ModifyEFLAGS)
4499 continue;
4500
4501 // EFLAGS is used by this instruction.
4502 X86::CondCode OldCC = X86::getCondFromMI(Instr);
4503 if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
4504 return false;
4505
4506 X86::CondCode ReplacementCC = X86::COND_INVALID;
4507 if (MI) {
4508 switch (OldCC) {
4509 default: break;
4510 case X86::COND_A: case X86::COND_AE:
4511 case X86::COND_B: case X86::COND_BE:
4512 // CF is used, we can't perform this optimization.
4513 return false;
4514 case X86::COND_G: case X86::COND_GE:
4515 case X86::COND_L: case X86::COND_LE:
4516 // If SF is used, but the instruction doesn't update the SF, then we
4517 // can't do the optimization.
4518 if (NoSignFlag)
4519 return false;
4520 [[fallthrough]];
4521 case X86::COND_O: case X86::COND_NO:
4522 // If OF is used, the instruction needs to clear it like CmpZero does.
4523 if (!ClearsOverflowFlag)
4524 return false;
4525 break;
4526 case X86::COND_S: case X86::COND_NS:
4527 // If SF is used, but the instruction doesn't update the SF, then we
4528 // can't do the optimization.
4529 if (NoSignFlag)
4530 return false;
4531 break;
4532 }
4533
4534 // If we're updating the condition code check if we have to reverse the
4535 // condition.
4536 if (ShouldUpdateCC)
4537 switch (OldCC) {
4538 default:
4539 return false;
4540 case X86::COND_E:
4541 ReplacementCC = NewCC;
4542 break;
4543 case X86::COND_NE:
4544 ReplacementCC = GetOppositeBranchCondition(NewCC);
4545 break;
4546 }
4547 } else if (IsSwapped) {
4548 // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
4549 // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
4550 // We swap the condition code and synthesize the new opcode.
4551 ReplacementCC = getSwappedCondition(OldCC);
4552 if (ReplacementCC == X86::COND_INVALID)
4553 return false;
4554 ShouldUpdateCC = true;
4555 } else if (ImmDelta != 0) {
4556 unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
4557 // Shift amount for min/max constants to adjust for 8/16/32 instruction
4558 // sizes.
4559 switch (OldCC) {
4560 case X86::COND_L: // x <s (C + 1) --> x <=s C
4561 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
4562 return false;
4563 ReplacementCC = X86::COND_LE;
4564 break;
4565 case X86::COND_B: // x <u (C + 1) --> x <=u C
4566 if (ImmDelta != 1 || CmpValue == 0)
4567 return false;
4568 ReplacementCC = X86::COND_BE;
4569 break;
4570 case X86::COND_GE: // x >=s (C + 1) --> x >s C
4571 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
4572 return false;
4573 ReplacementCC = X86::COND_G;
4574 break;
4575 case X86::COND_AE: // x >=u (C + 1) --> x >u C
4576 if (ImmDelta != 1 || CmpValue == 0)
4577 return false;
4578 ReplacementCC = X86::COND_A;
4579 break;
4580 case X86::COND_G: // x >s (C - 1) --> x >=s C
4581 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
4582 return false;
4583 ReplacementCC = X86::COND_GE;
4584 break;
4585 case X86::COND_A: // x >u (C - 1) --> x >=u C
4586 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
4587 return false;
4588 ReplacementCC = X86::COND_AE;
4589 break;
4590 case X86::COND_LE: // x <=s (C - 1) --> x <s C
4591 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
4592 return false;
4593 ReplacementCC = X86::COND_L;
4594 break;
4595 case X86::COND_BE: // x <=u (C - 1) --> x <u C
4596 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
4597 return false;
4598 ReplacementCC = X86::COND_B;
4599 break;
4600 default:
4601 return false;
4602 }
4603 ShouldUpdateCC = true;
4604 }
4605
4606 if (ShouldUpdateCC && ReplacementCC != OldCC) {
4607 // Push the MachineInstr to OpsToUpdate.
4608 // If it is safe to remove CmpInstr, the condition code of these
4609 // instructions will be modified.
4610 OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
4611 }
4612 if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
4613 // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
4614 FlagsMayLiveOut = false;
4615 break;
4616 }
4617 }
4618
4619 // If we have to update users but EFLAGS is live-out, abort, since we cannot
4620 // easily find all of the users.
4621 if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
4622 for (MachineBasicBlock *Successor : CmpMBB.successors())
4623 if (Successor->isLiveIn(X86::EFLAGS))
4624 return false;
4625 }
4626
4627 // The instruction to be updated is either Sub or MI.
4628 assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
4629 Sub = MI != nullptr ? MI : Sub;
4630 MachineBasicBlock *SubBB = Sub->getParent();
4631 // Move Movr0Inst to the appropriate place before Sub.
4632 if (Movr0Inst) {
4633 // Only move within the same block so we don't accidentally move to a
4634 // block with higher execution frequency.
4635 if (&CmpMBB != SubBB)
4636 return false;
4637 // Look backwards until we find a def that doesn't use the current EFLAGS.
4638 MachineBasicBlock::reverse_iterator InsertI = Sub,
4639 InsertE = Sub->getParent()->rend();
4640 for (; InsertI != InsertE; ++InsertI) {
4641 MachineInstr *Instr = &*InsertI;
4642 if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
4643 Instr->modifiesRegister(X86::EFLAGS, TRI)) {
4644 Movr0Inst->getParent()->remove(Movr0Inst);
4646 Movr0Inst);
4647 break;
4648 }
4649 }
4650 if (InsertI == InsertE)
4651 return false;
4652 }
4653
4654 // Make sure Sub instruction defines EFLAGS and mark the def live.
4655 MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
4656 assert(FlagDef && "Unable to locate a def EFLAGS operand");
4657 FlagDef->setIsDead(false);
4658
4659 CmpInstr.eraseFromParent();
4660
4661 // Modify the condition code of instructions in OpsToUpdate.
4662 for (auto &Op : OpsToUpdate) {
4663 Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
4664 .setImm(Op.second);
4665 }
4666 // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
4667 for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
4668 MBB = *MBB->pred_begin()) {
4669 assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
4670 if (!MBB->isLiveIn(X86::EFLAGS))
4671 MBB->addLiveIn(X86::EFLAGS);
4672 }
4673 return true;
4674}
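// Illustrative sketch (editorial, not part of the LLVM source): a typical
// rewrite performed above is
//   %d = SUB32rr %a, %b, implicit-def $eflags
//   ...                                     ; EFLAGS not touched in between
//   CMP32rr %a, %b, implicit-def $eflags    ; <- removed, flags reused from SUB
// and, for the ImmDelta case, flags from an earlier CMP32ri %x, 4 can replace
// a later CMP32ri %x, 5 provided a user's COND_L is rewritten to COND_LE.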
4675
4676/// Try to remove the load by folding it to a register
4677/// operand at the use. We fold the load if it defines a virtual
4678/// register, the virtual register is used once in the same BB, and the
4679/// instructions in-between do not load or store, and have no side effects.
4680MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
4681 const MachineRegisterInfo *MRI,
4682 Register &FoldAsLoadDefReg,
4683 MachineInstr *&DefMI) const {
4684 // Check whether we can move DefMI here.
4685 DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
4686 assert(DefMI);
4687 bool SawStore = false;
4688 if (!DefMI->isSafeToMove(nullptr, SawStore))
4689 return nullptr;
4690
4691 // Collect information about virtual register operands of MI.
4692 SmallVector<unsigned, 1> SrcOperandIds;
4693 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4694 MachineOperand &MO = MI.getOperand(i);
4695 if (!MO.isReg())
4696 continue;
4697 Register Reg = MO.getReg();
4698 if (Reg != FoldAsLoadDefReg)
4699 continue;
4700 // Do not fold if we have a subreg use or a def.
4701 if (MO.getSubReg() || MO.isDef())
4702 return nullptr;
4703 SrcOperandIds.push_back(i);
4704 }
4705 if (SrcOperandIds.empty())
4706 return nullptr;
4707
4708 // Check whether we can fold the def into SrcOperandId.
4709 if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
4710 FoldAsLoadDefReg = 0;
4711 return FoldMI;
4712 }
4713
4714 return nullptr;
4715}
4716
4717/// Expand a single-def pseudo instruction to a two-addr
4718/// instruction with two undef reads of the register being defined.
4719/// This is used for mapping:
4720/// %xmm4 = V_SET0
4721/// to:
4722/// %xmm4 = PXORrr undef %xmm4, undef %xmm4
4723///
4724static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
4725 const MCInstrDesc &Desc) {
4726 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4727 Register Reg = MIB.getReg(0);
4728 MIB->setDesc(Desc);
4729
4730 // MachineInstr::addOperand() will insert explicit operands before any
4731 // implicit operands.
4732 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4733 // But we don't trust that.
4734 assert(MIB.getReg(1) == Reg &&
4735 MIB.getReg(2) == Reg && "Misplaced operand");
4736 return true;
4737}
4738
4739/// Expand a single-def pseudo instruction to a two-addr
4740/// instruction with two %k0 reads.
4741/// This is used for mapping:
4742/// %k4 = K_SET1
4743/// to:
4744/// %k4 = KXNORrr %k0, %k0
4745static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
4746 Register Reg) {
4747 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
4748 MIB->setDesc(Desc);
4749 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
4750 return true;
4751}
4752
4753static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
4754 bool MinusOne) {
4755 MachineBasicBlock &MBB = *MIB->getParent();
4756 const DebugLoc &DL = MIB->getDebugLoc();
4757 Register Reg = MIB.getReg(0);
4758
4759 // Insert the XOR.
4760 BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
4761 .addReg(Reg, RegState::Undef)
4762 .addReg(Reg, RegState::Undef);
4763
4764 // Turn the pseudo into an INC or DEC.
4765 MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
4766 MIB.addReg(Reg);
4767
4768 return true;
4769}
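// Illustrative sketch (editorial, not part of the LLVM source): the expansion
// above turns
//   $eax = MOV32r1
// into
//   $eax = XOR32rr undef $eax, undef $eax
//   $eax = INC32r $eax
// (DEC32r for MOV32r_1), a flag-clobbering but more compact sequence than a
// plain movl $1, %eax.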
4770
4771static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
4772 const TargetInstrInfo &TII,
4773 const X86Subtarget &Subtarget) {
4774 MachineBasicBlock &MBB = *MIB->getParent();
4775 const DebugLoc &DL = MIB->getDebugLoc();
4776 int64_t Imm = MIB->getOperand(1).getImm();
4777 assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
4778 MachineBasicBlock::iterator I = MIB.getInstr();
4779
4780 int StackAdjustment;
4781
4782 if (Subtarget.is64Bit()) {
4783 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
4784 MIB->getOpcode() == X86::MOV32ImmSExti8);
4785
4786 // Can't use push/pop lowering if the function might write to the red zone.
4787 X86MachineFunctionInfo *X86FI =
4788 MBB.getParent()->getInfo<X86MachineFunctionInfo>();
4789 if (X86FI->getUsesRedZone()) {
4790 MIB->setDesc(TII.get(MIB->getOpcode() ==
4791 X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
4792 return true;
4793 }
4794
4795 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
4796 // widen the register if necessary.
4797 StackAdjustment = 8;
4798 BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
4799 MIB->setDesc(TII.get(X86::POP64r));
4800 MIB->getOperand(0)
4801 .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
4802 } else {
4803 assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
4804 StackAdjustment = 4;
4805 BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
4806 MIB->setDesc(TII.get(X86::POP32r));
4807 }
4808 MIB->removeOperand(1);
4810
4811 // Build CFI if necessary.
4812 MachineFunction &MF = *MBB.getParent();
4813 const X86FrameLowering *TFL = Subtarget.getFrameLowering();
4814 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
4815 bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
4816 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
4817 if (EmitCFI) {
4818 TFL->BuildCFI(MBB, I, DL,
4819 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
4820 TFL->BuildCFI(MBB, std::next(I), DL,
4821 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
4822 }
4823
4824 return true;
4825}
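// Illustrative sketch (editorial, not part of the LLVM source): on a 64-bit
// target whose function does not touch the red zone, the expansion above turns
//   $rax = MOV64ImmSExti8 -1
// into
//   PUSH64i8 -1
//   $rax = POP64r
// (with matching CFI adjustments when frame moves are needed); if the red zone
// is in use, the pseudo is simply lowered back to a plain MOV64ri/MOV32ri.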
4826
4827// LoadStackGuard has so far only been implemented for 64-bit MachO. A different
4828// code sequence is needed for other targets.
4829static void expandLoadStackGuard(MachineInstrBuilder &MIB,
4830 const TargetInstrInfo &TII) {
4831 MachineBasicBlock &MBB = *MIB->getParent();
4832 const DebugLoc &DL = MIB->getDebugLoc();
4833 Register Reg = MIB.getReg(0);
4834 const GlobalValue *GV =
4835 cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
4842
4843 BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
4845 .addMemOperand(MMO);
4846 MIB->setDebugLoc(DL);
4847 MIB->setDesc(TII.get(X86::MOV64rm));
4848 MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
4849}
4850
4851static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
4852 MachineBasicBlock &MBB = *MIB->getParent();
4853 MachineFunction &MF = *MBB.getParent();
4854 const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
4855 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4856 unsigned XorOp =
4857 MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
4858 MIB->setDesc(TII.get(XorOp));
4859 MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
4860 return true;
4861}
4862
4863// This is used to handle spills for 128/256-bit registers when we have AVX512,
4864// but not VLX. If it uses an extended register, we need to use an instruction
4865// that loads the lower 128/256 bits but is available with only AVX512F.
4866static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
4867 const TargetRegisterInfo *TRI,
4868 const MCInstrDesc &LoadDesc,
4869 const MCInstrDesc &BroadcastDesc,
4870 unsigned SubIdx) {
4871 Register DestReg = MIB.getReg(0);
4872 // Check if DestReg is XMM16-31 or YMM16-31.
4873 if (TRI->getEncodingValue(DestReg) < 16) {
4874 // We can use a normal VEX encoded load.
4875 MIB->setDesc(LoadDesc);
4876 } else {
4877 // Use a 128/256-bit VBROADCAST instruction.
4878 MIB->setDesc(BroadcastDesc);
4879 // Change the destination to a 512-bit register.
4880 DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
4881 MIB->getOperand(0).setReg(DestReg);
4882 }
4883 return true;
4884}
4885
4886// This is used to handle spills for 128/256-bit registers when we have AVX512,
4887// but not VLX. If it uses an extended register, we need to use an instruction
4888// that stores the lower 128/256 bits but is available with only AVX512F.
4889static bool expandNOVLXStore(MachineInstrBuilder &MIB,
4890 const TargetRegisterInfo *TRI,
4891 const MCInstrDesc &StoreDesc,
4892 const MCInstrDesc &ExtractDesc,
4893 unsigned SubIdx) {
4894 Register SrcReg = MIB.getReg(X86::AddrNumOperands);
4895 // Check if SrcReg is XMM16-31 or YMM16-31.
4896 if (TRI->getEncodingValue(SrcReg) < 16) {
4897 // We can use a normal VEX encoded store.
4898 MIB->setDesc(StoreDesc);
4899 } else {
4900 // Use a VEXTRACTF instruction.
4901 MIB->setDesc(ExtractDesc);
4902 // Change the destination to a 512-bit register.
4903 SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
4904 MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
4905 MIB.addImm(0x0); // Append immediate to extract from the lower bits.
4906 }
4907
4908 return true;
4909}
4910
4911static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
4912 MIB->setDesc(Desc);
4913 int64_t ShiftAmt = MIB->getOperand(2).getImm();
4914 // Temporarily remove the immediate so we can add another source register.
4915 MIB->removeOperand(2);
4916 // Add the register. Don't copy the kill flag if there is one.
4917 MIB.addReg(MIB.getReg(1),
4918 getUndefRegState(MIB->getOperand(1).isUndef()));
4919 // Add back the immediate.
4920 MIB.addImm(ShiftAmt);
4921 return true;
4922}
4923
4924bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
4925 bool HasAVX = Subtarget.hasAVX();
4926 MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
4927 switch (MI.getOpcode()) {
4928 case X86::MOV32r0:
4929 return Expand2AddrUndef(MIB, get(X86::XOR32rr));
4930 case X86::MOV32r1:
4931 return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
4932 case X86::MOV32r_1:
4933 return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
4934 case X86::MOV32ImmSExti8:
4935 case X86::MOV64ImmSExti8:
4936 return ExpandMOVImmSExti8(MIB, *this, Subtarget);
4937 case X86::SETB_C32r:
4938 return Expand2AddrUndef(MIB, get(X86::SBB32rr));
4939 case X86::SETB_C64r:
4940 return Expand2AddrUndef(MIB, get(X86::SBB64rr));
4941 case X86::MMX_SET0:
4942 return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
4943 case X86::V_SET0:
4944 case X86::FsFLD0SS:
4945 case X86::FsFLD0SD:
4946 case X86::FsFLD0SH:
4947 case X86::FsFLD0F128:
4948 return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
4949 case X86::AVX_SET0: {
4950 assert(HasAVX && "AVX not supported");
4951 const TargetRegisterInfo *TRI = &getRegisterInfo();
4952 Register SrcReg = MIB.getReg(0);
4953 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4954 MIB->getOperand(0).setReg(XReg);
4955 Expand2AddrUndef(MIB, get(X86::VXORPSrr));
4956 MIB.addReg(SrcReg, RegState::ImplicitDefine);
4957 return true;
4958 }
4959 case X86::AVX512_128_SET0:
4960 case X86::AVX512_FsFLD0SH:
4961 case X86::AVX512_FsFLD0SS:
4962 case X86::AVX512_FsFLD0SD:
4963 case X86::AVX512_FsFLD0F128: {
4964 bool HasVLX = Subtarget.hasVLX();
4965 Register SrcReg = MIB.getReg(0);
4966 const TargetRegisterInfo *TRI = &getRegisterInfo();
4967 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
4968 return Expand2AddrUndef(MIB,
4969 get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4970 // Extended register without VLX. Use a larger XOR.
4971 SrcReg =
4972 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4973 MIB->getOperand(0).setReg(SrcReg);
4974 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4975 }
4976 case X86::AVX512_256_SET0:
4977 case X86::AVX512_512_SET0: {
4978 bool HasVLX = Subtarget.hasVLX();
4979 Register SrcReg = MIB.getReg(0);
4980 const TargetRegisterInfo *TRI = &getRegisterInfo();
4981 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
4982 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4983 MIB->getOperand(0).setReg(XReg);
4984 Expand2AddrUndef(MIB,
4985 get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4986 MIB.addReg(SrcReg, RegState::ImplicitDefine);
4987 return true;
4988 }
4989 if (MI.getOpcode() == X86::AVX512_256_SET0) {
4990 // No VLX so we must reference a zmm.
4991 unsigned ZReg =
4992 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4993 MIB->getOperand(0).setReg(ZReg);
4994 }
4995 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4996 }
4997 case X86::V_SETALLONES:
4998 return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
4999 case X86::AVX2_SETALLONES:
5000 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
5001 case X86::AVX1_SETALLONES: {
5002 Register Reg = MIB.getReg(0);
5003 // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
5004 MIB->setDesc(get(X86::VCMPPSYrri));
5005 MIB.addReg(Reg,