1//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the X86 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "X86InstrInfo.h"
14#include "X86.h"
15#include "X86InstrBuilder.h"
16#include "X86InstrFoldTables.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/Sequence.h"
37#include "llvm/IR/Function.h"
38#include "llvm/IR/InstrTypes.h"
39#include "llvm/IR/Module.h"
40#include "llvm/MC/MCAsmInfo.h"
41#include "llvm/MC/MCExpr.h"
42#include "llvm/MC/MCInst.h"
44#include "llvm/Support/Debug.h"
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "x86-instr-info"
53
54#define GET_INSTRINFO_CTOR_DTOR
55#include "X86GenInstrInfo.inc"
56
57static cl::opt<bool>
58 NoFusing("disable-spill-fusing",
59 cl::desc("Disable fusing of spill code into instructions"),
61static cl::opt<bool>
62 PrintFailedFusing("print-failed-fuse-candidates",
63 cl::desc("Print instructions that the allocator wants to"
64 " fuse, but the X86 backend currently can't"),
66static cl::opt<bool>
67 ReMatPICStubLoad("remat-pic-stub-load",
68 cl::desc("Re-materialize load from stub in PIC mode"),
69 cl::init(false), cl::Hidden);
70static cl::opt<unsigned>
71 PartialRegUpdateClearance("partial-reg-update-clearance",
72 cl::desc("Clearance between two register writes "
73 "for inserting XOR to avoid partial "
74 "register update"),
75 cl::init(64), cl::Hidden);
77 "undef-reg-clearance",
78 cl::desc("How many idle instructions we would like before "
79 "certain undef register reads"),
80 cl::init(128), cl::Hidden);
81
82// Pin the vtable to this file.
83void X86InstrInfo::anchor() {}
84
85X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
86 : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
87 : X86::ADJCALLSTACKDOWN32),
88 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
89 : X86::ADJCALLSTACKUP32),
90 X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
91 Subtarget(STI), RI(STI.getTargetTriple()) {}
92
93const TargetRegisterClass *
94X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
95 const TargetRegisterInfo *TRI,
96 const MachineFunction &MF) const {
97 auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI, MF);
98 // If the target does not have egpr, then r16-r31 will be reserved for all
99 // instructions.
100 if (!RC || !Subtarget.hasEGPR())
101 return RC;
102
103 if (X86II::canUseApxExtendedReg(MCID))
104 return RC;
105
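 // Note that this only narrows the class: e.g. a GR32 operand is constrained
 // to GR32_NOREX2 below, which keeps EAX..R15D available but excludes the
 // APX registers R16D..R31D.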
106 switch (RC->getID()) {
107 default:
108 return RC;
109 case X86::GR8RegClassID:
110 return &X86::GR8_NOREX2RegClass;
111 case X86::GR16RegClassID:
112 return &X86::GR16_NOREX2RegClass;
113 case X86::GR32RegClassID:
114 return &X86::GR32_NOREX2RegClass;
115 case X86::GR64RegClassID:
116 return &X86::GR64_NOREX2RegClass;
117 case X86::GR32_NOSPRegClassID:
118 return &X86::GR32_NOREX2_NOSPRegClass;
119 case X86::GR64_NOSPRegClassID:
120 return &X86::GR64_NOREX2_NOSPRegClass;
121 }
122}
123
125 Register &SrcReg, Register &DstReg,
126 unsigned &SubIdx) const {
127 switch (MI.getOpcode()) {
128 default:
129 break;
130 case X86::MOVSX16rr8:
131 case X86::MOVZX16rr8:
132 case X86::MOVSX32rr8:
133 case X86::MOVZX32rr8:
134 case X86::MOVSX64rr8:
135 if (!Subtarget.is64Bit())
136 // It's not always legal to reference the low 8-bit of the larger
137 // register in 32-bit mode.
138 return false;
139 [[fallthrough]];
140 case X86::MOVSX32rr16:
141 case X86::MOVZX32rr16:
142 case X86::MOVSX64rr16:
143 case X86::MOVSX64rr32: {
144 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
145 // Be conservative.
146 return false;
147 SrcReg = MI.getOperand(1).getReg();
148 DstReg = MI.getOperand(0).getReg();
149 switch (MI.getOpcode()) {
150 default:
151 llvm_unreachable("Unreachable!");
152 case X86::MOVSX16rr8:
153 case X86::MOVZX16rr8:
154 case X86::MOVSX32rr8:
155 case X86::MOVZX32rr8:
156 case X86::MOVSX64rr8:
157 SubIdx = X86::sub_8bit;
158 break;
159 case X86::MOVSX32rr16:
160 case X86::MOVZX32rr16:
161 case X86::MOVSX64rr16:
162 SubIdx = X86::sub_16bit;
163 break;
164 case X86::MOVSX64rr32:
165 SubIdx = X86::sub_32bit;
166 break;
167 }
168 return true;
169 }
170 }
171 return false;
172}
173
174bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
175 if (MI.mayLoad() || MI.mayStore())
176 return false;
177
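 // "Data invariant" here means the instruction's execution time does not
 // depend on the values of its register inputs; it may still write registers
 // and EFLAGS.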
178 // Some target-independent operations that trivially lower to data-invariant
179 // instructions.
180 if (MI.isCopyLike() || MI.isInsertSubreg())
181 return true;
182
183 unsigned Opcode = MI.getOpcode();
184 using namespace X86;
185 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
186 // However, they set flags and are perhaps the most surprisingly constant
187 // time operations so we call them out here separately.
188 if (isIMUL(Opcode))
189 return true;
190 // Bit scanning and counting instructions that are somewhat surprisingly
191 // constant time as they scan across bits and do other fairly complex
192 // operations like popcnt, but are believed to be constant time on x86.
193 // However, these set flags.
194 if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
195 isTZCNT(Opcode))
196 return true;
197 // Bit manipulation instructions are effectively combinations of basic
198 // arithmetic ops, and should still execute in constant time. These also
199 // set flags.
200 if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
201 isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
202 isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
203 isTZMSK(Opcode))
204 return true;
205 // Bit extracting and clearing instructions should execute in constant time,
206 // and set flags.
207 if (isBEXTR(Opcode) || isBZHI(Opcode))
208 return true;
209 // Shift and rotate.
210 if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
211 isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
212 return true;
213 // Basic arithmetic is constant time on the input but does set flags.
214 if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
215 isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
216 return true;
217 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
218 if (isANDN(Opcode))
219 return true;
220 // Unary arithmetic operations.
221 if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
222 return true;
223 // Unlike other arithmetic, NOT doesn't set EFLAGS.
224 if (isNOT(Opcode))
225 return true;
226 // Various move instructions used to zero or sign extend things. Note that we
227 // intentionally don't support the _NOREX variants as we can't handle that
228 // register constraint anyways.
229 if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
230 return true;
231 // Arithmetic instructions that are both constant time and don't set flags.
232 if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
233 return true;
234 // LEA doesn't actually access memory, and its arithmetic is constant time.
235 if (isLEA(Opcode))
236 return true;
237 // By default, assume that the instruction is not data invariant.
238 return false;
239}
240
241bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
242 switch (MI.getOpcode()) {
243 default:
244 // By default, assume that the load will immediately leak.
245 return false;
246
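 // Every opcode listed below falls through to the final `return true`,
 // asserting that the loaded value does not influence timing beyond the
 // memory access itself.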
247 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
248 // However, they set flags and are perhaps the most surprisingly constant
249 // time operations so we call them out here separately.
250 case X86::IMUL16rm:
251 case X86::IMUL16rmi:
252 case X86::IMUL32rm:
253 case X86::IMUL32rmi:
254 case X86::IMUL64rm:
255 case X86::IMUL64rmi32:
256
257 // Bit scanning and counting instructions that are somewhat surprisingly
258 // constant time as they scan across bits and do other fairly complex
259 // operations like popcnt, but are believed to be constant time on x86.
260 // However, these set flags.
261 case X86::BSF16rm:
262 case X86::BSF32rm:
263 case X86::BSF64rm:
264 case X86::BSR16rm:
265 case X86::BSR32rm:
266 case X86::BSR64rm:
267 case X86::LZCNT16rm:
268 case X86::LZCNT32rm:
269 case X86::LZCNT64rm:
270 case X86::POPCNT16rm:
271 case X86::POPCNT32rm:
272 case X86::POPCNT64rm:
273 case X86::TZCNT16rm:
274 case X86::TZCNT32rm:
275 case X86::TZCNT64rm:
276
277 // Bit manipulation instructions are effectively combinations of basic
278 // arithmetic ops, and should still execute in constant time. These also
279 // set flags.
280 case X86::BLCFILL32rm:
281 case X86::BLCFILL64rm:
282 case X86::BLCI32rm:
283 case X86::BLCI64rm:
284 case X86::BLCIC32rm:
285 case X86::BLCIC64rm:
286 case X86::BLCMSK32rm:
287 case X86::BLCMSK64rm:
288 case X86::BLCS32rm:
289 case X86::BLCS64rm:
290 case X86::BLSFILL32rm:
291 case X86::BLSFILL64rm:
292 case X86::BLSI32rm:
293 case X86::BLSI64rm:
294 case X86::BLSIC32rm:
295 case X86::BLSIC64rm:
296 case X86::BLSMSK32rm:
297 case X86::BLSMSK64rm:
298 case X86::BLSR32rm:
299 case X86::BLSR64rm:
300 case X86::TZMSK32rm:
301 case X86::TZMSK64rm:
302
303 // Bit extracting and clearing instructions should execute in constant time,
304 // and set flags.
305 case X86::BEXTR32rm:
306 case X86::BEXTR64rm:
307 case X86::BEXTRI32mi:
308 case X86::BEXTRI64mi:
309 case X86::BZHI32rm:
310 case X86::BZHI64rm:
311
312 // Basic arithmetic is constant time on the input but does set flags.
313 case X86::ADC8rm:
314 case X86::ADC16rm:
315 case X86::ADC32rm:
316 case X86::ADC64rm:
317 case X86::ADD8rm:
318 case X86::ADD16rm:
319 case X86::ADD32rm:
320 case X86::ADD64rm:
321 case X86::AND8rm:
322 case X86::AND16rm:
323 case X86::AND32rm:
324 case X86::AND64rm:
325 case X86::ANDN32rm:
326 case X86::ANDN64rm:
327 case X86::OR8rm:
328 case X86::OR16rm:
329 case X86::OR32rm:
330 case X86::OR64rm:
331 case X86::SBB8rm:
332 case X86::SBB16rm:
333 case X86::SBB32rm:
334 case X86::SBB64rm:
335 case X86::SUB8rm:
336 case X86::SUB16rm:
337 case X86::SUB32rm:
338 case X86::SUB64rm:
339 case X86::XOR8rm:
340 case X86::XOR16rm:
341 case X86::XOR32rm:
342 case X86::XOR64rm:
343
344 // Integer multiply w/o affecting flags is still believed to be constant
345 // time on x86. Called out separately as this is among the most surprising
346 // instructions to exhibit that behavior.
347 case X86::MULX32rm:
348 case X86::MULX64rm:
349
350 // Arithmetic instructions that are both constant time and don't set flags.
351 case X86::RORX32mi:
352 case X86::RORX64mi:
353 case X86::SARX32rm:
354 case X86::SARX64rm:
355 case X86::SHLX32rm:
356 case X86::SHLX64rm:
357 case X86::SHRX32rm:
358 case X86::SHRX64rm:
359
360 // Conversions are believed to be constant time and don't set flags.
361 case X86::CVTTSD2SI64rm:
362 case X86::VCVTTSD2SI64rm:
363 case X86::VCVTTSD2SI64Zrm:
364 case X86::CVTTSD2SIrm:
365 case X86::VCVTTSD2SIrm:
366 case X86::VCVTTSD2SIZrm:
367 case X86::CVTTSS2SI64rm:
368 case X86::VCVTTSS2SI64rm:
369 case X86::VCVTTSS2SI64Zrm:
370 case X86::CVTTSS2SIrm:
371 case X86::VCVTTSS2SIrm:
372 case X86::VCVTTSS2SIZrm:
373 case X86::CVTSI2SDrm:
374 case X86::VCVTSI2SDrm:
375 case X86::VCVTSI2SDZrm:
376 case X86::CVTSI2SSrm:
377 case X86::VCVTSI2SSrm:
378 case X86::VCVTSI2SSZrm:
379 case X86::CVTSI642SDrm:
380 case X86::VCVTSI642SDrm:
381 case X86::VCVTSI642SDZrm:
382 case X86::CVTSI642SSrm:
383 case X86::VCVTSI642SSrm:
384 case X86::VCVTSI642SSZrm:
385 case X86::CVTSS2SDrm:
386 case X86::VCVTSS2SDrm:
387 case X86::VCVTSS2SDZrm:
388 case X86::CVTSD2SSrm:
389 case X86::VCVTSD2SSrm:
390 case X86::VCVTSD2SSZrm:
391 // AVX512 added unsigned integer conversions.
392 case X86::VCVTTSD2USI64Zrm:
393 case X86::VCVTTSD2USIZrm:
394 case X86::VCVTTSS2USI64Zrm:
395 case X86::VCVTTSS2USIZrm:
396 case X86::VCVTUSI2SDZrm:
397 case X86::VCVTUSI642SDZrm:
398 case X86::VCVTUSI2SSZrm:
399 case X86::VCVTUSI642SSZrm:
400
401 // Loads to register don't set flags.
402 case X86::MOV8rm:
403 case X86::MOV8rm_NOREX:
404 case X86::MOV16rm:
405 case X86::MOV32rm:
406 case X86::MOV64rm:
407 case X86::MOVSX16rm8:
408 case X86::MOVSX32rm16:
409 case X86::MOVSX32rm8:
410 case X86::MOVSX32rm8_NOREX:
411 case X86::MOVSX64rm16:
412 case X86::MOVSX64rm32:
413 case X86::MOVSX64rm8:
414 case X86::MOVZX16rm8:
415 case X86::MOVZX32rm16:
416 case X86::MOVZX32rm8:
417 case X86::MOVZX32rm8_NOREX:
418 case X86::MOVZX64rm16:
419 case X86::MOVZX64rm8:
420 return true;
421 }
422}
423
424int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
425 const MachineFunction *MF = MI.getParent()->getParent();
426 const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
427
428 if (isFrameInstr(MI)) {
429 int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
430 SPAdj -= getFrameAdjustment(MI);
431 if (!isFrameSetup(MI))
432 SPAdj = -SPAdj;
433 return SPAdj;
434 }
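 // Example: with a 16-byte stack alignment, "ADJCALLSTACKDOWN64 32, 0, 0"
 // yields +32 here and the matching ADJCALLSTACKUP64 yields -32.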
435
436 // To know whether a call adjusts the stack, we need information
437 // that is bound to the following ADJCALLSTACKUP pseudo.
438 // Look for the next ADJCALLSTACKUP that follows the call.
439 if (MI.isCall()) {
440 const MachineBasicBlock *MBB = MI.getParent();
442 for (auto E = MBB->end(); I != E; ++I) {
443 if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
444 break;
445 }
446
447 // If we could not find a frame destroy opcode, then it has already
448 // been simplified, so we don't care.
449 if (I->getOpcode() != getCallFrameDestroyOpcode())
450 return 0;
451
452 return -(I->getOperand(1).getImm());
453 }
454
455 // Currently handle only PUSHes we can reasonably expect to see
456 // in call sequences
457 switch (MI.getOpcode()) {
458 default:
459 return 0;
460 case X86::PUSH32r:
461 case X86::PUSH32rmm:
462 case X86::PUSH32rmr:
463 case X86::PUSH32i:
464 return 4;
465 case X86::PUSH64r:
466 case X86::PUSH64rmm:
467 case X86::PUSH64rmr:
468 case X86::PUSH64i32:
469 return 8;
470 }
471}
472
473/// Return true and the FrameIndex if the specified
474/// operand and the operands that follow it form a reference to the stack frame.
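/// Only the simplest addressing form is recognized: base = FI, scale = 1, no
/// index register and zero displacement, i.e. the address of a spill slot.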
475bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
476 int &FrameIndex) const {
477 if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
478 MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
479 MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
480 MI.getOperand(Op + X86::AddrDisp).isImm() &&
481 MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
482 MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
483 MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
484 FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
485 return true;
486 }
487 return false;
488}
489
490static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
491 switch (Opcode) {
492 default:
493 return false;
494 case X86::MOV8rm:
495 case X86::KMOVBkm:
496 case X86::KMOVBkm_EVEX:
497 MemBytes = 1;
498 return true;
499 case X86::MOV16rm:
500 case X86::KMOVWkm:
501 case X86::KMOVWkm_EVEX:
502 case X86::VMOVSHZrm:
503 case X86::VMOVSHZrm_alt:
504 MemBytes = 2;
505 return true;
506 case X86::MOV32rm:
507 case X86::MOVSSrm:
508 case X86::MOVSSrm_alt:
509 case X86::VMOVSSrm:
510 case X86::VMOVSSrm_alt:
511 case X86::VMOVSSZrm:
512 case X86::VMOVSSZrm_alt:
513 case X86::KMOVDkm:
514 case X86::KMOVDkm_EVEX:
515 MemBytes = 4;
516 return true;
517 case X86::MOV64rm:
518 case X86::LD_Fp64m:
519 case X86::MOVSDrm:
520 case X86::MOVSDrm_alt:
521 case X86::VMOVSDrm:
522 case X86::VMOVSDrm_alt:
523 case X86::VMOVSDZrm:
524 case X86::VMOVSDZrm_alt:
525 case X86::MMX_MOVD64rm:
526 case X86::MMX_MOVQ64rm:
527 case X86::KMOVQkm:
528 case X86::KMOVQkm_EVEX:
529 MemBytes = 8;
530 return true;
531 case X86::MOVAPSrm:
532 case X86::MOVUPSrm:
533 case X86::MOVAPDrm:
534 case X86::MOVUPDrm:
535 case X86::MOVDQArm:
536 case X86::MOVDQUrm:
537 case X86::VMOVAPSrm:
538 case X86::VMOVUPSrm:
539 case X86::VMOVAPDrm:
540 case X86::VMOVUPDrm:
541 case X86::VMOVDQArm:
542 case X86::VMOVDQUrm:
543 case X86::VMOVAPSZ128rm:
544 case X86::VMOVUPSZ128rm:
545 case X86::VMOVAPSZ128rm_NOVLX:
546 case X86::VMOVUPSZ128rm_NOVLX:
547 case X86::VMOVAPDZ128rm:
548 case X86::VMOVUPDZ128rm:
549 case X86::VMOVDQU8Z128rm:
550 case X86::VMOVDQU16Z128rm:
551 case X86::VMOVDQA32Z128rm:
552 case X86::VMOVDQU32Z128rm:
553 case X86::VMOVDQA64Z128rm:
554 case X86::VMOVDQU64Z128rm:
555 MemBytes = 16;
556 return true;
557 case X86::VMOVAPSYrm:
558 case X86::VMOVUPSYrm:
559 case X86::VMOVAPDYrm:
560 case X86::VMOVUPDYrm:
561 case X86::VMOVDQAYrm:
562 case X86::VMOVDQUYrm:
563 case X86::VMOVAPSZ256rm:
564 case X86::VMOVUPSZ256rm:
565 case X86::VMOVAPSZ256rm_NOVLX:
566 case X86::VMOVUPSZ256rm_NOVLX:
567 case X86::VMOVAPDZ256rm:
568 case X86::VMOVUPDZ256rm:
569 case X86::VMOVDQU8Z256rm:
570 case X86::VMOVDQU16Z256rm:
571 case X86::VMOVDQA32Z256rm:
572 case X86::VMOVDQU32Z256rm:
573 case X86::VMOVDQA64Z256rm:
574 case X86::VMOVDQU64Z256rm:
575 MemBytes = 32;
576 return true;
577 case X86::VMOVAPSZrm:
578 case X86::VMOVUPSZrm:
579 case X86::VMOVAPDZrm:
580 case X86::VMOVUPDZrm:
581 case X86::VMOVDQU8Zrm:
582 case X86::VMOVDQU16Zrm:
583 case X86::VMOVDQA32Zrm:
584 case X86::VMOVDQU32Zrm:
585 case X86::VMOVDQA64Zrm:
586 case X86::VMOVDQU64Zrm:
587 MemBytes = 64;
588 return true;
589 }
590}
591
592static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
593 switch (Opcode) {
594 default:
595 return false;
596 case X86::MOV8mr:
597 case X86::KMOVBmk:
598 case X86::KMOVBmk_EVEX:
599 MemBytes = 1;
600 return true;
601 case X86::MOV16mr:
602 case X86::KMOVWmk:
603 case X86::KMOVWmk_EVEX:
604 case X86::VMOVSHZmr:
605 MemBytes = 2;
606 return true;
607 case X86::MOV32mr:
608 case X86::MOVSSmr:
609 case X86::VMOVSSmr:
610 case X86::VMOVSSZmr:
611 case X86::KMOVDmk:
612 case X86::KMOVDmk_EVEX:
613 MemBytes = 4;
614 return true;
615 case X86::MOV64mr:
616 case X86::ST_FpP64m:
617 case X86::MOVSDmr:
618 case X86::VMOVSDmr:
619 case X86::VMOVSDZmr:
620 case X86::MMX_MOVD64mr:
621 case X86::MMX_MOVQ64mr:
622 case X86::MMX_MOVNTQmr:
623 case X86::KMOVQmk:
624 case X86::KMOVQmk_EVEX:
625 MemBytes = 8;
626 return true;
627 case X86::MOVAPSmr:
628 case X86::MOVUPSmr:
629 case X86::MOVAPDmr:
630 case X86::MOVUPDmr:
631 case X86::MOVDQAmr:
632 case X86::MOVDQUmr:
633 case X86::VMOVAPSmr:
634 case X86::VMOVUPSmr:
635 case X86::VMOVAPDmr:
636 case X86::VMOVUPDmr:
637 case X86::VMOVDQAmr:
638 case X86::VMOVDQUmr:
639 case X86::VMOVUPSZ128mr:
640 case X86::VMOVAPSZ128mr:
641 case X86::VMOVUPSZ128mr_NOVLX:
642 case X86::VMOVAPSZ128mr_NOVLX:
643 case X86::VMOVUPDZ128mr:
644 case X86::VMOVAPDZ128mr:
645 case X86::VMOVDQA32Z128mr:
646 case X86::VMOVDQU32Z128mr:
647 case X86::VMOVDQA64Z128mr:
648 case X86::VMOVDQU64Z128mr:
649 case X86::VMOVDQU8Z128mr:
650 case X86::VMOVDQU16Z128mr:
651 MemBytes = 16;
652 return true;
653 case X86::VMOVUPSYmr:
654 case X86::VMOVAPSYmr:
655 case X86::VMOVUPDYmr:
656 case X86::VMOVAPDYmr:
657 case X86::VMOVDQUYmr:
658 case X86::VMOVDQAYmr:
659 case X86::VMOVUPSZ256mr:
660 case X86::VMOVAPSZ256mr:
661 case X86::VMOVUPSZ256mr_NOVLX:
662 case X86::VMOVAPSZ256mr_NOVLX:
663 case X86::VMOVUPDZ256mr:
664 case X86::VMOVAPDZ256mr:
665 case X86::VMOVDQU8Z256mr:
666 case X86::VMOVDQU16Z256mr:
667 case X86::VMOVDQA32Z256mr:
668 case X86::VMOVDQU32Z256mr:
669 case X86::VMOVDQA64Z256mr:
670 case X86::VMOVDQU64Z256mr:
671 MemBytes = 32;
672 return true;
673 case X86::VMOVUPSZmr:
674 case X86::VMOVAPSZmr:
675 case X86::VMOVUPDZmr:
676 case X86::VMOVAPDZmr:
677 case X86::VMOVDQU8Zmr:
678 case X86::VMOVDQU16Zmr:
679 case X86::VMOVDQA32Zmr:
680 case X86::VMOVDQU32Zmr:
681 case X86::VMOVDQA64Zmr:
682 case X86::VMOVDQU64Zmr:
683 MemBytes = 64;
684 return true;
685 }
686 return false;
687}
688
690 int &FrameIndex) const {
691 unsigned Dummy;
692 return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
693}
694
696 int &FrameIndex,
697 unsigned &MemBytes) const {
698 if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
699 if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
700 return MI.getOperand(0).getReg();
701 return 0;
702}
703
705 int &FrameIndex) const {
706 unsigned Dummy;
707 if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
708 unsigned Reg;
709 if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
710 return Reg;
711 // Check for post-frame index elimination operations
712 SmallVector<const MachineMemOperand *, 1> Accesses;
713 if (hasLoadFromStackSlot(MI, Accesses)) {
714 FrameIndex =
715 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
716 ->getFrameIndex();
717 return MI.getOperand(0).getReg();
718 }
719 }
720 return 0;
721}
722
724 int &FrameIndex) const {
725 unsigned Dummy;
726 return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
727}
728
730 int &FrameIndex,
731 unsigned &MemBytes) const {
732 if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
733 if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
734 isFrameOperand(MI, 0, FrameIndex))
735 return MI.getOperand(X86::AddrNumOperands).getReg();
736 return 0;
737}
738
740 int &FrameIndex) const {
741 unsigned Dummy;
742 if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
743 unsigned Reg;
744 if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
745 return Reg;
746 // Check for post-frame index elimination operations
747 SmallVector<const MachineMemOperand *, 1> Accesses;
748 if (hasStoreToStackSlot(MI, Accesses)) {
749 FrameIndex =
750 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
751 ->getFrameIndex();
752 return MI.getOperand(X86::AddrNumOperands).getReg();
753 }
754 }
755 return 0;
756}
757
758/// Return true if the register is a PIC base, i.e. defined by X86::MOVPC32r.
759static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
760 // Don't waste compile time scanning use-def chains of physregs.
761 if (!BaseReg.isVirtual())
762 return false;
763 bool isPICBase = false;
764 for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) {
765 if (DefMI.getOpcode() != X86::MOVPC32r)
766 return false;
767 assert(!isPICBase && "More than one PIC base?");
768 isPICBase = true;
769 }
770 return isPICBase;
771}
772
774 const MachineInstr &MI) const {
775 switch (MI.getOpcode()) {
776 default:
777 // This function should only be called for opcodes with the ReMaterializable
778 // flag set.
779 llvm_unreachable("Unknown rematerializable operation!");
780 break;
781 case X86::IMPLICIT_DEF:
782 // Defer to generic logic.
783 break;
784 case X86::LOAD_STACK_GUARD:
785 case X86::LD_Fp032:
786 case X86::LD_Fp064:
787 case X86::LD_Fp080:
788 case X86::LD_Fp132:
789 case X86::LD_Fp164:
790 case X86::LD_Fp180:
791 case X86::AVX1_SETALLONES:
792 case X86::AVX2_SETALLONES:
793 case X86::AVX512_128_SET0:
794 case X86::AVX512_256_SET0:
795 case X86::AVX512_512_SET0:
796 case X86::AVX512_512_SETALLONES:
797 case X86::AVX512_FsFLD0SD:
798 case X86::AVX512_FsFLD0SH:
799 case X86::AVX512_FsFLD0SS:
800 case X86::AVX512_FsFLD0F128:
801 case X86::AVX_SET0:
802 case X86::FsFLD0SD:
803 case X86::FsFLD0SS:
804 case X86::FsFLD0SH:
805 case X86::FsFLD0F128:
806 case X86::KSET0D:
807 case X86::KSET0Q:
808 case X86::KSET0W:
809 case X86::KSET1D:
810 case X86::KSET1Q:
811 case X86::KSET1W:
812 case X86::MMX_SET0:
813 case X86::MOV32ImmSExti8:
814 case X86::MOV32r0:
815 case X86::MOV32r1:
816 case X86::MOV32r_1:
817 case X86::MOV32ri64:
818 case X86::MOV64ImmSExti8:
819 case X86::V_SET0:
820 case X86::V_SETALLONES:
821 case X86::MOV16ri:
822 case X86::MOV32ri:
823 case X86::MOV64ri:
824 case X86::MOV64ri32:
825 case X86::MOV8ri:
826 case X86::PTILEZEROV:
827 return true;
828
829 case X86::MOV8rm:
830 case X86::MOV8rm_NOREX:
831 case X86::MOV16rm:
832 case X86::MOV32rm:
833 case X86::MOV64rm:
834 case X86::MOVSSrm:
835 case X86::MOVSSrm_alt:
836 case X86::MOVSDrm:
837 case X86::MOVSDrm_alt:
838 case X86::MOVAPSrm:
839 case X86::MOVUPSrm:
840 case X86::MOVAPDrm:
841 case X86::MOVUPDrm:
842 case X86::MOVDQArm:
843 case X86::MOVDQUrm:
844 case X86::VMOVSSrm:
845 case X86::VMOVSSrm_alt:
846 case X86::VMOVSDrm:
847 case X86::VMOVSDrm_alt:
848 case X86::VMOVAPSrm:
849 case X86::VMOVUPSrm:
850 case X86::VMOVAPDrm:
851 case X86::VMOVUPDrm:
852 case X86::VMOVDQArm:
853 case X86::VMOVDQUrm:
854 case X86::VMOVAPSYrm:
855 case X86::VMOVUPSYrm:
856 case X86::VMOVAPDYrm:
857 case X86::VMOVUPDYrm:
858 case X86::VMOVDQAYrm:
859 case X86::VMOVDQUYrm:
860 case X86::MMX_MOVD64rm:
861 case X86::MMX_MOVQ64rm:
862 case X86::VBROADCASTSSrm:
863 case X86::VBROADCASTSSYrm:
864 case X86::VBROADCASTSDYrm:
865 // AVX-512
866 case X86::VPBROADCASTBZ128rm:
867 case X86::VPBROADCASTBZ256rm:
868 case X86::VPBROADCASTBZrm:
869 case X86::VBROADCASTF32X2Z256rm:
870 case X86::VBROADCASTF32X2Zrm:
871 case X86::VBROADCASTI32X2Z128rm:
872 case X86::VBROADCASTI32X2Z256rm:
873 case X86::VBROADCASTI32X2Zrm:
874 case X86::VPBROADCASTWZ128rm:
875 case X86::VPBROADCASTWZ256rm:
876 case X86::VPBROADCASTWZrm:
877 case X86::VPBROADCASTDZ128rm:
878 case X86::VPBROADCASTDZ256rm:
879 case X86::VPBROADCASTDZrm:
880 case X86::VBROADCASTSSZ128rm:
881 case X86::VBROADCASTSSZ256rm:
882 case X86::VBROADCASTSSZrm:
883 case X86::VPBROADCASTQZ128rm:
884 case X86::VPBROADCASTQZ256rm:
885 case X86::VPBROADCASTQZrm:
886 case X86::VBROADCASTSDZ256rm:
887 case X86::VBROADCASTSDZrm:
888 case X86::VMOVSSZrm:
889 case X86::VMOVSSZrm_alt:
890 case X86::VMOVSDZrm:
891 case X86::VMOVSDZrm_alt:
892 case X86::VMOVSHZrm:
893 case X86::VMOVSHZrm_alt:
894 case X86::VMOVAPDZ128rm:
895 case X86::VMOVAPDZ256rm:
896 case X86::VMOVAPDZrm:
897 case X86::VMOVAPSZ128rm:
898 case X86::VMOVAPSZ256rm:
899 case X86::VMOVAPSZ128rm_NOVLX:
900 case X86::VMOVAPSZ256rm_NOVLX:
901 case X86::VMOVAPSZrm:
902 case X86::VMOVDQA32Z128rm:
903 case X86::VMOVDQA32Z256rm:
904 case X86::VMOVDQA32Zrm:
905 case X86::VMOVDQA64Z128rm:
906 case X86::VMOVDQA64Z256rm:
907 case X86::VMOVDQA64Zrm:
908 case X86::VMOVDQU16Z128rm:
909 case X86::VMOVDQU16Z256rm:
910 case X86::VMOVDQU16Zrm:
911 case X86::VMOVDQU32Z128rm:
912 case X86::VMOVDQU32Z256rm:
913 case X86::VMOVDQU32Zrm:
914 case X86::VMOVDQU64Z128rm:
915 case X86::VMOVDQU64Z256rm:
916 case X86::VMOVDQU64Zrm:
917 case X86::VMOVDQU8Z128rm:
918 case X86::VMOVDQU8Z256rm:
919 case X86::VMOVDQU8Zrm:
920 case X86::VMOVUPDZ128rm:
921 case X86::VMOVUPDZ256rm:
922 case X86::VMOVUPDZrm:
923 case X86::VMOVUPSZ128rm:
924 case X86::VMOVUPSZ256rm:
925 case X86::VMOVUPSZ128rm_NOVLX:
926 case X86::VMOVUPSZ256rm_NOVLX:
927 case X86::VMOVUPSZrm: {
928 // Loads from constant pools are trivially rematerializable.
929 if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
930 MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
931 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
932 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
933 MI.isDereferenceableInvariantLoad()) {
934 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
935 if (BaseReg == 0 || BaseReg == X86::RIP)
936 return true;
937 // Allow re-materialization of PIC load.
938 if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) {
939 const MachineFunction &MF = *MI.getParent()->getParent();
940 const MachineRegisterInfo &MRI = MF.getRegInfo();
941 if (regIsPICBase(BaseReg, MRI))
942 return true;
943 }
944 }
945 break;
946 }
947
948 case X86::LEA32r:
949 case X86::LEA64r: {
950 if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
951 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
952 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
953 !MI.getOperand(1 + X86::AddrDisp).isReg()) {
954 // lea fi#, lea GV, etc. are all rematerializable.
955 if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
956 return true;
957 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
958 if (BaseReg == 0)
959 return true;
960 // Allow re-materialization of lea PICBase + x.
961 const MachineFunction &MF = *MI.getParent()->getParent();
962 const MachineRegisterInfo &MRI = MF.getRegInfo();
963 if (regIsPICBase(BaseReg, MRI))
964 return true;
965 }
966 break;
967 }
968 }
969 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
970}
971
972void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
973 MachineBasicBlock::iterator I,
974 Register DestReg, unsigned SubIdx,
975 const MachineInstr &Orig,
976 const TargetRegisterInfo &TRI) const {
977 bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
978 if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
979 MachineBasicBlock::LQR_Dead) {
980 // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
981 // effects.
982 int Value;
983 switch (Orig.getOpcode()) {
984 case X86::MOV32r0:
985 Value = 0;
986 break;
987 case X86::MOV32r1:
988 Value = 1;
989 break;
990 case X86::MOV32r_1:
991 Value = -1;
992 break;
993 default:
994 llvm_unreachable("Unexpected instruction!");
995 }
996
997 const DebugLoc &DL = Orig.getDebugLoc();
998 BuildMI(MBB, I, DL, get(X86::MOV32ri))
999 .add(Orig.getOperand(0))
1000 .addImm(Value);
1001 } else {
1002 MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
1003 MBB.insert(I, MI);
1004 }
1005
1006 MachineInstr &NewMI = *std::prev(I);
1007 NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
1008}
1009
1010/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
1011static bool hasLiveCondCodeDef(MachineInstr &MI) {
1012 for (const MachineOperand &MO : MI.operands()) {
1013 if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
1014 !MO.isDead()) {
1015 return true;
1016 }
1017 }
1018 return false;
1019}
1020
1021/// Return the shift count for a machine operand, truncated as the hardware does.
1022inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1023 unsigned ShiftAmtOperandIdx) {
1024 // The shift count is six bits with the REX.W prefix and five bits without.
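 // For example, an immediate of 65 on a 64-bit shift truncates to 1, and an
 // immediate of 32 on a 32-bit shift truncates to 0.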
1025 unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1026 unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1027 return Imm & ShiftCountMask;
1028}
1029
1030/// Check whether the given shift count can be represented as the scale
1031/// factor of a LEA instruction.
1032inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
1033 // Left shift instructions can be transformed into load-effective-address
1034 // instructions if we can encode them appropriately.
1035 // A LEA instruction utilizes a SIB byte to encode its scale factor.
1036 // The SIB.scale field is two bits wide which means that we can encode any
1037 // shift amount less than 4.
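 // In other words, only shifts by 1, 2 or 3 (scale factors 2, 4 and 8) are
 // representable.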
1038 return ShAmt < 4 && ShAmt > 0;
1039}
1040
1042 MachineInstr &CmpValDefInstr,
1043 const MachineRegisterInfo *MRI,
1044 MachineInstr **AndInstr,
1045 const TargetRegisterInfo *TRI,
1046 bool &NoSignFlag, bool &ClearsOverflowFlag) {
1047 if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG &&
1048 CmpInstr.getOpcode() == X86::TEST64rr) &&
1049 !(CmpValDefInstr.getOpcode() == X86::COPY &&
1050 CmpInstr.getOpcode() == X86::TEST16rr))
1051 return false;
1052
1053 // CmpInstr is a TEST16rr/TEST64rr instruction, and
1054 // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if two
1055 // registers are identical.
1056 assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
1057 "CmpInstr is an analyzable TEST16rr/TEST64rr, and "
1058 "`X86InstrInfo::analyzeCompare` requires two reg operands are the"
1059 "same.");
1060
1061 // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
1062 // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
1063 // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
1064 // redundant.
1065 assert(
1066 (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
1067 "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr "
1068 "is a user of COPY sub16bit.");
1069 MachineInstr *VregDefInstr = nullptr;
1070 if (CmpInstr.getOpcode() == X86::TEST16rr) {
1071 if (!CmpValDefInstr.getOperand(1).getReg().isVirtual())
1072 return false;
1073 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
1074 if (!VregDefInstr)
1075 return false;
1076 // We can only remove the TEST if the AND is an AND32ri or AND64ri32 whose
1077 // immediate fits in 16 bits; wider AND forms would set flags based on
1078 // higher bits that TEST16rr does not examine.
1079 if (!((VregDefInstr->getOpcode() == X86::AND32ri ||
1080 VregDefInstr->getOpcode() == X86::AND64ri32) &&
1081 isUInt<16>(VregDefInstr->getOperand(2).getImm())))
1082 return false;
1083 }
1084
1085 if (CmpInstr.getOpcode() == X86::TEST64rr) {
1086 // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is
1087 // typically 0.
1088 if (CmpValDefInstr.getOperand(1).getImm() != 0)
1089 return false;
1090
1091 // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
1092 // sub_32bit or sub_xmm.
1093 if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
1094 return false;
1095
1096 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
1097 }
1098
1099 assert(VregDefInstr && "Must have a definition (SSA)");
1100
1101 // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
1102 // to simplify the subsequent analysis.
1103 //
1104 // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
1105 // `CmpValDefInstr.getParent()`, this could be handled.
1106 if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
1107 return false;
1108
1109 if (X86::isAND(VregDefInstr->getOpcode())) {
1110 // Get a sequence of instructions like
1111 // %reg = and* ... // Set EFLAGS
1112 // ... // EFLAGS not changed
1113 // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
1114 // test64rr %extended_reg, %extended_reg, implicit-def $eflags
1115 // or
1116 // %reg = and32* ...
1117 // ... // EFLAGS not changed.
1118 // %src_reg = copy %reg.sub_16bit:gr32
1119 // test16rr %src_reg, %src_reg, implicit-def $eflags
1120 //
1121 // If subsequent readers use a subset of bits that don't change
1122 // after `and*` instructions, it's likely that the test64rr could
1123 // be optimized away.
1124 for (const MachineInstr &Instr :
1125 make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
1126 MachineBasicBlock::iterator(CmpValDefInstr))) {
1127 // There are instructions between 'VregDefInstr' and
1128 // 'CmpValDefInstr' that modifies EFLAGS.
1129 if (Instr.modifiesRegister(X86::EFLAGS, TRI))
1130 return false;
1131 }
1132
1133 *AndInstr = VregDefInstr;
1134
1135 // AND instruction will essentially update SF and clear OF, so
1136 // NoSignFlag should be false in the sense that SF is modified by `AND`.
1137 //
1138 // However, the implementation artificially sets `NoSignFlag` to true
1139 // to poison the SF bit; that is to say, if SF is looked at later, the
1140 // optimization (to erase TEST64rr) will be disabled.
1141 //
1142 // The reason to poison SF bit is that SF bit value could be different
1143 // in the `AND` and `TEST` operation; signed bit is not known for `AND`,
1144 // and is known to be 0 as a result of `TEST64rr`.
1145 //
1146 // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
1147 // the AND instruction and using the static information to guide peephole
1148 // optimization if possible. For example, it's possible to fold a
1149 // conditional move into a copy if the relevant EFLAG bits could be deduced
1150 // from an immediate operand of and operation.
1151 //
1152 NoSignFlag = true;
1153 // ClearsOverflowFlag is true for AND operation (no surprise).
1154 ClearsOverflowFlag = true;
1155 return true;
1156 }
1157 return false;
1158}
1159
1160bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
1161 unsigned Opc, bool AllowSP, Register &NewSrc,
1162 bool &isKill, MachineOperand &ImplicitOp,
1163 LiveVariables *LV, LiveIntervals *LIS) const {
1164 MachineFunction &MF = *MI.getParent()->getParent();
1165 const TargetRegisterClass *RC;
1166 if (AllowSP) {
1167 RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1168 } else {
1169 RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1170 }
1171 Register SrcReg = Src.getReg();
1172 isKill = MI.killsRegister(SrcReg, /*TRI=*/nullptr);
1173
1174 // For both LEA64 and LEA32 the register already has essentially the right
1175 // type (32-bit or 64-bit); we may just need to forbid SP.
1176 if (Opc != X86::LEA64_32r) {
1177 NewSrc = SrcReg;
1178 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1179
1180 if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1181 return false;
1182
1183 return true;
1184 }
1185
1186 // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1187 // another we need to add 64-bit registers to the final MI.
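 // e.g. a 32-bit source in %ecx must be referenced as %rcx by the LEA64_32r,
 // either directly (physical register) or via a fresh 64-bit vreg built below.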
1188 if (SrcReg.isPhysical()) {
1189 ImplicitOp = Src;
1190 ImplicitOp.setImplicit();
1191
1192 NewSrc = getX86SubSuperRegister(SrcReg, 64);
1193 assert(NewSrc.isValid() && "Invalid Operand");
1194 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1195 } else {
1196 // The source is a virtual register of the wrong class; create a temporary 64-bit
1197 // vreg to feed into the LEA.
1198 NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1199 MachineInstr *Copy =
1200 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1201 .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1202 .addReg(SrcReg, getKillRegState(isKill));
1203
1204 // Which is obviously going to be dead after we're done with it.
1205 isKill = true;
1206
1207 if (LV)
1208 LV->replaceKillInstruction(SrcReg, MI, *Copy);
1209
1210 if (LIS) {
1211 SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1213 LiveInterval &LI = LIS->getInterval(SrcReg);
1215 if (S->end.getBaseIndex() == Idx)
1216 S->end = CopyIdx.getRegSlot();
1217 }
1218 }
1219
1220 // We've set all the parameters without issue.
1221 return true;
1222}
1223
1224MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1225 MachineInstr &MI,
1226 LiveVariables *LV,
1227 LiveIntervals *LIS,
1228 bool Is8BitOp) const {
1229 // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1230 MachineBasicBlock &MBB = *MI.getParent();
1231 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
1232 assert((Is8BitOp ||
1233 RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1234 *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1235 "Unexpected type for LEA transform");
1236
1237 // TODO: For a 32-bit target, we need to adjust the LEA variables with
1238 // something like this:
1239 // Opcode = X86::LEA32r;
1240 // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1241 // OutRegLEA =
1242 // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1243 // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1244 if (!Subtarget.is64Bit())
1245 return nullptr;
1246
1247 unsigned Opcode = X86::LEA64_32r;
1248 Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1249 Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1250 Register InRegLEA2;
1251
1252 // Build and insert into an implicit UNDEF value. This is OK because
1253 // we will be shifting and then extracting the lower 8/16-bits.
1254 // This has the potential to cause a partial register stall, e.g.
1255 // movw (%rbp,%rcx,2), %dx
1256 // leal -65(%rdx), %esi
1257 // But testing has shown this *does* help performance in 64-bit mode (at
1258 // least on modern x86 machines).
1259 MachineBasicBlock::iterator MBBI = MI.getIterator();
1260 Register Dest = MI.getOperand(0).getReg();
1261 Register Src = MI.getOperand(1).getReg();
1262 Register Src2;
1263 bool IsDead = MI.getOperand(0).isDead();
1264 bool IsKill = MI.getOperand(1).isKill();
1265 unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1266 assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1267 MachineInstr *ImpDef =
1268 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1269 MachineInstr *InsMI =
1270 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1271 .addReg(InRegLEA, RegState::Define, SubReg)
1272 .addReg(Src, getKillRegState(IsKill));
1273 MachineInstr *ImpDef2 = nullptr;
1274 MachineInstr *InsMI2 = nullptr;
1275
1276 MachineInstrBuilder MIB =
1277 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1278 switch (MIOpc) {
1279 default:
1280 llvm_unreachable("Unreachable!");
1281 case X86::SHL8ri:
1282 case X86::SHL16ri: {
1283 unsigned ShAmt = MI.getOperand(2).getImm();
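 // e.g. "shl $2, %ax" becomes "leal (,%inreg,4), %outreg" on the widened
 // registers created above.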
1284 MIB.addReg(0)
1285 .addImm(1LL << ShAmt)
1286 .addReg(InRegLEA, RegState::Kill)
1287 .addImm(0)
1288 .addReg(0);
1289 break;
1290 }
1291 case X86::INC8r:
1292 case X86::INC16r:
1293 addRegOffset(MIB, InRegLEA, true, 1);
1294 break;
1295 case X86::DEC8r:
1296 case X86::DEC16r:
1297 addRegOffset(MIB, InRegLEA, true, -1);
1298 break;
1299 case X86::ADD8ri:
1300 case X86::ADD8ri_DB:
1301 case X86::ADD16ri:
1302 case X86::ADD16ri_DB:
1303 addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1304 break;
1305 case X86::ADD8rr:
1306 case X86::ADD8rr_DB:
1307 case X86::ADD16rr:
1308 case X86::ADD16rr_DB: {
1309 Src2 = MI.getOperand(2).getReg();
1310 bool IsKill2 = MI.getOperand(2).isKill();
1311 assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1312 if (Src == Src2) {
1313 // ADD8rr/ADD16rr killed %reg1028, %reg1028
1314 // just a single insert_subreg.
1315 addRegReg(MIB, InRegLEA, true, InRegLEA, false);
1316 } else {
1317 if (Subtarget.is64Bit())
1318 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1319 else
1320 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1321 // Build and insert into an implicit UNDEF value. This is OK because
1322 // we will be shifting and then extracting the lower 8/16-bits.
1323 ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
1324 InRegLEA2);
1325 InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1326 .addReg(InRegLEA2, RegState::Define, SubReg)
1327 .addReg(Src2, getKillRegState(IsKill2));
1328 addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
1329 }
1330 if (LV && IsKill2 && InsMI2)
1331 LV->replaceKillInstruction(Src2, MI, *InsMI2);
1332 break;
1333 }
1334 }
1335
1336 MachineInstr *NewMI = MIB;
1337 MachineInstr *ExtMI =
1338 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1339 .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
1340 .addReg(OutRegLEA, RegState::Kill, SubReg);
1341
1342 if (LV) {
1343 // Update live variables.
1344 LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1345 if (InRegLEA2)
1346 LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
1347 LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1348 if (IsKill)
1349 LV->replaceKillInstruction(Src, MI, *InsMI);
1350 if (IsDead)
1351 LV->replaceKillInstruction(Dest, MI, *ExtMI);
1352 }
1353
1354 if (LIS) {
1355 LIS->InsertMachineInstrInMaps(*ImpDef);
1356 SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
1357 if (ImpDef2)
1358 LIS->InsertMachineInstrInMaps(*ImpDef2);
1359 SlotIndex Ins2Idx;
1360 if (InsMI2)
1361 Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
1362 SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1363 SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
1364 LIS->getInterval(InRegLEA);
1365 LIS->getInterval(OutRegLEA);
1366 if (InRegLEA2)
1367 LIS->getInterval(InRegLEA2);
1368
1369 // Move the use of Src up to InsMI.
1370 LiveInterval &SrcLI = LIS->getInterval(Src);
1371 LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
1372 if (SrcSeg->end == NewIdx.getRegSlot())
1373 SrcSeg->end = InsIdx.getRegSlot();
1374
1375 if (InsMI2) {
1376 // Move the use of Src2 up to InsMI2.
1377 LiveInterval &Src2LI = LIS->getInterval(Src2);
1378 LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
1379 if (Src2Seg->end == NewIdx.getRegSlot())
1380 Src2Seg->end = Ins2Idx.getRegSlot();
1381 }
1382
1383 // Move the definition of Dest down to ExtMI.
1384 LiveInterval &DestLI = LIS->getInterval(Dest);
1385 LiveRange::Segment *DestSeg =
1386 DestLI.getSegmentContaining(NewIdx.getRegSlot());
1387 assert(DestSeg->start == NewIdx.getRegSlot() &&
1388 DestSeg->valno->def == NewIdx.getRegSlot());
1389 DestSeg->start = ExtIdx.getRegSlot();
1390 DestSeg->valno->def = ExtIdx.getRegSlot();
1391 }
1392
1393 return ExtMI;
1394}
1395
1396/// This method must be implemented by targets that
1397/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1398/// may be able to convert a two-address instruction into a true
1399/// three-address instruction on demand. This allows the X86 target (for
1400/// example) to convert ADD and SHL instructions into LEA instructions if they
1401/// would require register copies due to two-addressness.
1402///
1403/// This method returns a null pointer if the transformation cannot be
1404/// performed, otherwise it returns the new instruction.
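/// For example, an ADD32rr whose EFLAGS def is dead can become a
/// three-address "lea (%src1,%src2), %dst", avoiding the copy forced by the
/// tied two-address form.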
1405///
1406MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
1407 LiveVariables *LV,
1408 LiveIntervals *LIS) const {
1409 // The following opcodes also set the condition code register(s). Only
1410 // convert them to the equivalent LEA if the condition code register defs
1411 // are dead!
1412 if (hasLiveCondCodeDef(MI))
1413 return nullptr;
1414
1415 MachineFunction &MF = *MI.getParent()->getParent();
1416 // All instructions input are two-addr instructions. Get the known operands.
1417 const MachineOperand &Dest = MI.getOperand(0);
1418 const MachineOperand &Src = MI.getOperand(1);
1419
1420 // Ideally, operations with undef should be folded before we get here, but we
1421 // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1422 // Without this, we have to forward undef state to new register operands to
1423 // avoid machine verifier errors.
1424 if (Src.isUndef())
1425 return nullptr;
1426 if (MI.getNumOperands() > 2)
1427 if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1428 return nullptr;
1429
1430 MachineInstr *NewMI = nullptr;
1431 Register SrcReg, SrcReg2;
1432 bool Is64Bit = Subtarget.is64Bit();
1433
1434 bool Is8BitOp = false;
1435 unsigned NumRegOperands = 2;
1436 unsigned MIOpc = MI.getOpcode();
1437 switch (MIOpc) {
1438 default:
1439 llvm_unreachable("Unreachable!");
1440 case X86::SHL64ri: {
1441 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1442 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1443 if (!isTruncatedShiftCountForLEA(ShAmt))
1444 return nullptr;
1445
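 // e.g. "shlq $3, %rsrc" becomes "leaq (,%rsrc,8), %rdst".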
1446 // LEA can't handle RSP.
1447 if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1448 Src.getReg(), &X86::GR64_NOSPRegClass))
1449 return nullptr;
1450
1451 NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1452 .add(Dest)
1453 .addReg(0)
1454 .addImm(1LL << ShAmt)
1455 .add(Src)
1456 .addImm(0)
1457 .addReg(0);
1458 break;
1459 }
1460 case X86::SHL32ri: {
1461 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1462 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1463 if (!isTruncatedShiftCountForLEA(ShAmt))
1464 return nullptr;
1465
1466 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1467
1468 // LEA can't handle ESP.
1469 bool isKill;
1470 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1471 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1472 ImplicitOp, LV, LIS))
1473 return nullptr;
1474
1475 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1476 .add(Dest)
1477 .addReg(0)
1478 .addImm(1LL << ShAmt)
1479 .addReg(SrcReg, getKillRegState(isKill))
1480 .addImm(0)
1481 .addReg(0);
1482 if (ImplicitOp.getReg() != 0)
1483 MIB.add(ImplicitOp);
1484 NewMI = MIB;
1485
1486 // Add kills if classifyLEAReg created a new register.
1487 if (LV && SrcReg != Src.getReg())
1488 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1489 break;
1490 }
1491 case X86::SHL8ri:
1492 Is8BitOp = true;
1493 [[fallthrough]];
1494 case X86::SHL16ri: {
1495 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1496 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1497 if (!isTruncatedShiftCountForLEA(ShAmt))
1498 return nullptr;
1499 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1500 }
1501 case X86::INC64r:
1502 case X86::INC32r: {
1503 assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1504 unsigned Opc = MIOpc == X86::INC64r
1505 ? X86::LEA64r
1506 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1507 bool isKill;
1508 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1509 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1510 ImplicitOp, LV, LIS))
1511 return nullptr;
1512
1513 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1514 .add(Dest)
1515 .addReg(SrcReg, getKillRegState(isKill));
1516 if (ImplicitOp.getReg() != 0)
1517 MIB.add(ImplicitOp);
1518
1519 NewMI = addOffset(MIB, 1);
1520
1521 // Add kills if classifyLEAReg created a new register.
1522 if (LV && SrcReg != Src.getReg())
1523 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1524 break;
1525 }
1526 case X86::DEC64r:
1527 case X86::DEC32r: {
1528 assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1529 unsigned Opc = MIOpc == X86::DEC64r
1530 ? X86::LEA64r
1531 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1532
1533 bool isKill;
1534 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1535 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1536 ImplicitOp, LV, LIS))
1537 return nullptr;
1538
1539 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1540 .add(Dest)
1541 .addReg(SrcReg, getKillRegState(isKill));
1542 if (ImplicitOp.getReg() != 0)
1543 MIB.add(ImplicitOp);
1544
1545 NewMI = addOffset(MIB, -1);
1546
1547 // Add kills if classifyLEAReg created a new register.
1548 if (LV && SrcReg != Src.getReg())
1549 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1550 break;
1551 }
1552 case X86::DEC8r:
1553 case X86::INC8r:
1554 Is8BitOp = true;
1555 [[fallthrough]];
1556 case X86::DEC16r:
1557 case X86::INC16r:
1558 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1559 case X86::ADD64rr:
1560 case X86::ADD64rr_DB:
1561 case X86::ADD32rr:
1562 case X86::ADD32rr_DB: {
1563 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1564 unsigned Opc;
1565 if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1566 Opc = X86::LEA64r;
1567 else
1568 Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1569
1570 const MachineOperand &Src2 = MI.getOperand(2);
1571 bool isKill2;
1572 MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1573 if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2,
1574 ImplicitOp2, LV, LIS))
1575 return nullptr;
1576
1577 bool isKill;
1578 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1579 if (Src.getReg() == Src2.getReg()) {
1580 // Don't call classify LEAReg a second time on the same register, in case
1581 // the first call inserted a COPY from Src2 and marked it as killed.
1582 isKill = isKill2;
1583 SrcReg = SrcReg2;
1584 } else {
1585 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1586 ImplicitOp, LV, LIS))
1587 return nullptr;
1588 }
1589
1590 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1591 if (ImplicitOp.getReg() != 0)
1592 MIB.add(ImplicitOp);
1593 if (ImplicitOp2.getReg() != 0)
1594 MIB.add(ImplicitOp2);
1595
1596 NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1597
1598 // Add kills if classifyLEAReg created a new register.
1599 if (LV) {
1600 if (SrcReg2 != Src2.getReg())
1601 LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
1602 if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
1603 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1604 }
1605 NumRegOperands = 3;
1606 break;
1607 }
1608 case X86::ADD8rr:
1609 case X86::ADD8rr_DB:
1610 Is8BitOp = true;
1611 [[fallthrough]];
1612 case X86::ADD16rr:
1613 case X86::ADD16rr_DB:
1614 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1615 case X86::ADD64ri32:
1616 case X86::ADD64ri32_DB:
1617 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1618 NewMI = addOffset(
1619 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1620 MI.getOperand(2));
1621 break;
1622 case X86::ADD32ri:
1623 case X86::ADD32ri_DB: {
1624 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1625 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1626
1627 bool isKill;
1628 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1629 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1630 ImplicitOp, LV, LIS))
1631 return nullptr;
1632
1633 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1634 .add(Dest)
1635 .addReg(SrcReg, getKillRegState(isKill));
1636 if (ImplicitOp.getReg() != 0)
1637 MIB.add(ImplicitOp);
1638
1639 NewMI = addOffset(MIB, MI.getOperand(2));
1640
1641 // Add kills if classifyLEAReg created a new register.
1642 if (LV && SrcReg != Src.getReg())
1643 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1644 break;
1645 }
1646 case X86::ADD8ri:
1647 case X86::ADD8ri_DB:
1648 Is8BitOp = true;
1649 [[fallthrough]];
1650 case X86::ADD16ri:
1651 case X86::ADD16ri_DB:
1652 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1653 case X86::SUB8ri:
1654 case X86::SUB16ri:
1655 /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1656 return nullptr;
1657 case X86::SUB32ri: {
1658 if (!MI.getOperand(2).isImm())
1659 return nullptr;
1660 int64_t Imm = MI.getOperand(2).getImm();
1661 if (!isInt<32>(-Imm))
1662 return nullptr;
1663
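 // The SUB is rewritten below as an LEA with displacement -Imm, so -Imm must
 // itself fit in a signed 32-bit displacement.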
1664 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1665 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1666
1667 bool isKill;
1668 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1669 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1670 ImplicitOp, LV, LIS))
1671 return nullptr;
1672
1673 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1674 .add(Dest)
1675 .addReg(SrcReg, getKillRegState(isKill));
1676 if (ImplicitOp.getReg() != 0)
1677 MIB.add(ImplicitOp);
1678
1679 NewMI = addOffset(MIB, -Imm);
1680
1681 // Add kills if classifyLEAReg created a new register.
1682 if (LV && SrcReg != Src.getReg())
1683 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1684 break;
1685 }
1686
1687 case X86::SUB64ri32: {
1688 if (!MI.getOperand(2).isImm())
1689 return nullptr;
1690 int64_t Imm = MI.getOperand(2).getImm();
1691 if (!isInt<32>(-Imm))
1692 return nullptr;
1693
1694 assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1695
1696 MachineInstrBuilder MIB =
1697 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
1698 NewMI = addOffset(MIB, -Imm);
1699 break;
1700 }
1701
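 // The masked-move forms below tie the destination to the passthru operand,
 // which makes them two-address. Rewriting them as the corresponding BLENDM
 // instruction removes that tie and yields a true three-address form.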
1702 case X86::VMOVDQU8Z128rmk:
1703 case X86::VMOVDQU8Z256rmk:
1704 case X86::VMOVDQU8Zrmk:
1705 case X86::VMOVDQU16Z128rmk:
1706 case X86::VMOVDQU16Z256rmk:
1707 case X86::VMOVDQU16Zrmk:
1708 case X86::VMOVDQU32Z128rmk:
1709 case X86::VMOVDQA32Z128rmk:
1710 case X86::VMOVDQU32Z256rmk:
1711 case X86::VMOVDQA32Z256rmk:
1712 case X86::VMOVDQU32Zrmk:
1713 case X86::VMOVDQA32Zrmk:
1714 case X86::VMOVDQU64Z128rmk:
1715 case X86::VMOVDQA64Z128rmk:
1716 case X86::VMOVDQU64Z256rmk:
1717 case X86::VMOVDQA64Z256rmk:
1718 case X86::VMOVDQU64Zrmk:
1719 case X86::VMOVDQA64Zrmk:
1720 case X86::VMOVUPDZ128rmk:
1721 case X86::VMOVAPDZ128rmk:
1722 case X86::VMOVUPDZ256rmk:
1723 case X86::VMOVAPDZ256rmk:
1724 case X86::VMOVUPDZrmk:
1725 case X86::VMOVAPDZrmk:
1726 case X86::VMOVUPSZ128rmk:
1727 case X86::VMOVAPSZ128rmk:
1728 case X86::VMOVUPSZ256rmk:
1729 case X86::VMOVAPSZ256rmk:
1730 case X86::VMOVUPSZrmk:
1731 case X86::VMOVAPSZrmk:
1732 case X86::VBROADCASTSDZ256rmk:
1733 case X86::VBROADCASTSDZrmk:
1734 case X86::VBROADCASTSSZ128rmk:
1735 case X86::VBROADCASTSSZ256rmk:
1736 case X86::VBROADCASTSSZrmk:
1737 case X86::VPBROADCASTDZ128rmk:
1738 case X86::VPBROADCASTDZ256rmk:
1739 case X86::VPBROADCASTDZrmk:
1740 case X86::VPBROADCASTQZ128rmk:
1741 case X86::VPBROADCASTQZ256rmk:
1742 case X86::VPBROADCASTQZrmk: {
1743 unsigned Opc;
1744 switch (MIOpc) {
1745 default:
1746 llvm_unreachable("Unreachable!");
1747 case X86::VMOVDQU8Z128rmk:
1748 Opc = X86::VPBLENDMBZ128rmk;
1749 break;
1750 case X86::VMOVDQU8Z256rmk:
1751 Opc = X86::VPBLENDMBZ256rmk;
1752 break;
1753 case X86::VMOVDQU8Zrmk:
1754 Opc = X86::VPBLENDMBZrmk;
1755 break;
1756 case X86::VMOVDQU16Z128rmk:
1757 Opc = X86::VPBLENDMWZ128rmk;
1758 break;
1759 case X86::VMOVDQU16Z256rmk:
1760 Opc = X86::VPBLENDMWZ256rmk;
1761 break;
1762 case X86::VMOVDQU16Zrmk:
1763 Opc = X86::VPBLENDMWZrmk;
1764 break;
1765 case X86::VMOVDQU32Z128rmk:
1766 Opc = X86::VPBLENDMDZ128rmk;
1767 break;
1768 case X86::VMOVDQU32Z256rmk:
1769 Opc = X86::VPBLENDMDZ256rmk;
1770 break;
1771 case X86::VMOVDQU32Zrmk:
1772 Opc = X86::VPBLENDMDZrmk;
1773 break;
1774 case X86::VMOVDQU64Z128rmk:
1775 Opc = X86::VPBLENDMQZ128rmk;
1776 break;
1777 case X86::VMOVDQU64Z256rmk:
1778 Opc = X86::VPBLENDMQZ256rmk;
1779 break;
1780 case X86::VMOVDQU64Zrmk:
1781 Opc = X86::VPBLENDMQZrmk;
1782 break;
1783 case X86::VMOVUPDZ128rmk:
1784 Opc = X86::VBLENDMPDZ128rmk;
1785 break;
1786 case X86::VMOVUPDZ256rmk:
1787 Opc = X86::VBLENDMPDZ256rmk;
1788 break;
1789 case X86::VMOVUPDZrmk:
1790 Opc = X86::VBLENDMPDZrmk;
1791 break;
1792 case X86::VMOVUPSZ128rmk:
1793 Opc = X86::VBLENDMPSZ128rmk;
1794 break;
1795 case X86::VMOVUPSZ256rmk:
1796 Opc = X86::VBLENDMPSZ256rmk;
1797 break;
1798 case X86::VMOVUPSZrmk:
1799 Opc = X86::VBLENDMPSZrmk;
1800 break;
1801 case X86::VMOVDQA32Z128rmk:
1802 Opc = X86::VPBLENDMDZ128rmk;
1803 break;
1804 case X86::VMOVDQA32Z256rmk:
1805 Opc = X86::VPBLENDMDZ256rmk;
1806 break;
1807 case X86::VMOVDQA32Zrmk:
1808 Opc = X86::VPBLENDMDZrmk;
1809 break;
1810 case X86::VMOVDQA64Z128rmk:
1811 Opc = X86::VPBLENDMQZ128rmk;
1812 break;
1813 case X86::VMOVDQA64Z256rmk:
1814 Opc = X86::VPBLENDMQZ256rmk;
1815 break;
1816 case X86::VMOVDQA64Zrmk:
1817 Opc = X86::VPBLENDMQZrmk;
1818 break;
1819 case X86::VMOVAPDZ128rmk:
1820 Opc = X86::VBLENDMPDZ128rmk;
1821 break;
1822 case X86::VMOVAPDZ256rmk:
1823 Opc = X86::VBLENDMPDZ256rmk;
1824 break;
1825 case X86::VMOVAPDZrmk:
1826 Opc = X86::VBLENDMPDZrmk;
1827 break;
1828 case X86::VMOVAPSZ128rmk:
1829 Opc = X86::VBLENDMPSZ128rmk;
1830 break;
1831 case X86::VMOVAPSZ256rmk:
1832 Opc = X86::VBLENDMPSZ256rmk;
1833 break;
1834 case X86::VMOVAPSZrmk:
1835 Opc = X86::VBLENDMPSZrmk;
1836 break;
1837 case X86::VBROADCASTSDZ256rmk:
1838 Opc = X86::VBLENDMPDZ256rmbk;
1839 break;
1840 case X86::VBROADCASTSDZrmk:
1841 Opc = X86::VBLENDMPDZrmbk;
1842 break;
1843 case X86::VBROADCASTSSZ128rmk:
1844 Opc = X86::VBLENDMPSZ128rmbk;
1845 break;
1846 case X86::VBROADCASTSSZ256rmk:
1847 Opc = X86::VBLENDMPSZ256rmbk;
1848 break;
1849 case X86::VBROADCASTSSZrmk:
1850 Opc = X86::VBLENDMPSZrmbk;
1851 break;
1852 case X86::VPBROADCASTDZ128rmk:
1853 Opc = X86::VPBLENDMDZ128rmbk;
1854 break;
1855 case X86::VPBROADCASTDZ256rmk:
1856 Opc = X86::VPBLENDMDZ256rmbk;
1857 break;
1858 case X86::VPBROADCASTDZrmk:
1859 Opc = X86::VPBLENDMDZrmbk;
1860 break;
1861 case X86::VPBROADCASTQZ128rmk:
1862 Opc = X86::VPBLENDMQZ128rmbk;
1863 break;
1864 case X86::VPBROADCASTQZ256rmk:
1865 Opc = X86::VPBLENDMQZ256rmbk;
1866 break;
1867 case X86::VPBROADCASTQZrmk:
1868 Opc = X86::VPBLENDMQZrmbk;
1869 break;
1870 }
1871
1872 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1873 .add(Dest)
1874 .add(MI.getOperand(2))
1875 .add(Src)
1876 .add(MI.getOperand(3))
1877 .add(MI.getOperand(4))
1878 .add(MI.getOperand(5))
1879 .add(MI.getOperand(6))
1880 .add(MI.getOperand(7));
1881 NumRegOperands = 4;
1882 break;
1883 }
1884
1885 case X86::VMOVDQU8Z128rrk:
1886 case X86::VMOVDQU8Z256rrk:
1887 case X86::VMOVDQU8Zrrk:
1888 case X86::VMOVDQU16Z128rrk:
1889 case X86::VMOVDQU16Z256rrk:
1890 case X86::VMOVDQU16Zrrk:
1891 case X86::VMOVDQU32Z128rrk:
1892 case X86::VMOVDQA32Z128rrk:
1893 case X86::VMOVDQU32Z256rrk:
1894 case X86::VMOVDQA32Z256rrk:
1895 case X86::VMOVDQU32Zrrk:
1896 case X86::VMOVDQA32Zrrk:
1897 case X86::VMOVDQU64Z128rrk:
1898 case X86::VMOVDQA64Z128rrk:
1899 case X86::VMOVDQU64Z256rrk:
1900 case X86::VMOVDQA64Z256rrk:
1901 case X86::VMOVDQU64Zrrk:
1902 case X86::VMOVDQA64Zrrk:
1903 case X86::VMOVUPDZ128rrk:
1904 case X86::VMOVAPDZ128rrk:
1905 case X86::VMOVUPDZ256rrk:
1906 case X86::VMOVAPDZ256rrk:
1907 case X86::VMOVUPDZrrk:
1908 case X86::VMOVAPDZrrk:
1909 case X86::VMOVUPSZ128rrk:
1910 case X86::VMOVAPSZ128rrk:
1911 case X86::VMOVUPSZ256rrk:
1912 case X86::VMOVAPSZ256rrk:
1913 case X86::VMOVUPSZrrk:
1914 case X86::VMOVAPSZrrk: {
1915 unsigned Opc;
1916 switch (MIOpc) {
1917 default:
1918 llvm_unreachable("Unreachable!");
1919 case X86::VMOVDQU8Z128rrk:
1920 Opc = X86::VPBLENDMBZ128rrk;
1921 break;
1922 case X86::VMOVDQU8Z256rrk:
1923 Opc = X86::VPBLENDMBZ256rrk;
1924 break;
1925 case X86::VMOVDQU8Zrrk:
1926 Opc = X86::VPBLENDMBZrrk;
1927 break;
1928 case X86::VMOVDQU16Z128rrk:
1929 Opc = X86::VPBLENDMWZ128rrk;
1930 break;
1931 case X86::VMOVDQU16Z256rrk:
1932 Opc = X86::VPBLENDMWZ256rrk;
1933 break;
1934 case X86::VMOVDQU16Zrrk:
1935 Opc = X86::VPBLENDMWZrrk;
1936 break;
1937 case X86::VMOVDQU32Z128rrk:
1938 Opc = X86::VPBLENDMDZ128rrk;
1939 break;
1940 case X86::VMOVDQU32Z256rrk:
1941 Opc = X86::VPBLENDMDZ256rrk;
1942 break;
1943 case X86::VMOVDQU32Zrrk:
1944 Opc = X86::VPBLENDMDZrrk;
1945 break;
1946 case X86::VMOVDQU64Z128rrk:
1947 Opc = X86::VPBLENDMQZ128rrk;
1948 break;
1949 case X86::VMOVDQU64Z256rrk:
1950 Opc = X86::VPBLENDMQZ256rrk;
1951 break;
1952 case X86::VMOVDQU64Zrrk:
1953 Opc = X86::VPBLENDMQZrrk;
1954 break;
1955 case X86::VMOVUPDZ128rrk:
1956 Opc = X86::VBLENDMPDZ128rrk;
1957 break;
1958 case X86::VMOVUPDZ256rrk:
1959 Opc = X86::VBLENDMPDZ256rrk;
1960 break;
1961 case X86::VMOVUPDZrrk:
1962 Opc = X86::VBLENDMPDZrrk;
1963 break;
1964 case X86::VMOVUPSZ128rrk:
1965 Opc = X86::VBLENDMPSZ128rrk;
1966 break;
1967 case X86::VMOVUPSZ256rrk:
1968 Opc = X86::VBLENDMPSZ256rrk;
1969 break;
1970 case X86::VMOVUPSZrrk:
1971 Opc = X86::VBLENDMPSZrrk;
1972 break;
1973 case X86::VMOVDQA32Z128rrk:
1974 Opc = X86::VPBLENDMDZ128rrk;
1975 break;
1976 case X86::VMOVDQA32Z256rrk:
1977 Opc = X86::VPBLENDMDZ256rrk;
1978 break;
1979 case X86::VMOVDQA32Zrrk:
1980 Opc = X86::VPBLENDMDZrrk;
1981 break;
1982 case X86::VMOVDQA64Z128rrk:
1983 Opc = X86::VPBLENDMQZ128rrk;
1984 break;
1985 case X86::VMOVDQA64Z256rrk:
1986 Opc = X86::VPBLENDMQZ256rrk;
1987 break;
1988 case X86::VMOVDQA64Zrrk:
1989 Opc = X86::VPBLENDMQZrrk;
1990 break;
1991 case X86::VMOVAPDZ128rrk:
1992 Opc = X86::VBLENDMPDZ128rrk;
1993 break;
1994 case X86::VMOVAPDZ256rrk:
1995 Opc = X86::VBLENDMPDZ256rrk;
1996 break;
1997 case X86::VMOVAPDZrrk:
1998 Opc = X86::VBLENDMPDZrrk;
1999 break;
2000 case X86::VMOVAPSZ128rrk:
2001 Opc = X86::VBLENDMPSZ128rrk;
2002 break;
2003 case X86::VMOVAPSZ256rrk:
2004 Opc = X86::VBLENDMPSZ256rrk;
2005 break;
2006 case X86::VMOVAPSZrrk:
2007 Opc = X86::VBLENDMPSZrrk;
2008 break;
2009 }
2010
2011 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
2012 .add(Dest)
2013 .add(MI.getOperand(2))
2014 .add(Src)
2015 .add(MI.getOperand(3));
2016 NumRegOperands = 4;
2017 break;
2018 }
2019 }
2020
2021 if (!NewMI)
2022 return nullptr;
2023
2024 if (LV) { // Update live variables
2025 for (unsigned I = 0; I < NumRegOperands; ++I) {
2026 MachineOperand &Op = MI.getOperand(I);
2027 if (Op.isReg() && (Op.isDead() || Op.isKill()))
2028 LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
2029 }
2030 }
2031
2032 MachineBasicBlock &MBB = *MI.getParent();
2033 MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
2034
2035 if (LIS) {
2036 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
2037 if (SrcReg)
2038 LIS->getInterval(SrcReg);
2039 if (SrcReg2)
2040 LIS->getInterval(SrcReg2);
2041 }
2042
2043 return NewMI;
2044}
2045
2046 /// This determines which of the three possible cases of a three-source commute
2047 /// the source indexes correspond to, taking into account any mask operands.
2048 /// None of the cases allows commuting a passthru operand; an unknown pairing
2049 /// of source indexes is treated as unreachable.
2050/// Case 0 - Possible to commute the first and second operands.
2051/// Case 1 - Possible to commute the first and third operands.
2052/// Case 2 - Possible to commute the second and third operands.
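/// For example, a merge-masked FMA such as VFMADD213PSZrk has operands
/// (dst, src1, kmask, src2, src3); commuting source indexes 3 and 4 then
/// corresponds to Op2/Op3 below and is Case 2.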
2053static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
2054 unsigned SrcOpIdx2) {
2055 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
2056 if (SrcOpIdx1 > SrcOpIdx2)
2057 std::swap(SrcOpIdx1, SrcOpIdx2);
2058
2059 unsigned Op1 = 1, Op2 = 2, Op3 = 3;
2060 if (X86II::isKMasked(TSFlags)) {
2061 Op2++;
2062 Op3++;
2063 }
2064
2065 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
2066 return 0;
2067 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
2068 return 1;
2069 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
2070 return 2;
2071 llvm_unreachable("Unknown three src commute case.");
2072}
2073
2074 unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
2075 const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
2076 const X86InstrFMA3Group &FMA3Group) const {
2077
2078 unsigned Opc = MI.getOpcode();
2079
2080 // TODO: Commuting the 1st operand of FMA*_Int requires some additional
2081 // analysis. The commute optimization is legal only if all users of FMA*_Int
2082 // use only the lowest element of the FMA*_Int instruction. Such an analysis
2083 // is not implemented yet, so commuting operand 1 is simply rejected here
2084 // (see the assert below). When such an analysis becomes available, this will
2085 // be the right place to call it.
2086 assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
2087 "Intrinsic instructions can't commute operand 1");
2088
2089 // Determine which case this commute is or if it can't be done.
2090 unsigned Case =
2091 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2092 assert(Case < 3 && "Unexpected case number!");
2093
2094 // Define the FMA forms mapping array that helps to map input FMA form
2095 // to output FMA form to preserve the operation semantics after
2096 // commuting the operands.
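// For example, a 132-form FMA computes dst = dst * src3 + src2, so once the
// last two sources are exchanged (Case 2) the same value is produced by the
// 213 form (dst = src2 * dst + src3); the table below encodes all nine such
// form transitions.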
2097 const unsigned Form132Index = 0;
2098 const unsigned Form213Index = 1;
2099 const unsigned Form231Index = 2;
2100 static const unsigned FormMapping[][3] = {
2101 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
2102 // FMA132 A, C, b; ==> FMA231 C, A, b;
2103 // FMA213 B, A, c; ==> FMA213 A, B, c;
2104 // FMA231 C, A, b; ==> FMA132 A, C, b;
2105 {Form231Index, Form213Index, Form132Index},
2106 // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
2107 // FMA132 A, c, B; ==> FMA132 B, c, A;
2108 // FMA213 B, a, C; ==> FMA231 C, a, B;
2109 // FMA231 C, a, B; ==> FMA213 B, a, C;
2110 {Form132Index, Form231Index, Form213Index},
2111 // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
2112 // FMA132 a, C, B; ==> FMA213 a, B, C;
2113 // FMA213 b, A, C; ==> FMA132 b, C, A;
2114 // FMA231 c, A, B; ==> FMA231 c, B, A;
2115 {Form213Index, Form132Index, Form231Index}};
2116
2117 unsigned FMAForms[3];
2118 FMAForms[0] = FMA3Group.get132Opcode();
2119 FMAForms[1] = FMA3Group.get213Opcode();
2120 FMAForms[2] = FMA3Group.get231Opcode();
2121
2122 // Everything is ready, just adjust the FMA opcode and return it.
2123 for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
2124 if (Opc == FMAForms[FormIndex])
2125 return FMAForms[FormMapping[Case][FormIndex]];
2126
2127 llvm_unreachable("Illegal FMA3 format");
2128}
2129
2130static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
2131 unsigned SrcOpIdx2) {
2132 // Determine which case this commute is or if it can't be done.
2133 unsigned Case =
2134 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2135 assert(Case < 3 && "Unexpected case value!");
2136
2137 // For each case we need to swap two pairs of bits in the final immediate.
2138 static const uint8_t SwapMasks[3][4] = {
2139 {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
2140 {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
2141 {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
2142 };
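// For example, Imm = 0xCA computes Op1 ? Op2 : Op3. Commuting the second and
// third operands (Case 2) exchanges bits 1/2 and 5/6, giving 0xAC, which
// computes Op1 ? Op3 : Op2 and therefore preserves the result on the swapped
// operands.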
2143
2144 uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2145 // Clear out the bits we are swapping.
2146 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
2147 SwapMasks[Case][2] | SwapMasks[Case][3]);
2148 // If the immediate had a bit of the pair set, then set the opposite bit.
2149 if (Imm & SwapMasks[Case][0])
2150 NewImm |= SwapMasks[Case][1];
2151 if (Imm & SwapMasks[Case][1])
2152 NewImm |= SwapMasks[Case][0];
2153 if (Imm & SwapMasks[Case][2])
2154 NewImm |= SwapMasks[Case][3];
2155 if (Imm & SwapMasks[Case][3])
2156 NewImm |= SwapMasks[Case][2];
2157 MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
2158}
2159
2160// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
2161// commuted.
2162static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
2163#define VPERM_CASES(Suffix) \
2164 case X86::VPERMI2##Suffix##Z128rr: \
2165 case X86::VPERMT2##Suffix##Z128rr: \
2166 case X86::VPERMI2##Suffix##Z256rr: \
2167 case X86::VPERMT2##Suffix##Z256rr: \
2168 case X86::VPERMI2##Suffix##Zrr: \
2169 case X86::VPERMT2##Suffix##Zrr: \
2170 case X86::VPERMI2##Suffix##Z128rm: \
2171 case X86::VPERMT2##Suffix##Z128rm: \
2172 case X86::VPERMI2##Suffix##Z256rm: \
2173 case X86::VPERMT2##Suffix##Z256rm: \
2174 case X86::VPERMI2##Suffix##Zrm: \
2175 case X86::VPERMT2##Suffix##Zrm: \
2176 case X86::VPERMI2##Suffix##Z128rrkz: \
2177 case X86::VPERMT2##Suffix##Z128rrkz: \
2178 case X86::VPERMI2##Suffix##Z256rrkz: \
2179 case X86::VPERMT2##Suffix##Z256rrkz: \
2180 case X86::VPERMI2##Suffix##Zrrkz: \
2181 case X86::VPERMT2##Suffix##Zrrkz: \
2182 case X86::VPERMI2##Suffix##Z128rmkz: \
2183 case X86::VPERMT2##Suffix##Z128rmkz: \
2184 case X86::VPERMI2##Suffix##Z256rmkz: \
2185 case X86::VPERMT2##Suffix##Z256rmkz: \
2186 case X86::VPERMI2##Suffix##Zrmkz: \
2187 case X86::VPERMT2##Suffix##Zrmkz:
2188
2189#define VPERM_CASES_BROADCAST(Suffix) \
2190 VPERM_CASES(Suffix) \
2191 case X86::VPERMI2##Suffix##Z128rmb: \
2192 case X86::VPERMT2##Suffix##Z128rmb: \
2193 case X86::VPERMI2##Suffix##Z256rmb: \
2194 case X86::VPERMT2##Suffix##Z256rmb: \
2195 case X86::VPERMI2##Suffix##Zrmb: \
2196 case X86::VPERMT2##Suffix##Zrmb: \
2197 case X86::VPERMI2##Suffix##Z128rmbkz: \
2198 case X86::VPERMT2##Suffix##Z128rmbkz: \
2199 case X86::VPERMI2##Suffix##Z256rmbkz: \
2200 case X86::VPERMT2##Suffix##Z256rmbkz: \
2201 case X86::VPERMI2##Suffix##Zrmbkz: \
2202 case X86::VPERMT2##Suffix##Zrmbkz:
2203
2204 switch (Opcode) {
2205 default:
2206 return false;
2207 VPERM_CASES(B)
2208 VPERM_CASES_BROADCAST(D)
2209 VPERM_CASES_BROADCAST(PD)
2210 VPERM_CASES_BROADCAST(PS)
2211 VPERM_CASES_BROADCAST(Q)
2212 VPERM_CASES(W)
2213 return true;
2214 }
2215#undef VPERM_CASES_BROADCAST
2216#undef VPERM_CASES
2217}
2218
2219// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
2220// from the I opcode to the T opcode and vice versa.
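// This works because VPERMI2 keeps the index vector in the tied first source
// and the two data tables in the remaining sources, while VPERMT2 keeps the
// first table in the tied source and the index vector in the second source;
// swapping the first two sources therefore just toggles between the forms.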
2221static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
2222#define VPERM_CASES(Orig, New) \
2223 case X86::Orig##Z128rr: \
2224 return X86::New##Z128rr; \
2225 case X86::Orig##Z128rrkz: \
2226 return X86::New##Z128rrkz; \
2227 case X86::Orig##Z128rm: \
2228 return X86::New##Z128rm; \
2229 case X86::Orig##Z128rmkz: \
2230 return X86::New##Z128rmkz; \
2231 case X86::Orig##Z256rr: \
2232 return X86::New##Z256rr; \
2233 case X86::Orig##Z256rrkz: \
2234 return X86::New##Z256rrkz; \
2235 case X86::Orig##Z256rm: \
2236 return X86::New##Z256rm; \
2237 case X86::Orig##Z256rmkz: \
2238 return X86::New##Z256rmkz; \
2239 case X86::Orig##Zrr: \
2240 return X86::New##Zrr; \
2241 case X86::Orig##Zrrkz: \
2242 return X86::New##Zrrkz; \
2243 case X86::Orig##Zrm: \
2244 return X86::New##Zrm; \
2245 case X86::Orig##Zrmkz: \
2246 return X86::New##Zrmkz;
2247
2248#define VPERM_CASES_BROADCAST(Orig, New) \
2249 VPERM_CASES(Orig, New) \
2250 case X86::Orig##Z128rmb: \
2251 return X86::New##Z128rmb; \
2252 case X86::Orig##Z128rmbkz: \
2253 return X86::New##Z128rmbkz; \
2254 case X86::Orig##Z256rmb: \
2255 return X86::New##Z256rmb; \
2256 case X86::Orig##Z256rmbkz: \
2257 return X86::New##Z256rmbkz; \
2258 case X86::Orig##Zrmb: \
2259 return X86::New##Zrmb; \
2260 case X86::Orig##Zrmbkz: \
2261 return X86::New##Zrmbkz;
2262
2263 switch (Opcode) {
2264 VPERM_CASES(VPERMI2B, VPERMT2B)
2265 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2266 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2267 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2268 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2269 VPERM_CASES(VPERMI2W, VPERMT2W)
2270 VPERM_CASES(VPERMT2B, VPERMI2B)
2271 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2272 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2273 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2274 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2275 VPERM_CASES(VPERMT2W, VPERMI2W)
2276 }
2277
2278 llvm_unreachable("Unreachable!");
2279#undef VPERM_CASES_BROADCAST
2280#undef VPERM_CASES
2281}
2282
2283 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2284 unsigned OpIdx1,
2285 unsigned OpIdx2) const {
2286 auto CloneIfNew = [&](MachineInstr &MI) {
2287 return std::exchange(NewMI, false)
2288 ? MI.getParent()->getParent()->CloneMachineInstr(&MI)
2289 : &MI;
2290 };
2291 MachineInstr *WorkingMI = nullptr;
2292 unsigned Opc = MI.getOpcode();
2293
2294#define CASE_ND(OP) \
2295 case X86::OP: \
2296 case X86::OP##_ND:
2297
2298 switch (Opc) {
2299 // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
2300 CASE_ND(SHRD16rri8)
2301 CASE_ND(SHLD16rri8)
2302 CASE_ND(SHRD32rri8)
2303 CASE_ND(SHLD32rri8)
2304 CASE_ND(SHRD64rri8)
2305 CASE_ND(SHLD64rri8) {
2306 unsigned Size;
2307 switch (Opc) {
2308 default:
2309 llvm_unreachable("Unreachable!");
2310#define FROM_TO_SIZE(A, B, S) \
2311 case X86::A: \
2312 Opc = X86::B; \
2313 Size = S; \
2314 break; \
2315 case X86::A##_ND: \
2316 Opc = X86::B##_ND; \
2317 Size = S; \
2318 break; \
2319 case X86::B: \
2320 Opc = X86::A; \
2321 Size = S; \
2322 break; \
2323 case X86::B##_ND: \
2324 Opc = X86::A##_ND; \
2325 Size = S; \
2326 break;
2327
2328 FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
2329 FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
2330 FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
2331#undef FROM_TO_SIZE
2332 }
2333 WorkingMI = CloneIfNew(MI);
2334 WorkingMI->setDesc(get(Opc));
2335 WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
2336 break;
2337 }
2338 case X86::PFSUBrr:
2339 case X86::PFSUBRrr:
2340 // PFSUB x, y: x = x - y
2341 // PFSUBR x, y: x = y - x
2342 WorkingMI = CloneIfNew(MI);
2343 WorkingMI->setDesc(
2344 get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr));
2345 break;
2346 case X86::BLENDPDrri:
2347 case X86::BLENDPSrri:
2348 case X86::VBLENDPDrri:
2349 case X86::VBLENDPSrri:
2350 // If we're optimizing for size, try to use MOVSD/MOVSS.
2351 if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
2352 unsigned Mask = (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 0x03 : 0x0F;
2353 if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
2354#define FROM_TO(FROM, TO) \
2355 case X86::FROM: \
2356 Opc = X86::TO; \
2357 break;
2358 switch (Opc) {
2359 default:
2360 llvm_unreachable("Unreachable!");
2361 FROM_TO(BLENDPDrri, MOVSDrr)
2362 FROM_TO(BLENDPSrri, MOVSSrr)
2363 FROM_TO(VBLENDPDrri, VMOVSDrr)
2364 FROM_TO(VBLENDPSrri, VMOVSSrr)
2365 }
2366 WorkingMI = CloneIfNew(MI);
2367 WorkingMI->setDesc(get(Opc));
2368 WorkingMI->removeOperand(3);
2369 break;
2370 }
2371#undef FROM_TO
2372 }
2373 [[fallthrough]];
2374 case X86::PBLENDWrri:
2375 case X86::VBLENDPDYrri:
2376 case X86::VBLENDPSYrri:
2377 case X86::VPBLENDDrri:
2378 case X86::VPBLENDWrri:
2379 case X86::VPBLENDDYrri:
2380 case X86::VPBLENDWYrri: {
2381 int8_t Mask;
2382 switch (Opc) {
2383 default:
2384 llvm_unreachable("Unreachable!");
2385 case X86::BLENDPDrri:
2386 Mask = (int8_t)0x03;
2387 break;
2388 case X86::BLENDPSrri:
2389 Mask = (int8_t)0x0F;
2390 break;
2391 case X86::PBLENDWrri:
2392 Mask = (int8_t)0xFF;
2393 break;
2394 case X86::VBLENDPDrri:
2395 Mask = (int8_t)0x03;
2396 break;
2397 case X86::VBLENDPSrri:
2398 Mask = (int8_t)0x0F;
2399 break;
2400 case X86::VBLENDPDYrri:
2401 Mask = (int8_t)0x0F;
2402 break;
2403 case X86::VBLENDPSYrri:
2404 Mask = (int8_t)0xFF;
2405 break;
2406 case X86::VPBLENDDrri:
2407 Mask = (int8_t)0x0F;
2408 break;
2409 case X86::VPBLENDWrri:
2410 Mask = (int8_t)0xFF;
2411 break;
2412 case X86::VPBLENDDYrri:
2413 Mask = (int8_t)0xFF;
2414 break;
2415 case X86::VPBLENDWYrri:
2416 Mask = (int8_t)0xFF;
2417 break;
2418 }
2419 // Only the least significant bits of Imm are used.
2420 // Using int8_t to ensure it will be sign extended to the int64_t that
2421 // setImm takes in order to match isel behavior.
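// Commuting the two sources inverts the per-element selection, so the new
// immediate is simply Imm ^ Mask (e.g. a BLENDPS immediate of 0b0101 becomes
// 0b1010).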
2422 int8_t Imm = MI.getOperand(3).getImm() & Mask;
2423 WorkingMI = CloneIfNew(MI);
2424 WorkingMI->getOperand(3).setImm(Mask ^ Imm);
2425 break;
2426 }
2427 case X86::INSERTPSrr:
2428 case X86::VINSERTPSrr:
2429 case X86::VINSERTPSZrr: {
2430 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2431 unsigned ZMask = Imm & 15;
2432 unsigned DstIdx = (Imm >> 4) & 3;
2433 unsigned SrcIdx = (Imm >> 6) & 3;
2434
2435 // We can commute insertps if we zero two of the elements, the insertion is
2436 // "inline", and we don't overwrite the inserted element with a zero.
2437 if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2438 llvm::popcount(ZMask) == 2) {
2439 unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
2440 assert(AltIdx < 4 && "Illegal insertion index");
2441 unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2442 WorkingMI = CloneIfNew(MI);
2443 WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2444 break;
2445 }
2446 return nullptr;
2447 }
2448 case X86::MOVSDrr:
2449 case X86::MOVSSrr:
2450 case X86::VMOVSDrr:
2451 case X86::VMOVSSrr: {
2452 // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
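// MOVSS takes element 0 from the second source and elements 1-3 from the
// first, so with the sources swapped the equivalent blend takes element 0
// from its first source and the remaining elements from its second source,
// i.e. mask 0x0E (0x02 for the two-element MOVSD case).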
2453 if (Subtarget.hasSSE41()) {
2454 unsigned Mask;
2455 switch (Opc) {
2456 default:
2457 llvm_unreachable("Unreachable!");
2458 case X86::MOVSDrr:
2459 Opc = X86::BLENDPDrri;
2460 Mask = 0x02;
2461 break;
2462 case X86::MOVSSrr:
2463 Opc = X86::BLENDPSrri;
2464 Mask = 0x0E;
2465 break;
2466 case X86::VMOVSDrr:
2467 Opc = X86::VBLENDPDrri;
2468 Mask = 0x02;
2469 break;
2470 case X86::VMOVSSrr:
2471 Opc = X86::VBLENDPSrri;
2472 Mask = 0x0E;
2473 break;
2474 }
2475
2476 WorkingMI = CloneIfNew(MI);
2477 WorkingMI->setDesc(get(Opc));
2478 WorkingMI->addOperand(MachineOperand::CreateImm(Mask));
2479 break;
2480 }
2481
2482 WorkingMI = CloneIfNew(MI);
2483 WorkingMI->setDesc(get(X86::SHUFPDrri));
2484 WorkingMI->addOperand(MachineOperand::CreateImm(0x02));
2485 break;
2486 }
2487 case X86::SHUFPDrri: {
2488 // Commute to MOVSD.
2489 assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2490 WorkingMI = CloneIfNew(MI);
2491 WorkingMI->setDesc(get(X86::MOVSDrr));
2492 WorkingMI->removeOperand(3);
2493 break;
2494 }
2495 case X86::PCLMULQDQrri:
2496 case X86::VPCLMULQDQrri:
2497 case X86::VPCLMULQDQYrri:
2498 case X86::VPCLMULQDQZrri:
2499 case X86::VPCLMULQDQZ128rri:
2500 case X86::VPCLMULQDQZ256rri: {
2501 // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2502 // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
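// Swapping the sources therefore just exchanges the two selector bits, e.g.
// an immediate of 0x01 (high half of SRC1, low half of SRC2) becomes 0x10.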
2503 unsigned Imm = MI.getOperand(3).getImm();
2504 unsigned Src1Hi = Imm & 0x01;
2505 unsigned Src2Hi = Imm & 0x10;
2506 WorkingMI = CloneIfNew(MI);
2507 WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2508 break;
2509 }
2510 case X86::VPCMPBZ128rri:
2511 case X86::VPCMPUBZ128rri:
2512 case X86::VPCMPBZ256rri:
2513 case X86::VPCMPUBZ256rri:
2514 case X86::VPCMPBZrri:
2515 case X86::VPCMPUBZrri:
2516 case X86::VPCMPDZ128rri:
2517 case X86::VPCMPUDZ128rri:
2518 case X86::VPCMPDZ256rri:
2519 case X86::VPCMPUDZ256rri:
2520 case X86::VPCMPDZrri:
2521 case X86::VPCMPUDZrri:
2522 case X86::VPCMPQZ128rri:
2523 case X86::VPCMPUQZ128rri:
2524 case X86::VPCMPQZ256rri:
2525 case X86::VPCMPUQZ256rri:
2526 case X86::VPCMPQZrri:
2527 case X86::VPCMPUQZrri:
2528 case X86::VPCMPWZ128rri:
2529 case X86::VPCMPUWZ128rri:
2530 case X86::VPCMPWZ256rri:
2531 case X86::VPCMPUWZ256rri:
2532 case X86::VPCMPWZrri:
2533 case X86::VPCMPUWZrri:
2534 case X86::VPCMPBZ128rrik:
2535 case X86::VPCMPUBZ128rrik:
2536 case X86::VPCMPBZ256rrik:
2537 case X86::VPCMPUBZ256rrik:
2538 case X86::VPCMPBZrrik:
2539 case X86::VPCMPUBZrrik:
2540 case X86::VPCMPDZ128rrik:
2541 case X86::VPCMPUDZ128rrik:
2542 case X86::VPCMPDZ256rrik:
2543 case X86::VPCMPUDZ256rrik:
2544 case X86::VPCMPDZrrik:
2545 case X86::VPCMPUDZrrik:
2546 case X86::VPCMPQZ128rrik:
2547 case X86::VPCMPUQZ128rrik:
2548 case X86::VPCMPQZ256rrik:
2549 case X86::VPCMPUQZ256rrik:
2550 case X86::VPCMPQZrrik:
2551 case X86::VPCMPUQZrrik:
2552 case X86::VPCMPWZ128rrik:
2553 case X86::VPCMPUWZ128rrik:
2554 case X86::VPCMPWZ256rrik:
2555 case X86::VPCMPUWZ256rrik:
2556 case X86::VPCMPWZrrik:
2557 case X86::VPCMPUWZrrik:
2558 WorkingMI = CloneIfNew(MI);
2559 // Flip comparison mode immediate (if necessary).
2560 WorkingMI->getOperand(MI.getNumOperands() - 1)
2561 .setImm(X86::getSwappedVPCMPImm(
2562 MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7));
2563 break;
2564 case X86::VPCOMBri:
2565 case X86::VPCOMUBri:
2566 case X86::VPCOMDri:
2567 case X86::VPCOMUDri:
2568 case X86::VPCOMQri:
2569 case X86::VPCOMUQri:
2570 case X86::VPCOMWri:
2571 case X86::VPCOMUWri:
2572 WorkingMI = CloneIfNew(MI);
2573 // Flip comparison mode immediate (if necessary).
2574 WorkingMI->getOperand(3).setImm(
2575 X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7));
2576 break;
2577 case X86::VCMPSDZrri:
2578 case X86::VCMPSSZrri:
2579 case X86::VCMPPDZrri:
2580 case X86::VCMPPSZrri:
2581 case X86::VCMPSHZrri:
2582 case X86::VCMPPHZrri:
2583 case X86::VCMPPHZ128rri:
2584 case X86::VCMPPHZ256rri:
2585 case X86::VCMPPDZ128rri:
2586 case X86::VCMPPSZ128rri:
2587 case X86::VCMPPDZ256rri:
2588 case X86::VCMPPSZ256rri:
2589 case X86::VCMPPDZrrik:
2590 case X86::VCMPPSZrrik:
2591 case X86::VCMPPDZ128rrik:
2592 case X86::VCMPPSZ128rrik:
2593 case X86::VCMPPDZ256rrik:
2594 case X86::VCMPPSZ256rrik:
2595 WorkingMI = CloneIfNew(MI);
2596 WorkingMI->getOperand(MI.getNumExplicitOperands() - 1)
2597 .setImm(X86::getSwappedVCMPImm(
2598 MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f));
2599 break;
2600 case X86::VPERM2F128rr:
2601 case X86::VPERM2I128rr:
2602 // Flip permute source immediate.
2603 // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2604 // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2605 WorkingMI = CloneIfNew(MI);
2606 WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22);
2607 break;
2608 case X86::MOVHLPSrr:
2609 case X86::UNPCKHPDrr:
2610 case X86::VMOVHLPSrr:
2611 case X86::VUNPCKHPDrr:
2612 case X86::VMOVHLPSZrr:
2613 case X86::VUNPCKHPDZ128rr:
2614 assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2615
2616 switch (Opc) {
2617 default:
2618 llvm_unreachable("Unreachable!");
2619 case X86::MOVHLPSrr:
2620 Opc = X86::UNPCKHPDrr;
2621 break;
2622 case X86::UNPCKHPDrr:
2623 Opc = X86::MOVHLPSrr;
2624 break;
2625 case X86::VMOVHLPSrr:
2626 Opc = X86::VUNPCKHPDrr;
2627 break;
2628 case X86::VUNPCKHPDrr:
2629 Opc = X86::VMOVHLPSrr;
2630 break;
2631 case X86::VMOVHLPSZrr:
2632 Opc = X86::VUNPCKHPDZ128rr;
2633 break;
2634 case X86::VUNPCKHPDZ128rr:
2635 Opc = X86::VMOVHLPSZrr;
2636 break;
2637 }
2638 WorkingMI = CloneIfNew(MI);
2639 WorkingMI->setDesc(get(Opc));
2640 break;
2641 CASE_ND(CMOV16rr)
2642 CASE_ND(CMOV32rr)
2643 CASE_ND(CMOV64rr) {
2644 WorkingMI = CloneIfNew(MI);
2645 unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2646 X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2647 WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2648 break;
2649 }
2650 case X86::VPTERNLOGDZrri:
2651 case X86::VPTERNLOGDZrmi:
2652 case X86::VPTERNLOGDZ128rri:
2653 case X86::VPTERNLOGDZ128rmi:
2654 case X86::VPTERNLOGDZ256rri:
2655 case X86::VPTERNLOGDZ256rmi:
2656 case X86::VPTERNLOGQZrri:
2657 case X86::VPTERNLOGQZrmi:
2658 case X86::VPTERNLOGQZ128rri:
2659 case X86::VPTERNLOGQZ128rmi:
2660 case X86::VPTERNLOGQZ256rri:
2661 case X86::VPTERNLOGQZ256rmi:
2662 case X86::VPTERNLOGDZrrik:
2663 case X86::VPTERNLOGDZ128rrik:
2664 case X86::VPTERNLOGDZ256rrik:
2665 case X86::VPTERNLOGQZrrik:
2666 case X86::VPTERNLOGQZ128rrik:
2667 case X86::VPTERNLOGQZ256rrik:
2668 case X86::VPTERNLOGDZrrikz:
2669 case X86::VPTERNLOGDZrmikz:
2670 case X86::VPTERNLOGDZ128rrikz:
2671 case X86::VPTERNLOGDZ128rmikz:
2672 case X86::VPTERNLOGDZ256rrikz:
2673 case X86::VPTERNLOGDZ256rmikz:
2674 case X86::VPTERNLOGQZrrikz:
2675 case X86::VPTERNLOGQZrmikz:
2676 case X86::VPTERNLOGQZ128rrikz:
2677 case X86::VPTERNLOGQZ128rmikz:
2678 case X86::VPTERNLOGQZ256rrikz:
2679 case X86::VPTERNLOGQZ256rmikz:
2680 case X86::VPTERNLOGDZ128rmbi:
2681 case X86::VPTERNLOGDZ256rmbi:
2682 case X86::VPTERNLOGDZrmbi:
2683 case X86::VPTERNLOGQZ128rmbi:
2684 case X86::VPTERNLOGQZ256rmbi:
2685 case X86::VPTERNLOGQZrmbi:
2686 case X86::VPTERNLOGDZ128rmbikz:
2687 case X86::VPTERNLOGDZ256rmbikz:
2688 case X86::VPTERNLOGDZrmbikz:
2689 case X86::VPTERNLOGQZ128rmbikz:
2690 case X86::VPTERNLOGQZ256rmbikz:
2691 case X86::VPTERNLOGQZrmbikz: {
2692 WorkingMI = CloneIfNew(MI);
2693 commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2);
2694 break;
2695 }
2696 default:
2697 if (isCommutableVPERMV3Instruction(Opc)) {
2698 WorkingMI = CloneIfNew(MI);
2699 WorkingMI->setDesc(get(getCommutedVPERMV3Opcode(Opc)));
2700 break;
2701 }
2702
2703 if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) {
2704 WorkingMI = CloneIfNew(MI);
2705 WorkingMI->setDesc(
2706 get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group)));
2707 break;
2708 }
2709 }
2710 return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2711}
2712
2713bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2714 unsigned &SrcOpIdx1,
2715 unsigned &SrcOpIdx2,
2716 bool IsIntrinsic) const {
2717 uint64_t TSFlags = MI.getDesc().TSFlags;
2718
2719 unsigned FirstCommutableVecOp = 1;
2720 unsigned LastCommutableVecOp = 3;
2721 unsigned KMaskOp = -1U;
2722 if (X86II::isKMasked(TSFlags)) {
2723 // For k-zero-masked operations it is OK to commute the first vector
2724 // operand, unless this is an intrinsic instruction.
2725 // For regular k-masked operations a conservative choice is made, as the
2726 // elements of the first vector operand, for which the corresponding bit
2727 // in the k-mask operand is set to 0, are copied to the result of the
2728 // instruction.
2729 // TODO/FIXME: The commute still may be legal if it is known that the
2730 // k-mask operand is set to either all ones or all zeroes.
2731 // It is also Ok to commute the 1st operand if all users of MI use only
2732 // the elements enabled by the k-mask operand. For example,
2733 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2734 // : v1[i];
2735 // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2736 // // Ok, to commute v1 in FMADD213PSZrk.
2737
2738 // The k-mask operand has index = 2 for masked and zero-masked operations.
2739 KMaskOp = 2;
2740
2741 // The operand with index = 1 is used as a source for those elements for
2742 // which the corresponding bit in the k-mask is set to 0.
2743 if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2744 FirstCommutableVecOp = 3;
2745
2746 LastCommutableVecOp++;
2747 } else if (IsIntrinsic) {
2748 // Commuting the first operand of an intrinsic instruction isn't possible
2749 // unless we can prove that only the lowest element of the result is used.
2750 FirstCommutableVecOp = 2;
2751 }
2752
2753 if (isMem(MI, LastCommutableVecOp))
2754 LastCommutableVecOp--;
2755
2756 // Only operands in [FirstCommutableVecOp, LastCommutableVecOp] are commutable.
2757 // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2758 // that the operand is not specified/fixed.
2759 if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2760 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2761 SrcOpIdx1 == KMaskOp))
2762 return false;
2763 if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2764 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2765 SrcOpIdx2 == KMaskOp))
2766 return false;
2767
2768 // Look for two different register operands assumed to be commutable
2769 // regardless of the FMA opcode. The FMA opcode is adjusted later.
2770 if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2771 SrcOpIdx2 == CommuteAnyOperandIndex) {
2772 unsigned CommutableOpIdx2 = SrcOpIdx2;
2773
2774 // At least one of operands to be commuted is not specified and
2775 // this method is free to choose appropriate commutable operands.
2776 if (SrcOpIdx1 == SrcOpIdx2)
2777 // Both of operands are not fixed. By default set one of commutable
2778 // operands to the last register operand of the instruction.
2779 CommutableOpIdx2 = LastCommutableVecOp;
2780 else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2781 // Only one of operands is not fixed.
2782 CommutableOpIdx2 = SrcOpIdx1;
2783
2784 // CommutableOpIdx2 is well defined now. Let's choose another commutable
2785 // operand and assign its index to CommutableOpIdx1.
2786 Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2787
2788 unsigned CommutableOpIdx1;
2789 for (CommutableOpIdx1 = LastCommutableVecOp;
2790 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2791 // Just ignore and skip the k-mask operand.
2792 if (CommutableOpIdx1 == KMaskOp)
2793 continue;
2794
2795 // The commuted operands must have different registers.
2796 // Otherwise, the commute transformation does not change anything and
2797 // is useless then.
2798 if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2799 break;
2800 }
2801
2802 // No appropriate commutable operands were found.
2803 if (CommutableOpIdx1 < FirstCommutableVecOp)
2804 return false;
2805
2806 // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
2807 // to return those values.
2808 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
2809 CommutableOpIdx2))
2810 return false;
2811 }
2812
2813 return true;
2814}
2815
2816 bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2817 unsigned &SrcOpIdx1,
2818 unsigned &SrcOpIdx2) const {
2819 const MCInstrDesc &Desc = MI.getDesc();
2820 if (!Desc.isCommutable())
2821 return false;
2822
2823 switch (MI.getOpcode()) {
2824 case X86::CMPSDrri:
2825 case X86::CMPSSrri:
2826 case X86::CMPPDrri:
2827 case X86::CMPPSrri:
2828 case X86::VCMPSDrri:
2829 case X86::VCMPSSrri:
2830 case X86::VCMPPDrri:
2831 case X86::VCMPPSrri:
2832 case X86::VCMPPDYrri:
2833 case X86::VCMPPSYrri:
2834 case X86::VCMPSDZrri:
2835 case X86::VCMPSSZrri:
2836 case X86::VCMPPDZrri:
2837 case X86::VCMPPSZrri:
2838 case X86::VCMPSHZrri:
2839 case X86::VCMPPHZrri:
2840 case X86::VCMPPHZ128rri:
2841 case X86::VCMPPHZ256rri:
2842 case X86::VCMPPDZ128rri:
2843 case X86::VCMPPSZ128rri:
2844 case X86::VCMPPDZ256rri:
2845 case X86::VCMPPSZ256rri:
2846 case X86::VCMPPDZrrik:
2847 case X86::VCMPPSZrrik:
2848 case X86::VCMPPDZ128rrik:
2849 case X86::VCMPPSZ128rrik:
2850 case X86::VCMPPDZ256rrik:
2851 case X86::VCMPPSZ256rrik: {
2852 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2853
2854 // Float comparison can be safely commuted for
2855 // Ordered/Unordered/Equal/NotEqual tests
2856 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2857 switch (Imm) {
2858 default:
2859 // EVEX versions can be commuted.
2860 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2861 break;
2862 return false;
2863 case 0x00: // EQUAL
2864 case 0x03: // UNORDERED
2865 case 0x04: // NOT EQUAL
2866 case 0x07: // ORDERED
2867 break;
2868 }
2869
2870 // The indices of the commutable operands are 1 and 2 (or 2 and 3
2871 // when masked).
2872 // Assign them to the returned operand indices here.
2873 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2874 2 + OpOffset);
2875 }
2876 case X86::MOVSSrr:
2877 // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2878 // form an SSE4.1 blend. We assume VMOVSSrr/VMOVSDrr are always commutable
2879 // since AVX implies SSE4.1.
2880 if (Subtarget.hasSSE41())
2881 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2882 return false;
2883 case X86::SHUFPDrri:
2884 // We can commute this to MOVSD.
2885 if (MI.getOperand(3).getImm() == 0x02)
2886 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2887 return false;
2888 case X86::MOVHLPSrr:
2889 case X86::UNPCKHPDrr:
2890 case X86::VMOVHLPSrr:
2891 case X86::VUNPCKHPDrr:
2892 case X86::VMOVHLPSZrr:
2893 case X86::VUNPCKHPDZ128rr:
2894 if (Subtarget.hasSSE2())
2895 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2896 return false;
2897 case X86::VPTERNLOGDZrri:
2898 case X86::VPTERNLOGDZrmi:
2899 case X86::VPTERNLOGDZ128rri:
2900 case X86::VPTERNLOGDZ128rmi:
2901 case X86::VPTERNLOGDZ256rri:
2902 case X86::VPTERNLOGDZ256rmi:
2903 case X86::VPTERNLOGQZrri:
2904 case X86::VPTERNLOGQZrmi:
2905 case X86::VPTERNLOGQZ128rri:
2906 case X86::VPTERNLOGQZ128rmi:
2907 case X86::VPTERNLOGQZ256rri:
2908 case X86::VPTERNLOGQZ256rmi:
2909 case X86::VPTERNLOGDZrrik:
2910 case X86::VPTERNLOGDZ128rrik:
2911 case X86::VPTERNLOGDZ256rrik:
2912 case X86::VPTERNLOGQZrrik:
2913 case X86::VPTERNLOGQZ128rrik:
2914 case X86::VPTERNLOGQZ256rrik:
2915 case X86::VPTERNLOGDZrrikz:
2916 case X86::VPTERNLOGDZrmikz:
2917 case X86::VPTERNLOGDZ128rrikz:
2918 case X86::VPTERNLOGDZ128rmikz:
2919 case X86::VPTERNLOGDZ256rrikz:
2920 case X86::VPTERNLOGDZ256rmikz:
2921 case X86::VPTERNLOGQZrrikz:
2922 case X86::VPTERNLOGQZrmikz:
2923 case X86::VPTERNLOGQZ128rrikz:
2924 case X86::VPTERNLOGQZ128rmikz:
2925 case X86::VPTERNLOGQZ256rrikz:
2926 case X86::VPTERNLOGQZ256rmikz:
2927 case X86::VPTERNLOGDZ128rmbi:
2928 case X86::VPTERNLOGDZ256rmbi:
2929 case X86::VPTERNLOGDZrmbi:
2930 case X86::VPTERNLOGQZ128rmbi:
2931 case X86::VPTERNLOGQZ256rmbi:
2932 case X86::VPTERNLOGQZrmbi:
2933 case X86::VPTERNLOGDZ128rmbikz:
2934 case X86::VPTERNLOGDZ256rmbikz:
2935 case X86::VPTERNLOGDZrmbikz:
2936 case X86::VPTERNLOGQZ128rmbikz:
2937 case X86::VPTERNLOGQZ256rmbikz:
2938 case X86::VPTERNLOGQZrmbikz:
2939 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2940 case X86::VPDPWSSDYrr:
2941 case X86::VPDPWSSDrr:
2942 case X86::VPDPWSSDSYrr:
2943 case X86::VPDPWSSDSrr:
2944 case X86::VPDPWUUDrr:
2945 case X86::VPDPWUUDYrr:
2946 case X86::VPDPWUUDSrr:
2947 case X86::VPDPWUUDSYrr:
2948 case X86::VPDPBSSDSrr:
2949 case X86::VPDPBSSDSYrr:
2950 case X86::VPDPBSSDrr:
2951 case X86::VPDPBSSDYrr:
2952 case X86::VPDPBUUDSrr:
2953 case X86::VPDPBUUDSYrr:
2954 case X86::VPDPBUUDrr:
2955 case X86::VPDPBUUDYrr:
2956 case X86::VPDPBSSDSZ128r:
2957 case X86::VPDPBSSDSZ128rk:
2958 case X86::VPDPBSSDSZ128rkz:
2959 case X86::VPDPBSSDSZ256r:
2960 case X86::VPDPBSSDSZ256rk:
2961 case X86::VPDPBSSDSZ256rkz:
2962 case X86::VPDPBSSDSZr:
2963 case X86::VPDPBSSDSZrk:
2964 case X86::VPDPBSSDSZrkz:
2965 case X86::VPDPBSSDZ128r:
2966 case X86::VPDPBSSDZ128rk:
2967 case X86::VPDPBSSDZ128rkz:
2968 case X86::VPDPBSSDZ256r:
2969 case X86::VPDPBSSDZ256rk:
2970 case X86::VPDPBSSDZ256rkz:
2971 case X86::VPDPBSSDZr:
2972 case X86::VPDPBSSDZrk:
2973 case X86::VPDPBSSDZrkz:
2974 case X86::VPDPBUUDSZ128r:
2975 case X86::VPDPBUUDSZ128rk:
2976 case X86::VPDPBUUDSZ128rkz:
2977 case X86::VPDPBUUDSZ256r:
2978 case X86::VPDPBUUDSZ256rk:
2979 case X86::VPDPBUUDSZ256rkz:
2980 case X86::VPDPBUUDSZr:
2981 case X86::VPDPBUUDSZrk:
2982 case X86::VPDPBUUDSZrkz:
2983 case X86::VPDPBUUDZ128r:
2984 case X86::VPDPBUUDZ128rk:
2985 case X86::VPDPBUUDZ128rkz:
2986 case X86::VPDPBUUDZ256r:
2987 case X86::VPDPBUUDZ256rk:
2988 case X86::VPDPBUUDZ256rkz:
2989 case X86::VPDPBUUDZr:
2990 case X86::VPDPBUUDZrk:
2991 case X86::VPDPBUUDZrkz:
2992 case X86::VPDPWSSDZ128r:
2993 case X86::VPDPWSSDZ128rk:
2994 case X86::VPDPWSSDZ128rkz:
2995 case X86::VPDPWSSDZ256r:
2996 case X86::VPDPWSSDZ256rk:
2997 case X86::VPDPWSSDZ256rkz:
2998 case X86::VPDPWSSDZr:
2999 case X86::VPDPWSSDZrk:
3000 case X86::VPDPWSSDZrkz:
3001 case X86::VPDPWSSDSZ128r:
3002 case X86::VPDPWSSDSZ128rk:
3003 case X86::VPDPWSSDSZ128rkz:
3004 case X86::VPDPWSSDSZ256r:
3005 case X86::VPDPWSSDSZ256rk:
3006 case X86::VPDPWSSDSZ256rkz:
3007 case X86::VPDPWSSDSZr:
3008 case X86::VPDPWSSDSZrk:
3009 case X86::VPDPWSSDSZrkz:
3010 case X86::VPDPWUUDZ128r:
3011 case X86::VPDPWUUDZ128rk:
3012 case X86::VPDPWUUDZ128rkz:
3013 case X86::VPDPWUUDZ256r:
3014 case X86::VPDPWUUDZ256rk:
3015 case X86::VPDPWUUDZ256rkz:
3016 case X86::VPDPWUUDZr:
3017 case X86::VPDPWUUDZrk:
3018 case X86::VPDPWUUDZrkz:
3019 case X86::VPDPWUUDSZ128r:
3020 case X86::VPDPWUUDSZ128rk:
3021 case X86::VPDPWUUDSZ128rkz:
3022 case X86::VPDPWUUDSZ256r:
3023 case X86::VPDPWUUDSZ256rk:
3024 case X86::VPDPWUUDSZ256rkz:
3025 case X86::VPDPWUUDSZr:
3026 case X86::VPDPWUUDSZrk:
3027 case X86::VPDPWUUDSZrkz:
3028 case X86::VPMADD52HUQrr:
3029 case X86::VPMADD52HUQYrr:
3030 case X86::VPMADD52HUQZ128r:
3031 case X86::VPMADD52HUQZ128rk:
3032 case X86::VPMADD52HUQZ128rkz:
3033 case X86::VPMADD52HUQZ256r:
3034 case X86::VPMADD52HUQZ256rk:
3035 case X86::VPMADD52HUQZ256rkz:
3036 case X86::VPMADD52HUQZr:
3037 case X86::VPMADD52HUQZrk:
3038 case X86::VPMADD52HUQZrkz:
3039 case X86::VPMADD52LUQrr:
3040 case X86::VPMADD52LUQYrr:
3041 case X86::VPMADD52LUQZ128r:
3042 case X86::VPMADD52LUQZ128rk:
3043 case X86::VPMADD52LUQZ128rkz:
3044 case X86::VPMADD52LUQZ256r:
3045 case X86::VPMADD52LUQZ256rk:
3046 case X86::VPMADD52LUQZ256rkz:
3047 case X86::VPMADD52LUQZr:
3048 case X86::VPMADD52LUQZrk:
3049 case X86::VPMADD52LUQZrkz:
3050 case X86::VFMADDCPHZr:
3051 case X86::VFMADDCPHZrk:
3052 case X86::VFMADDCPHZrkz:
3053 case X86::VFMADDCPHZ128r:
3054 case X86::VFMADDCPHZ128rk:
3055 case X86::VFMADDCPHZ128rkz:
3056 case X86::VFMADDCPHZ256r:
3057 case X86::VFMADDCPHZ256rk:
3058 case X86::VFMADDCPHZ256rkz:
3059 case X86::VFMADDCSHZr:
3060 case X86::VFMADDCSHZrk:
3061 case X86::VFMADDCSHZrkz: {
3062 unsigned CommutableOpIdx1 = 2;
3063 unsigned CommutableOpIdx2 = 3;
3064 if (X86II::isKMasked(Desc.TSFlags)) {
3065 // Skip the mask register.
3066 ++CommutableOpIdx1;
3067 ++CommutableOpIdx2;
3068 }
3069 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3070 CommutableOpIdx2))
3071 return false;
3072 if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
3073 // No idea.
3074 return false;
3075 return true;
3076 }
3077
3078 default:
3079 const X86InstrFMA3Group *FMA3Group =
3080 getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
3081 if (FMA3Group)
3082 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
3083 FMA3Group->isIntrinsic());
3084
3085 // Handle masked instructions since we need to skip over the mask input
3086 // and the preserved input.
3087 if (X86II::isKMasked(Desc.TSFlags)) {
3088 // First assume that the first input is the mask operand and skip past it.
3089 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
3090 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
3091 // Check if the first input is tied. If there isn't one then we only
3092 // need to skip the mask operand which we did above.
3093 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
3094 MCOI::TIED_TO) != -1)) {
3095 // If this is zero masking instruction with a tied operand, we need to
3096 // move the first index back to the first input since this must
3097 // be a 3 input instruction and we want the first two non-mask inputs.
3098 // Otherwise this is a 2 input instruction with a preserved input and
3099 // mask, so we need to move the indices to skip one more input.
3100 if (X86II::isKMergeMasked(Desc.TSFlags)) {
3101 ++CommutableOpIdx1;
3102 ++CommutableOpIdx2;
3103 } else {
3104 --CommutableOpIdx1;
3105 }
3106 }
3107
3108 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3109 CommutableOpIdx2))
3110 return false;
3111
3112 if (!MI.getOperand(SrcOpIdx1).isReg() ||
3113 !MI.getOperand(SrcOpIdx2).isReg())
3114 // No idea.
3115 return false;
3116 return true;
3117 }
3118
3119 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
3120 }
3121 return false;
3122}
3123
3124 static bool isConvertibleLEA(MachineInstr *MI) {
3125 unsigned Opcode = MI->getOpcode();
3126 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
3127 Opcode != X86::LEA64_32r)
3128 return false;
3129
3130 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
3131 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
3132 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
3133
3134 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
3135 Scale.getImm() > 1)
3136 return false;
3137
3138 return true;
3139}
3140
3141 bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
3142 // Currently we're interested in the following sequence only:
3143 // r3 = lea r1, r2
3144 // r5 = add r3, r4
3145 // Both r3 and r4 are killed in the add; we hope the add instruction has the
3146 // operand order
3147 // r5 = add r4, r3
3148 // So later in X86FixupLEAs the lea instruction can be rewritten as add.
3149 unsigned Opcode = MI.getOpcode();
3150 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
3151 return false;
3152
3153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3154 Register Reg1 = MI.getOperand(1).getReg();
3155 Register Reg2 = MI.getOperand(2).getReg();
3156
3157 // Check if Reg1 comes from LEA in the same MBB.
3158 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
3159 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3160 Commute = true;
3161 return true;
3162 }
3163 }
3164
3165 // Check if Reg2 comes from LEA in the same MBB.
3166 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
3167 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3168 Commute = false;
3169 return true;
3170 }
3171 }
3172
3173 return false;
3174}
3175
3176 int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
3177 unsigned Opcode = MCID.getOpcode();
3178 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isSETZUCC(Opcode) ||
3179 X86::isCMOVCC(Opcode) || X86::isCFCMOVCC(Opcode) ||
3180 X86::isCCMPCC(Opcode) || X86::isCTESTCC(Opcode)))
3181 return -1;
3182 // Assume that the condition code is always the last use operand.
3183 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
3184 return NumUses - 1;
3185}
3186
3187 X86::CondCode X86::getCondFromMI(const MachineInstr &MI) {
3188 const MCInstrDesc &MCID = MI.getDesc();
3189 int CondNo = getCondSrcNoFromDesc(MCID);
3190 if (CondNo < 0)
3191 return X86::COND_INVALID;
3192 CondNo += MCID.getNumDefs();
3193 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
3194}
3195
3196 X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
3197 return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3198 : X86::COND_INVALID;
3199 }
3200
3201 X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
3202 return X86::isSETCC(MI.getOpcode()) || X86::isSETZUCC(MI.getOpcode())
3203 ? X86::getCondFromMI(MI)
3204 : X86::COND_INVALID;
3205 }
3206
3207 X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
3208 return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3209 : X86::COND_INVALID;
3210 }
3211
3212 X86::CondCode X86::getCondFromCFCMov(const MachineInstr &MI) {
3213 return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3214 : X86::COND_INVALID;
3215 }
3216
3217 X86::CondCode X86::getCondFromCCMP(const MachineInstr &MI) {
3218 return X86::isCCMPCC(MI.getOpcode()) || X86::isCTESTCC(MI.getOpcode())
3219 ? X86::getCondFromMI(MI)
3220 : X86::COND_INVALID;
3221 }
3222
3223 int X86::getCCMPCondFlagsFromCondCode(X86::CondCode CC) {
3224 // CCMP/CTEST has two conditional operands:
3225 // - SCC: source condition code (same as CMOV)
3226 // - DCF: destination conditional flags, which has 4 valid bits
3227 //
3228 // +----+----+----+----+
3229 // | OF | SF | ZF | CF |
3230 // +----+----+----+----+
3231 //
3232 // If SCC (the source condition code) evaluates to false, CCMP/CTEST updates
3233 // the conditional flags as follows:
3234 //
3235 // OF = DCF.OF
3236 // SF = DCF.SF
3237 // ZF = DCF.ZF
3238 // CF = DCF.CF
3239 // PF = DCF.CF
3240 // AF = 0 (Auxiliary Carry Flag)
3241 //
3242 // Otherwise, the CMP or TEST is executed and it updates the
3243 // CSPAZO flags normally.
3244 //
3245 // NOTE:
3246 // If SCC = P, then SCC evaluates to true regardless of the CSPAZO value.
3247 // If SCC = NP, then SCC evaluates to false regardless of the CSPAZO value.
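//
// For every condition code handled below, the returned DCF bits are exactly
// the flags needed to make that condition evaluate to true (e.g. COND_BE
// maps to CF, since BE tests CF | ZF).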
3248
3249 enum { CF = 1, ZF = 2, SF = 4, OF = 8, PF = CF };
3250
3251 switch (CC) {
3252 default:
3253 llvm_unreachable("Illegal condition code!");
3254 case X86::COND_NO:
3255 case X86::COND_NE:
3256 case X86::COND_GE:
3257 case X86::COND_G:
3258 case X86::COND_AE:
3259 case X86::COND_A:
3260 case X86::COND_NS:
3261 case X86::COND_NP:
3262 return 0;
3263 case X86::COND_O:
3264 return OF;
3265 case X86::COND_B:
3266 case X86::COND_BE:
3267 return CF;
3268 break;
3269 case X86::COND_E:
3270 case X86::COND_LE:
3271 return ZF;
3272 case X86::COND_S:
3273 case X86::COND_L:
3274 return SF;
3275 case X86::COND_P:
3276 return PF;
3277 }
3278}
3279
3280#define GET_X86_NF_TRANSFORM_TABLE
3281#define GET_X86_ND2NONND_TABLE
3282#include "X86GenInstrMapping.inc"
3283
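// Both generated tables are sorted by the original opcode, so a binary search
// (lower_bound) is sufficient; a result of 0 means no such variant exists.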
3284 static unsigned getNewOpcFromTable(ArrayRef<X86TableEntry> Table,
3285 unsigned Opc) {
3286 const auto I = llvm::lower_bound(Table, Opc);
3287 return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
3288}
3289unsigned X86::getNFVariant(unsigned Opc) {
3290 return getNewOpcFromTable(X86NFTransformTable, Opc);
3291}
3292
3293unsigned X86::getNonNDVariant(unsigned Opc) {
3294 return getNewOpcFromTable(X86ND2NonNDTable, Opc);
3295}
3296
3297/// Return the inverse of the specified condition,
3298/// e.g. turning COND_E to COND_NE.
3299 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
3300 switch (CC) {
3301 default:
3302 llvm_unreachable("Illegal condition code!");
3303 case X86::COND_E:
3304 return X86::COND_NE;
3305 case X86::COND_NE:
3306 return X86::COND_E;
3307 case X86::COND_L:
3308 return X86::COND_GE;
3309 case X86::COND_LE:
3310 return X86::COND_G;
3311 case X86::COND_G:
3312 return X86::COND_LE;
3313 case X86::COND_GE:
3314 return X86::COND_L;
3315 case X86::COND_B:
3316 return X86::COND_AE;
3317 case X86::COND_BE:
3318 return X86::COND_A;
3319 case X86::COND_A:
3320 return X86::COND_BE;
3321 case X86::COND_AE:
3322 return X86::COND_B;
3323 case X86::COND_S:
3324 return X86::COND_NS;
3325 case X86::COND_NS:
3326 return X86::COND_S;
3327 case X86::COND_P:
3328 return X86::COND_NP;
3329 case X86::COND_NP:
3330 return X86::COND_P;
3331 case X86::COND_O:
3332 return X86::COND_NO;
3333 case X86::COND_NO:
3334 return X86::COND_O;
3335 case X86::COND_NE_OR_P:
3336 return X86::COND_E_AND_NP;
3337 case X86::COND_E_AND_NP:
3338 return X86::COND_NE_OR_P;
3339 }
3340}
3341
3342/// Assuming the flags are set by MI(a,b), return the condition code if we
3343/// modify the instructions such that flags are set by MI(b,a).
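/// For example, a 'cmp a, b; jl' sequence is equivalent to 'cmp b, a; jg'.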
3344 X86::CondCode X86::getSwappedCondition(X86::CondCode CC) {
3345 switch (CC) {
3346 default:
3347 return X86::COND_INVALID;
3348 case X86::COND_E:
3349 return X86::COND_E;
3350 case X86::COND_NE:
3351 return X86::COND_NE;
3352 case X86::COND_L:
3353 return X86::COND_G;
3354 case X86::COND_LE:
3355 return X86::COND_GE;
3356 case X86::COND_G:
3357 return X86::COND_L;
3358 case X86::COND_GE:
3359 return X86::COND_LE;
3360 case X86::COND_B:
3361 return X86::COND_A;
3362 case X86::COND_BE:
3363 return X86::COND_AE;
3364 case X86::COND_A:
3365 return X86::COND_B;
3366 case X86::COND_AE:
3367 return X86::COND_BE;
3368 }
3369}
3370
3371 std::pair<X86::CondCode, bool>
3372 X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
3373 X86::CondCode CC = X86::COND_INVALID;
3374 bool NeedSwap = false;
3375 switch (Predicate) {
3376 default:
3377 break;
3378 // Floating-point Predicates
3379 case CmpInst::FCMP_UEQ:
3380 CC = X86::COND_E;
3381 break;
3382 case CmpInst::FCMP_OLT:
3383 NeedSwap = true;
3384 [[fallthrough]];
3385 case CmpInst::FCMP_OGT:
3386 CC = X86::COND_A;
3387 break;
3388 case CmpInst::FCMP_OLE:
3389 NeedSwap = true;
3390 [[fallthrough]];
3391 case CmpInst::FCMP_OGE:
3392 CC = X86::COND_AE;
3393 break;
3394 case CmpInst::FCMP_UGT:
3395 NeedSwap = true;
3396 [[fallthrough]];
3397 case CmpInst::FCMP_ULT:
3398 CC = X86::COND_B;
3399 break;
3400 case CmpInst::FCMP_UGE:
3401 NeedSwap = true;
3402 [[fallthrough]];
3403 case CmpInst::FCMP_ULE:
3404 CC = X86::COND_BE;
3405 break;
3406 case CmpInst::FCMP_ONE:
3407 CC = X86::COND_NE;
3408 break;
3409 case CmpInst::FCMP_UNO:
3410 CC = X86::COND_P;
3411 break;
3412 case CmpInst::FCMP_ORD:
3413 CC = X86::COND_NP;
3414 break;
3415 case CmpInst::FCMP_OEQ:
3416 [[fallthrough]];
3417 case CmpInst::FCMP_UNE:
3418 CC = X86::COND_INVALID;
3419 break;
3420
3421 // Integer Predicates
3422 case CmpInst::ICMP_EQ:
3423 CC = X86::COND_E;
3424 break;
3425 case CmpInst::ICMP_NE:
3426 CC = X86::COND_NE;
3427 break;
3428 case CmpInst::ICMP_UGT:
3429 CC = X86::COND_A;
3430 break;
3431 case CmpInst::ICMP_UGE:
3432 CC = X86::COND_AE;
3433 break;
3434 case CmpInst::ICMP_ULT:
3435 CC = X86::COND_B;
3436 break;
3437 case CmpInst::ICMP_ULE:
3438 CC = X86::COND_BE;
3439 break;
3440 case CmpInst::ICMP_SGT:
3441 CC = X86::COND_G;
3442 break;
3443 case CmpInst::ICMP_SGE:
3444 CC = X86::COND_GE;
3445 break;
3446 case CmpInst::ICMP_SLT:
3447 CC = X86::COND_L;
3448 break;
3449 case CmpInst::ICMP_SLE:
3450 CC = X86::COND_LE;
3451 break;
3452 }
3453
3454 return std::make_pair(CC, NeedSwap);
3455}
3456
3457/// Return a cmov opcode for the given register size in bytes, and operand type.
3458unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
3459 bool HasNDD) {
3460 switch (RegBytes) {
3461 default:
3462 llvm_unreachable("Illegal register size!");
3463#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
3464 case 2:
3465 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
3466 : GET_ND_IF_ENABLED(X86::CMOV16rr);
3467 case 4:
3468 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
3469 : GET_ND_IF_ENABLED(X86::CMOV32rr);
3470 case 8:
3471 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
3472 : GET_ND_IF_ENABLED(X86::CMOV64rr);
3473 }
3474}
3475
3476/// Get the VPCMP immediate for the given condition.
3477 unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
3478 switch (CC) {
3479 default:
3480 llvm_unreachable("Unexpected SETCC condition");
3481 case ISD::SETNE:
3482 return 4;
3483 case ISD::SETEQ:
3484 return 0;
3485 case ISD::SETULT:
3486 case ISD::SETLT:
3487 return 1;
3488 case ISD::SETUGT:
3489 case ISD::SETGT:
3490 return 6;
3491 case ISD::SETUGE:
3492 case ISD::SETGE:
3493 return 5;
3494 case ISD::SETULE:
3495 case ISD::SETLE:
3496 return 2;
3497 }
3498}
3499
3500/// Get the VPCMP immediate if the operands are swapped.
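/// For example, LT (0x01) becomes NLE (0x06), since a < b iff !(b <= a).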
3501unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
3502 switch (Imm) {
3503 default:
3504 llvm_unreachable("Unreachable!");
3505 case 0x01:
3506 Imm = 0x06;
3507 break; // LT -> NLE
3508 case 0x02:
3509 Imm = 0x05;
3510 break; // LE -> NLT
3511 case 0x05:
3512 Imm = 0x02;
3513 break; // NLT -> LE
3514 case 0x06:
3515 Imm = 0x01;
3516 break; // NLE -> LT
3517 case 0x00: // EQ
3518 case 0x03: // FALSE
3519 case 0x04: // NE
3520 case 0x07: // TRUE
3521 break;
3522 }
3523
3524 return Imm;
3525}
3526
3527/// Get the VPCOM immediate if the operands are swapped.
3528unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
3529 switch (Imm) {
3530 default:
3531 llvm_unreachable("Unreachable!");
3532 case 0x00:
3533 Imm = 0x02;
3534 break; // LT -> GT
3535 case 0x01:
3536 Imm = 0x03;
3537 break; // LE -> GE
3538 case 0x02:
3539 Imm = 0x00;
3540 break; // GT -> LT
3541 case 0x03:
3542 Imm = 0x01;
3543 break; // GE -> LE
3544 case 0x04: // EQ
3545 case 0x05: // NE
3546 case 0x06: // FALSE
3547 case 0x07: // TRUE
3548 break;
3549 }
3550
3551 return Imm;
3552}
3553
3554/// Get the VCMP immediate if the operands are swapped.
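/// For example, 0x01 (LT_OS) becomes 0x0E (GT_OS); bit 4 is left unchanged.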
3555unsigned X86::getSwappedVCMPImm(unsigned Imm) {
3556 // Only need the lower 2 bits to distinguish.
3557 switch (Imm & 0x3) {
3558 default:
3559 llvm_unreachable("Unreachable!");
3560 case 0x00:
3561 case 0x03:
3562 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
3563 break;
3564 case 0x01:
3565 case 0x02:
3566 // Need to toggle bits 3:0. Bit 4 stays the same.
3567 Imm ^= 0xf;
3568 break;
3569 }
3570
3571 return Imm;
3572}
3573
3574 unsigned X86::getVectorRegisterWidth(const MCOperandInfo &Info) {
3575 if (Info.RegClass == X86::VR128RegClassID ||
3576 Info.RegClass == X86::VR128XRegClassID)
3577 return 128;
3578 if (Info.RegClass == X86::VR256RegClassID ||
3579 Info.RegClass == X86::VR256XRegClassID)
3580 return 256;
3581 if (Info.RegClass == X86::VR512RegClassID)
3582 return 512;
3583 llvm_unreachable("Unknown register class!");
3584}
3585
3586 /// Return true if Reg is an X87 register.
3587static bool isX87Reg(unsigned Reg) {
3588 return (Reg == X86::FPCW || Reg == X86::FPSW ||
3589 (Reg >= X86::ST0 && Reg <= X86::ST7));
3590}
3591
3592 /// Check if the instruction is an X87 instruction.
3593 bool X86::isX87Instruction(MachineInstr &MI) {
3594 // Calls and inline asm define X87 registers, so we special-case them here;
3595 // otherwise they would be incorrectly flagged as X87 instructions.
3597 if (MI.isCall() || MI.isInlineAsm())
3598 return false;
3599 for (const MachineOperand &MO : MI.operands()) {
3600 if (!MO.isReg())
3601 continue;
3602 if (isX87Reg(MO.getReg()))
3603 return true;
3604 }
3605 return false;
3606}
3607
3608 int X86::getFirstAddrOperandIdx(const MachineInstr &MI) {
3609 auto IsMemOp = [](const MCOperandInfo &OpInfo) {
3610 return OpInfo.OperandType == MCOI::OPERAND_MEMORY;
3611 };
3612
3613 const MCInstrDesc &Desc = MI.getDesc();
3614
3615 // Directly invoke the MC-layer routine for real (i.e., non-pseudo)
3616 // instructions (fast case).
3617 if (!X86II::isPseudo(Desc.TSFlags)) {
3618 int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
3619 if (MemRefIdx >= 0)
3620 return MemRefIdx + X86II::getOperandBias(Desc);
3621#ifdef EXPENSIVE_CHECKS
3622 assert(none_of(Desc.operands(), IsMemOp) &&
3623 "Got false negative from X86II::getMemoryOperandNo()!");
3624#endif
3625 return -1;
3626 }
3627
3628 // Otherwise, handle pseudo instructions by examining the type of their
3629 // operands (slow case). An instruction cannot have a memory reference if it
3630 // has fewer than AddrNumOperands (= 5) explicit operands.
3631 unsigned NumOps = Desc.getNumOperands();
3632 if (NumOps < X86::AddrNumOperands) {
3633#ifdef EXPENSIVE_CHECKS
3634 assert(none_of(Desc.operands(), IsMemOp) &&
3635 "Expected no operands to have OPERAND_MEMORY type!");
3636#endif
3637 return -1;
3638 }
3639
3640 // The first operand with type OPERAND_MEMORY indicates the start of a memory
3641 // reference. We expect the following AddrNumOperands-1 operands to also have
3642 // OPERAND_MEMORY type.
3643 for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) {
3644 if (IsMemOp(Desc.operands()[I])) {
3645#ifdef EXPENSIVE_CHECKS
3646 assert(std::all_of(Desc.operands().begin() + I,
3647 Desc.operands().begin() + I + X86::AddrNumOperands,
3648 IsMemOp) &&
3649 "Expected all five operands in the memory reference to have "
3650 "OPERAND_MEMORY type!");
3651#endif
3652 return I;
3653 }
3654 }
3655
3656 return -1;
3657}
3658
3659 const Constant *X86::getConstantFromPool(const MachineInstr &MI,
3660 unsigned OpNo) {
3661 assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) &&
3662 "Unexpected number of operands!");
3663
3664 const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg);
3665 if (!Index.isReg() || Index.getReg() != X86::NoRegister)
3666 return nullptr;
3667
3668 const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp);
3669 if (!Disp.isCPI() || Disp.getOffset() != 0)
3670 return nullptr;
3671
3672 ArrayRef<MachineConstantPoolEntry> Constants =
3673 MI.getParent()->getParent()->getConstantPool()->getConstants();
3674 const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()];
3675
3676 // Bail if this is a machine constant pool entry; we won't be able to dig out
3677 // anything useful.
3678 if (ConstantEntry.isMachineConstantPoolEntry())
3679 return nullptr;
3680
3681 return ConstantEntry.Val.ConstVal;
3682}
3683
3684 bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
3685 switch (MI.getOpcode()) {
3686 case X86::TCRETURNdi:
3687 case X86::TCRETURNri:
3688 case X86::TCRETURNmi:
3689 case X86::TCRETURNdi64:
3690 case X86::TCRETURNri64:
3691 case X86::TCRETURNmi64:
3692 return true;
3693 default:
3694 return false;
3695 }
3696}
3697
3700 const MachineInstr &TailCall) const {
3701
3702 const MachineFunction *MF = TailCall.getMF();
3703
3704 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3705 // The kernel patches thunk calls at runtime; these should never be conditional.
3706 const MachineOperand &Target = TailCall.getOperand(0);
3707 if (Target.isSymbol()) {
3708 StringRef Symbol(Target.getSymbolName());
3709 // This is currently only relevant to the r11/kernel indirect thunk.
3710 if (Symbol == "__x86_indirect_thunk_r11")
3711 return false;
3712 }
3713 }
3714
3715 if (TailCall.getOpcode() != X86::TCRETURNdi &&
3716 TailCall.getOpcode() != X86::TCRETURNdi64) {
3717 // Only direct calls can be done with a conditional branch.
3718 return false;
3719 }
3720
3721 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3722 // Conditional tail calls confuse the Win64 unwinder.
3723 return false;
3724 }
3725
3726 assert(BranchCond.size() == 1);
3727 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3728 // Can't make a conditional tail call with this condition.
3729 return false;
3730 }
3731
3733 if (X86FI->getTCReturnAddrDelta() != 0 ||
3734 TailCall.getOperand(1).getImm() != 0) {
3735 // A conditional tail call cannot do any stack adjustment.
3736 return false;
3737 }
3738
3739 return true;
3740}
3741
3744 const MachineInstr &TailCall) const {
3745 assert(canMakeTailCallConditional(BranchCond, TailCall));
3746
3748 while (I != MBB.begin()) {
3749 --I;
3750 if (I->isDebugInstr())
3751 continue;
3752 if (!I->isBranch())
3753 assert(0 && "Can't find the branch to replace!");
3754
3756 assert(BranchCond.size() == 1);
3757 if (CC != BranchCond[0].getImm())
3758 continue;
3759
3760 break;
3761 }
3762
3763 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3764 : X86::TCRETURNdi64cc;
3765
3766 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3767 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3768 MIB.addImm(0); // Stack offset (not used).
3769 MIB->addOperand(BranchCond[0]); // Condition.
3770 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3771
3772 // Add implicit uses and defs of all live regs potentially clobbered by the
3773 // call. This way they still appear live across the call.
3774 LivePhysRegs LiveRegs(getRegisterInfo());
3775 LiveRegs.addLiveOuts(MBB);
3777 LiveRegs.stepForward(*MIB, Clobbers);
3778 for (const auto &C : Clobbers) {
3779 MIB.addReg(C.first, RegState::Implicit);
3781 }
3782
3783 I->eraseFromParent();
3784}
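// Illustrative sketch of the rewrite performed above (not upstream text):
//   JCC_1 %bb.target, <cc>      ; where %bb.target only performs TCRETURNdi
// is replaced in this block by
//   TCRETURNdicc @callee, 0, <cc>, <implicit operands copied from the call>
// so the conditional branch itself becomes the conditional tail call.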
3785
3786// Given an MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3787// not be a fallthrough MBB now due to layout changes). Return nullptr if the
3788// fallthrough MBB cannot be identified.
3791 // Look for non-EHPad successors other than TBB. If we find exactly one, it
3792 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3793 // and fallthrough MBB. If we find more than one, we cannot identify the
3794 // fallthrough MBB and should return nullptr.
3795 MachineBasicBlock *FallthroughBB = nullptr;
3796 for (MachineBasicBlock *Succ : MBB->successors()) {
3797 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3798 continue;
3799 // Return nullptr if we find more than one fallthrough successor.
3800 if (FallthroughBB && FallthroughBB != TBB)
3801 return nullptr;
3802 FallthroughBB = Succ;
3803 }
3804 return FallthroughBB;
3805}
3806
3807bool X86InstrInfo::analyzeBranchImpl(
3810 SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3811
3812 // Start from the bottom of the block and work up, examining the
3813 // terminator instructions.
3815 MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3816 while (I != MBB.begin()) {
3817 --I;
3818 if (I->isDebugInstr())
3819 continue;
3820
3821 // Working from the bottom, when we see a non-terminator instruction, we're
3822 // done.
3823 if (!isUnpredicatedTerminator(*I))
3824 break;
3825
3826 // A terminator that isn't a branch can't easily be handled by this
3827 // analysis.
3828 if (!I->isBranch())
3829 return true;
3830
3831 // Handle unconditional branches.
3832 if (I->getOpcode() == X86::JMP_1) {
3833 UnCondBrIter = I;
3834
3835 if (!AllowModify) {
3836 TBB = I->getOperand(0).getMBB();
3837 continue;
3838 }
3839
3840 // If the block has any instructions after a JMP, delete them.
3841 MBB.erase(std::next(I), MBB.end());
3842
3843 Cond.clear();
3844 FBB = nullptr;
3845
3846 // Delete the JMP if it's equivalent to a fall-through.
3847 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3848 TBB = nullptr;
3849 I->eraseFromParent();
3850 I = MBB.end();
3851 UnCondBrIter = MBB.end();
3852 continue;
3853 }
3854
3855 // TBB is used to indicate the unconditional destination.
3856 TBB = I->getOperand(0).getMBB();
3857 continue;
3858 }
3859
3860 // Handle conditional branches.
3861 X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3862 if (BranchCode == X86::COND_INVALID)
3863 return true; // Can't handle indirect branch.
3864
3865 // In practice we should never have an undef EFLAGS operand; if we do,
3866 // abort here, as we are not prepared to preserve the flag.
3867 if (I->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->isUndef())
3868 return true;
3869
3870 // Working from the bottom, handle the first conditional branch.
3871 if (Cond.empty()) {
3872 FBB = TBB;
3873 TBB = I->getOperand(0).getMBB();
3874 Cond.push_back(MachineOperand::CreateImm(BranchCode));
3875 CondBranches.push_back(&*I);
3876 continue;
3877 }
3878
3879 // Handle subsequent conditional branches. Only handle the case where all
3880 // conditional branches branch to the same destination and their condition
3881 // opcodes fit one of the special multi-branch idioms.
3882 assert(Cond.size() == 1);
3883 assert(TBB);
3884
3885 // If the conditions are the same, we can leave them alone.
3886 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3887 auto NewTBB = I->getOperand(0).getMBB();
3888 if (OldBranchCode == BranchCode && TBB == NewTBB)
3889 continue;
3890
3891 // If they differ, see if they fit one of the known patterns. Theoretically,
3892 // we could handle more patterns here, but we shouldn't expect to see them
3893 // if instruction selection has done a reasonable job.
3894 if (TBB == NewTBB &&
3895 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3896 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
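      // Illustrative form of the NE_OR_P idiom recognized here (comment added
      // for exposition, not upstream text):
      //
      //   JNE B1
      //   JP  B1
      //
      // Both conditional branches target the same block, so the pair can be
      // folded into the single pseudo condition COND_NE_OR_P.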
3897 BranchCode = X86::COND_NE_OR_P;
3898 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3899 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3900 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3901 return true;
3902
3903 // X86::COND_E_AND_NP usually has two different branch destinations.
3904 //
3905 // JP B1
3906 // JE B2
3907 // JMP B1
3908 // B1:
3909 // B2:
3910 //
3911 // Here this condition branches to B2 only if NP && E. It has another
3912 // equivalent form:
3913 //
3914 // JNE B1
3915 // JNP B2
3916 // JMP B1
3917 // B1:
3918 // B2:
3919 //
3920 // Similarly, it branches to B2 only if E && NP. That is why this condition
3921 // is named COND_E_AND_NP.
3922 BranchCode = X86::COND_E_AND_NP;
3923 } else
3924 return true;
3925
3926 // Update the MachineOperand.
3927 Cond[0].setImm(BranchCode);
3928 CondBranches.push_back(&*I);
3929 }
3930
3931 return false;
3932}
3933
3936 MachineBasicBlock *&FBB,
3938 bool AllowModify) const {
3939 SmallVector<MachineInstr *, 4> CondBranches;
3940 return analyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3941}
3942
3944 const MCInstrDesc &Desc = MI.getDesc();
3945 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3946 assert(MemRefBegin >= 0 && "instr should have memory operand");
3947 MemRefBegin += X86II::getOperandBias(Desc);
3948
3949 const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp);
3950 if (!MO.isJTI())
3951 return -1;
3952
3953 return MO.getIndex();
3954}
3955
3957 Register Reg) {
3958 if (!Reg.isVirtual())
3959 return -1;
3960 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
3961 if (MI == nullptr)
3962 return -1;
3963 unsigned Opcode = MI->getOpcode();
3964 if (Opcode != X86::LEA64r && Opcode != X86::LEA32r)
3965 return -1;
3967}
3968
3970 unsigned Opcode = MI.getOpcode();
3971 // Switch-jump pattern for non-PIC code looks like:
3972 // JMP64m $noreg, 8, %X, %jump-table.X, $noreg
3973 if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) {
3975 }
3976 // The pattern for PIC code looks like:
3977 // %0 = LEA64r $rip, 1, $noreg, %jump-table.X
3978 // %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg
3979 // %2 = ADD64rr %1, %0
3980 // JMP64r %2
3981 if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) {
3982 Register Reg = MI.getOperand(0).getReg();
3983 if (!Reg.isVirtual())
3984 return -1;
3985 const MachineFunction &MF = *MI.getParent()->getParent();
3986 const MachineRegisterInfo &MRI = MF.getRegInfo();
3987 MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
3988 if (Add == nullptr)
3989 return -1;
3990 if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr)
3991 return -1;
3992 int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg());
3993 if (JTI1 >= 0)
3994 return JTI1;
3995 int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg());
3996 if (JTI2 >= 0)
3997 return JTI2;
3998 }
3999 return -1;
4000}
4001
4003 MachineBranchPredicate &MBP,
4004 bool AllowModify) const {
4005 using namespace std::placeholders;
4006
4008 SmallVector<MachineInstr *, 4> CondBranches;
4009 if (analyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
4010 AllowModify))
4011 return true;
4012
4013 if (Cond.size() != 1)
4014 return true;
4015
4016 assert(MBP.TrueDest && "expected!");
4017
4018 if (!MBP.FalseDest)
4019 MBP.FalseDest = MBB.getNextNode();
4020
4022
4023 MachineInstr *ConditionDef = nullptr;
4024 bool SingleUseCondition = true;
4025
4027 if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
4028 ConditionDef = &MI;
4029 break;
4030 }
4031
4032 if (MI.readsRegister(X86::EFLAGS, TRI))
4033 SingleUseCondition = false;
4034 }
4035
4036 if (!ConditionDef)
4037 return true;
4038
4039 if (SingleUseCondition) {
4040 for (auto *Succ : MBB.successors())
4041 if (Succ->isLiveIn(X86::EFLAGS))
4042 SingleUseCondition = false;
4043 }
4044
4045 MBP.ConditionDef = ConditionDef;
4046 MBP.SingleUseCondition = SingleUseCondition;
4047
4048 // Currently we only recognize the simple pattern:
4049 //
4050 // test %reg, %reg
4051 // je %label
4052 //
4053 const unsigned TestOpcode =
4054 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
4055
4056 if (ConditionDef->getOpcode() == TestOpcode &&
4057 ConditionDef->getNumOperands() == 3 &&
4058 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
4059 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
4060 MBP.LHS = ConditionDef->getOperand(0);
4061 MBP.RHS = MachineOperand::CreateImm(0);
4062 MBP.Predicate = Cond[0].getImm() == X86::COND_NE
4063 ? MachineBranchPredicate::PRED_NE
4064 : MachineBranchPredicate::PRED_EQ;
4065 return false;
4066 }
4067
4068 return true;
4069}
4070
4072 int *BytesRemoved) const {
4073 assert(!BytesRemoved && "code size not handled");
4074
4076 unsigned Count = 0;
4077
4078 while (I != MBB.begin()) {
4079 --I;
4080 if (I->isDebugInstr())
4081 continue;
4082 if (I->getOpcode() != X86::JMP_1 &&
4084 break;
4085 // Remove the branch.
4086 I->eraseFromParent();
4087 I = MBB.end();
4088 ++Count;
4089 }
4090
4091 return Count;
4092}
4093
4096 MachineBasicBlock *FBB,
4098 const DebugLoc &DL, int *BytesAdded) const {
4099 // Shouldn't be a fall through.
4100 assert(TBB && "insertBranch must not be told to insert a fallthrough");
4101 assert((Cond.size() == 1 || Cond.size() == 0) &&
4102 "X86 branch conditions have one component!");
4103 assert(!BytesAdded && "code size not handled");
4104
4105 if (Cond.empty()) {
4106 // Unconditional branch?
4107 assert(!FBB && "Unconditional branch with multiple successors!");
4108 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
4109 return 1;
4110 }
4111
4112 // If FBB is null, it is implied to be a fall-through block.
4113 bool FallThru = FBB == nullptr;
4114
4115 // Conditional branch.
4116 unsigned Count = 0;
4117 X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
4118 switch (CC) {
4119 case X86::COND_NE_OR_P:
4120 // Synthesize NE_OR_P with two branches.
4121 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
4122 ++Count;
4123 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
4124 ++Count;
4125 break;
4126 case X86::COND_E_AND_NP:
4127 // Use the next block of MBB as FBB if it is null.
4128 if (FBB == nullptr) {
4129 FBB = getFallThroughMBB(&MBB, TBB);
4130 assert(FBB && "MBB cannot be the last block in function when the false "
4131 "body is a fall-through.");
4132 }
4133 // Synthesize COND_E_AND_NP with two branches.
4134 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
4135 ++Count;
4136 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
4137 ++Count;
4138 break;
4139 default: {
4140 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
4141 ++Count;
4142 }
4143 }
4144 if (!FallThru) {
4145 // Two-way conditional branch. Insert the second branch.
4146 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
4147 ++Count;
4148 }
4149 return Count;
4150}
4151
4154 Register DstReg, Register TrueReg,
4155 Register FalseReg, int &CondCycles,
4156 int &TrueCycles, int &FalseCycles) const {
4157 // Not all subtargets have cmov instructions.
4158 if (!Subtarget.canUseCMOV())
4159 return false;
4160 if (Cond.size() != 1)
4161 return false;
4162 // We cannot do the composite conditions, at least not in SSA form.
4163 if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
4164 return false;
4165
4166 // Check register classes.
4168 const TargetRegisterClass *RC =
4169 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
4170 if (!RC)
4171 return false;
4172
4173 // We have cmov instructions for 16-, 32-, and 64-bit general-purpose registers.
4174 if (X86::GR16RegClass.hasSubClassEq(RC) ||
4175 X86::GR32RegClass.hasSubClassEq(RC) ||
4176 X86::GR64RegClass.hasSubClassEq(RC)) {
4177 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
4178 // Bridge. Probably Ivy Bridge as well.
4179 CondCycles = 2;
4180 TrueCycles = 2;
4181 FalseCycles = 2;
4182 return true;
4183 }
4184
4185 // Can't do vectors.
4186 return false;
4187}
4188
4191 const DebugLoc &DL, Register DstReg,
4193 Register FalseReg) const {
4195 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
4196 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
4197 assert(Cond.size() == 1 && "Invalid Cond array");
4198 unsigned Opc =
4199 X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
4200 false /*HasMemoryOperand*/, Subtarget.hasNDD());
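  // Descriptive note (not from the upstream file): the false value is added
  // first because CMOVcc overwrites its tied first source with the second
  // source only when the condition holds, so DstReg keeps FalseReg otherwise.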
4201 BuildMI(MBB, I, DL, get(Opc), DstReg)
4202 .addReg(FalseReg)
4203 .addReg(TrueReg)
4204 .addImm(Cond[0].getImm());
4205}
4206
4207/// Test if the given register is a physical H register.
4208static bool isHReg(unsigned Reg) {
4209 return X86::GR8_ABCD_HRegClass.contains(Reg);
4210}
4211
4212// Try to copy between VR128/VR64 and GR64 registers.
4213static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
4214 const X86Subtarget &Subtarget) {
4215 bool HasAVX = Subtarget.hasAVX();
4216 bool HasAVX512 = Subtarget.hasAVX512();
4217 bool HasEGPR = Subtarget.hasEGPR();
4218
4219 // SrcReg(MaskReg) -> DestReg(GR64)
4220 // SrcReg(MaskReg) -> DestReg(GR32)
4221
4222 // All KMASK register classes hold the same k registers, so any one of them
4223 // can be used for the containment test.
4224 if (X86::VK16RegClass.contains(SrcReg)) {
4225 if (X86::GR64RegClass.contains(DestReg)) {
4226 assert(Subtarget.hasBWI());
4227 return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk;
4228 }
4229 if (X86::GR32RegClass.contains(DestReg))
4230 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk)
4231 : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk);
4232 }
4233
4234 // SrcReg(GR64) -> DestReg(MaskReg)
4235 // SrcReg(GR32) -> DestReg(MaskReg)
4236
4237 // All KMASK register classes hold the same k registers, so any one of them
4238 // can be used for the containment test.
4239 if (X86::VK16RegClass.contains(DestReg)) {
4240 if (X86::GR64RegClass.contains(SrcReg)) {
4241 assert(Subtarget.hasBWI());
4242 return HasEGPR ? X86::KMOVQkr_EVEX : X86::KMOVQkr;
4243 }
4244 if (X86::GR32RegClass.contains(SrcReg))
4245 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4246 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4247 }
4248
4249 // SrcReg(VR128) -> DestReg(GR64)
4250 // SrcReg(VR64) -> DestReg(GR64)
4251 // SrcReg(GR64) -> DestReg(VR128)
4252 // SrcReg(GR64) -> DestReg(VR64)
4253
4254 if (X86::GR64RegClass.contains(DestReg)) {
4255 if (X86::VR128XRegClass.contains(SrcReg))
4256 // Copy from a VR128 register to a GR64 register.
4257 return HasAVX512 ? X86::VMOVPQIto64Zrr
4258 : HasAVX ? X86::VMOVPQIto64rr
4259 : X86::MOVPQIto64rr;
4260 if (X86::VR64RegClass.contains(SrcReg))
4261 // Copy from a VR64 register to a GR64 register.
4262 return X86::MMX_MOVD64from64rr;
4263 } else if (X86::GR64RegClass.contains(SrcReg)) {
4264 // Copy from a GR64 register to a VR128 register.
4265 if (X86::VR128XRegClass.contains(DestReg))
4266 return HasAVX512 ? X86::VMOV64toPQIZrr
4267 : HasAVX ? X86::VMOV64toPQIrr
4268 : X86::MOV64toPQIrr;
4269 // Copy from a GR64 register to a VR64 register.
4270 if (X86::VR64RegClass.contains(DestReg))
4271 return X86::MMX_MOVD64to64rr;
4272 }
4273
4274 // SrcReg(VR128) -> DestReg(GR32)
4275 // SrcReg(GR32) -> DestReg(VR128)
4276
4277 if (X86::GR32RegClass.contains(DestReg) &&
4278 X86::VR128XRegClass.contains(SrcReg))
4279 // Copy from a VR128 register to a GR32 register.
4280 return HasAVX512 ? X86::VMOVPDI2DIZrr
4281 : HasAVX ? X86::VMOVPDI2DIrr
4282 : X86::MOVPDI2DIrr;
4283
4284 if (X86::VR128XRegClass.contains(DestReg) &&
4285 X86::GR32RegClass.contains(SrcReg))
4286 // Copy from a GR32 register to a VR128 register.
4287 return HasAVX512 ? X86::VMOVDI2PDIZrr
4288 : HasAVX ? X86::VMOVDI2PDIrr
4289 : X86::MOVDI2PDIrr;
4290 return 0;
4291}
4292
4295 const DebugLoc &DL, MCRegister DestReg,
4296 MCRegister SrcReg, bool KillSrc) const {
4297 // First deal with the normal symmetric copies.
4298 bool HasAVX = Subtarget.hasAVX();
4299 bool HasVLX = Subtarget.hasVLX();
4300 bool HasEGPR = Subtarget.hasEGPR();
4301 unsigned Opc = 0;
4302 if (X86::GR64RegClass.contains(DestReg, SrcReg))
4303 Opc = X86::MOV64rr;
4304 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4305 Opc = X86::MOV32rr;
4306 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4307 Opc = X86::MOV16rr;
4308 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4309 // Copying to or from a physical H register on x86-64 requires a NOREX
4310 // move. Otherwise use a normal move.
4311 if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4312 Opc = X86::MOV8rr_NOREX;
4313 // Both operands must be encodable without a REX prefix.
4314 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4315 "8-bit H register can not be copied outside GR8_NOREX");
4316 } else
4317 Opc = X86::MOV8rr;
4318 } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4319 Opc = X86::MMX_MOVQ64rr;
4320 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4321 if (HasVLX)
4322 Opc = X86::VMOVAPSZ128rr;
4323 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4324 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4325 else {
4326 // If this is an extended register and we don't have VLX, we need to use a
4327 // 512-bit move.
4328 Opc = X86::VMOVAPSZrr;
4330 DestReg =
4331 TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
4332 SrcReg =
4333 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4334 }
4335 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
4336 if (HasVLX)
4337 Opc = X86::VMOVAPSZ256rr;
4338 else if (X86::VR256RegClass.contains(DestReg, SrcReg))
4339 Opc = X86::VMOVAPSYrr;
4340 else {
4341 // If this is an extended register and we don't have VLX, we need to use a
4342 // 512-bit move.
4343 Opc = X86::VMOVAPSZrr;
4345 DestReg =
4346 TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
4347 SrcReg =
4348 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4349 }
4350 } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
4351 Opc = X86::VMOVAPSZrr;
4352 // All KMASK register classes hold the same k registers, so any one of them
4353 // can be used for the containment test.
4354 else if (X86::VK16RegClass.contains(DestReg, SrcReg))
4355 Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
4356 : (HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk);
4357 if (!Opc)
4358 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
4359
4360 if (Opc) {
4361 BuildMI(MBB, MI, DL, get(Opc), DestReg)
4362 .addReg(SrcReg, getKillRegState(KillSrc));
4363 return;
4364 }
4365
4366 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
4367 // FIXME: We use a fatal error here because historically LLVM has tried
4368 // lower some of these physreg copies and we want to ensure we get
4369 // reasonable bug reports if someone encounters a case no other testing
4370 // found. This path should be removed after the LLVM 7 release.
4371 report_fatal_error("Unable to copy EFLAGS physical register!");
4372 }
4373
4374 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
4375 << RI.getName(DestReg) << '\n');
4376 report_fatal_error("Cannot emit physreg copy instruction");
4377}
4378
4379std::optional<DestSourcePair>
4381 if (MI.isMoveReg()) {
4382 // FIXME: Dirty hack for an apparent invariant that doesn't hold when
4383 // subreg_to_reg is coalesced with ordinary copies, such that the bits that
4384 // were asserted as 0 are now undef.
4385 if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg())
4386 return std::nullopt;
4387
4388 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
4389 }
4390 return std::nullopt;
4391}
4392
4393static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
4394 if (STI.hasFP16())
4395 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
4396 if (Load)
4397 return STI.hasAVX512() ? X86::VMOVSSZrm
4398 : STI.hasAVX() ? X86::VMOVSSrm
4399 : X86::MOVSSrm;
4400 else
4401 return STI.hasAVX512() ? X86::VMOVSSZmr
4402 : STI.hasAVX() ? X86::VMOVSSmr
4403 : X86::MOVSSmr;
4404}
4405
4407 const TargetRegisterClass *RC,
4408 bool IsStackAligned,
4409 const X86Subtarget &STI, bool Load) {
4410 bool HasAVX = STI.hasAVX();
4411 bool HasAVX512 = STI.hasAVX512();
4412 bool HasVLX = STI.hasVLX();
4413 bool HasEGPR = STI.hasEGPR();
4414
4415 assert(RC != nullptr && "Invalid target register class");
4416 switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
4417 default:
4418 llvm_unreachable("Unknown spill size");
4419 case 1:
4420 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
4421 if (STI.is64Bit())
4422 // Copying to or from a physical H register on x86-64 requires a NOREX
4423 // move. Otherwise use a normal move.
4424 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
4425 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
4426 return Load ? X86::MOV8rm : X86::MOV8mr;
4427 case 2:
4428 if (X86::VK16RegClass.hasSubClassEq(RC))
4429 return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm)
4430 : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
4431 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
4432 return Load ? X86::MOV16rm : X86::MOV16mr;
4433 case 4:
4434 if (X86::GR32RegClass.hasSubClassEq(RC))
4435 return Load ? X86::MOV32rm : X86::MOV32mr;
4436 if (X86::FR32XRegClass.hasSubClassEq(RC))
4437 return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
4438 : HasAVX ? X86::VMOVSSrm_alt
4439 : X86::MOVSSrm_alt)
4440 : (HasAVX512 ? X86::VMOVSSZmr
4441 : HasAVX ? X86::VMOVSSmr
4442 : X86::MOVSSmr);
4443 if (X86::RFP32RegClass.hasSubClassEq(RC))
4444 return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
4445 if (X86::VK32RegClass.hasSubClassEq(RC)) {
4446 assert(STI.hasBWI() && "KMOVD requires BWI");
4447 return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm)
4448 : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
4449 }
4450 // All of these mask pair classes have the same spill size, so the same kind
4451 // of kmov instructions can be used with all of them.
4452 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
4453 X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
4454 X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
4455 X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
4456 X86::VK16PAIRRegClass.hasSubClassEq(RC))
4457 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
4458 if (X86::FR16RegClass.hasSubClassEq(RC) ||
4459 X86::FR16XRegClass.hasSubClassEq(RC))
4460 return getLoadStoreOpcodeForFP16(Load, STI);
4461 llvm_unreachable("Unknown 4-byte regclass");
4462 case 8:
4463 if (X86::GR64RegClass.hasSubClassEq(RC))
4464 return Load ? X86::MOV64rm : X86::MOV64mr;
4465 if (X86::FR64XRegClass.hasSubClassEq(RC))
4466 return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
4467 : HasAVX ? X86::VMOVSDrm_alt
4468 : X86::MOVSDrm_alt)
4469 : (HasAVX512 ? X86::VMOVSDZmr
4470 : HasAVX ? X86::VMOVSDmr
4471 : X86::MOVSDmr);
4472 if (X86::VR64RegClass.hasSubClassEq(RC))
4473 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
4474 if (X86::RFP64RegClass.hasSubClassEq(RC))
4475 return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
4476 if (X86::VK64RegClass.hasSubClassEq(RC)) {
4477 assert(STI.hasBWI() && "KMOVQ requires BWI");
4478 return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm)
4479 : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
4480 }
4481 llvm_unreachable("Unknown 8-byte regclass");
4482 case 10:
4483 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
4484 return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
4485 case 16: {
4486 if (X86::VR128XRegClass.hasSubClassEq(RC)) {
4487 // If the stack is realigned, we can use aligned stores.
4488 if (IsStackAligned)
4489 return Load ? (HasVLX ? X86::VMOVAPSZ128rm
4490 : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
4491 : HasAVX ? X86::VMOVAPSrm
4492 : X86::MOVAPSrm)
4493 : (HasVLX ? X86::VMOVAPSZ128mr
4494 : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
4495 : HasAVX ? X86::VMOVAPSmr
4496 : X86::MOVAPSmr);
4497 else
4498 return Load ? (HasVLX ? X86::VMOVUPSZ128rm
4499 : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
4500 : HasAVX ? X86::VMOVUPSrm
4501 : X86::MOVUPSrm)
4502 : (HasVLX ? X86::VMOVUPSZ128mr
4503 : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
4504 : HasAVX ? X86::VMOVUPSmr
4505 : X86::MOVUPSmr);
4506 }
4507 llvm_unreachable("Unknown 16-byte regclass");
4508 }
4509 case 32:
4510 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
4511 // If the stack is realigned, we can use aligned stores.
4512 if (IsStackAligned)
4513 return Load ? (HasVLX ? X86::VMOVAPSZ256rm
4514 : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
4515 : X86::VMOVAPSYrm)
4516 : (HasVLX ? X86::VMOVAPSZ256mr
4517 : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
4518 : X86::VMOVAPSYmr);
4519 else
4520 return Load ? (HasVLX ? X86::VMOVUPSZ256rm
4521 : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
4522 : X86::VMOVUPSYrm)
4523 : (HasVLX ? X86::VMOVUPSZ256mr
4524 : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
4525 : X86::VMOVUPSYmr);
4526 case 64:
4527 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
4528 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
4529 if (IsStackAligned)
4530 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
4531 else
4532 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
4533 case 1024:
4534 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
4535 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
4536#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
4537 return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
4538 : GET_EGPR_IF_ENABLED(X86::TILESTORED);
4539#undef GET_EGPR_IF_ENABLED
4540 }
4541}
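// Illustrative example (not part of the upstream file): spilling a VR256X
// register on an AVX-512 target with VLX and an aligned slot selects
// VMOVAPSZ256mr above, while the same spill without VLX selects the
// VMOVAPSZ256mr_NOVLX pseudo, which is later expanded through a 512-bit
// register.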
4542
4543std::optional<ExtAddrMode>
4545 const TargetRegisterInfo *TRI) const {
4546 const MCInstrDesc &Desc = MemI.getDesc();
4547 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4548 if (MemRefBegin < 0)
4549 return std::nullopt;
4550
4551 MemRefBegin += X86II::getOperandBias(Desc);
4552
4553 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
4554 if (!BaseOp.isReg()) // Can be an MO_FrameIndex
4555 return std::nullopt;
4556
4557 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
4558 // Displacement can be symbolic
4559 if (!DispMO.isImm())
4560 return std::nullopt;
4561
4562 ExtAddrMode AM;
4563 AM.BaseReg = BaseOp.getReg();
4564 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
4565 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
4566 AM.Displacement = DispMO.getImm();
4567 return AM;
4568}
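// Usage sketch (illustrative only, not upstream code): for a load such as
//   %r:gr64 = MOV64rm %base, 1, $noreg, 16, $noreg
// the returned ExtAddrMode has BaseReg = %base, ScaledReg = $noreg, Scale = 1
// and Displacement = 16.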
4569
4571 StringRef &ErrInfo) const {
4572 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
4573 if (!AMOrNone)
4574 return true;
4575
4576 ExtAddrMode AM = *AMOrNone;
4578 if (AM.ScaledReg != X86::NoRegister) {
4579 switch (AM.Scale) {
4580 case 1:
4581 case 2:
4582 case 4:
4583 case 8:
4584 break;
4585 default:
4586 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
4587 return false;
4588 }
4589 }
4590 if (!isInt<32>(AM.Displacement)) {
4591 ErrInfo = "Displacement in address must fit into 32-bit signed "
4592 "integer";
4593 return false;
4594 }
4595
4596 return true;
4597}
4598
4600 const Register Reg,
4601 int64_t &ImmVal) const {
4602 Register MovReg = Reg;
4603 const MachineInstr *MovMI = &MI;
4604
4605 // Follow use-def for SUBREG_TO_REG to find the real move immediate
4606 // instruction. This is quite common on x86-64.
4607 if (MI.isSubregToReg()) {
4608 // We use the following pattern to set up a 64-bit immediate.
4609 // %8:gr32 = MOV32r0 implicit-def dead $eflags
4610 // %6:gr64 = SUBREG_TO_REG 0, killed %8:gr32, %subreg.sub_32bit
4611 if (!MI.getOperand(1).isImm())
4612 return false;
4613 unsigned FillBits = MI.getOperand(1).getImm();
4614 unsigned SubIdx = MI.getOperand(3).getImm();
4615 MovReg = MI.getOperand(2).getReg();
4616 if (SubIdx != X86::sub_32bit || FillBits != 0)
4617 return false;
4618 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4619 MovMI = MRI.getUniqueVRegDef(MovReg);
4620 if (!MovMI)
4621 return false;
4622 }
4623
4624 if (MovMI->getOpcode() == X86::MOV32r0 &&
4625 MovMI->getOperand(0).getReg() == MovReg) {
4626 ImmVal = 0;
4627 return true;
4628 }
4629
4630 if (MovMI->getOpcode() != X86::MOV32ri &&
4631 MovMI->getOpcode() != X86::MOV64ri &&
4632 MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri)
4633 return false;
4634 // The MOV source operand can be a global address rather than an immediate.
4635 if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
4636 return false;
4637 ImmVal = MovMI->getOperand(1).getImm();
4638 return true;
4639}
4640
4642 const MachineInstr *MI, const Register NullValueReg,
4643 const TargetRegisterInfo *TRI) const {
4644 if (!MI->modifiesRegister(NullValueReg, TRI))
4645 return true;
4646 switch (MI->getOpcode()) {
4647 // Shifting a null value right/left by any amount leaves it null, e.g.
4648 // rax = shl rax, X.
4649 case X86::SHR64ri:
4650 case X86::SHR32ri:
4651 case X86::SHL64ri:
4652 case X86::SHL32ri:
4653 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
4654 "expected for shift opcode!");
4655 return MI->getOperand(0).getReg() == NullValueReg &&
4656 MI->getOperand(1).getReg() == NullValueReg;
4657 // Zero extend of a sub-reg of NullValueReg into itself does not change the
4658 // null value.
4659 case X86::MOV32rr:
4660 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
4661 return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
4662 });
4663 default:
4664 return false;
4665 }
4666 llvm_unreachable("Should be handled above!");
4667}
4668
4671 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
4672 const TargetRegisterInfo *TRI) const {
4673 const MCInstrDesc &Desc = MemOp.getDesc();
4674 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4675 if (MemRefBegin < 0)
4676 return false;
4677
4678 MemRefBegin += X86II::getOperandBias(Desc);
4679
4680 const MachineOperand *BaseOp =
4681 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
4682 if (!BaseOp->isReg()) // Can be an MO_FrameIndex
4683 return false;
4684
4685 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
4686 return false;
4687
4688 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
4689 X86::NoRegister)
4690 return false;
4691
4692 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
4693
4694 // Displacement can be symbolic
4695 if (!DispMO.isImm())
4696 return false;
4697
4698 Offset = DispMO.getImm();
4699
4700 if (!BaseOp->isReg())
4701 return false;
4702
4703 OffsetIsScalable = false;
4704 // FIXME: Relying on memoperands() may not be the right thing to do here. Check
4705 // with the X86 maintainers, and fix it accordingly. For now, it is OK, since
4706 // there is no use of `Width` in the X86 back-end at the moment.
4707 Width =
4708 !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
4709 BaseOps.push_back(BaseOp);
4710 return true;
4711}
4712
4713static unsigned getStoreRegOpcode(Register SrcReg,
4714 const TargetRegisterClass *RC,
4715 bool IsStackAligned,
4716 const X86Subtarget &STI) {
4717 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
4718}
4719
4720static unsigned getLoadRegOpcode(Register DestReg,
4721 const TargetRegisterClass *RC,
4722 bool IsStackAligned, const X86Subtarget &STI) {
4723 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
4724}
4725
4726static bool isAMXOpcode(unsigned Opc) {
4727 switch (Opc) {
4728 default:
4729 return false;
4730 case X86::TILELOADD:
4731 case X86::TILESTORED:
4732 case X86::TILELOADD_EVEX:
4733 case X86::TILESTORED_EVEX:
4734 return true;
4735 }
4736}
4737
4740 unsigned Opc, Register Reg, int FrameIdx,
4741 bool isKill) const {
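  // Descriptive note (not from the upstream file): AMX tile spills and reloads
  // need a stride held in a GR64 register, so a fresh virtual register is
  // materialized with `MOV64ri 64` (the 64-byte row pitch of the tile spill
  // slot) and patched in as the index operand of the frame reference built
  // below.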
4742 switch (Opc) {
4743 default:
4744 llvm_unreachable("Unexpected special opcode!");
4745 case X86::TILESTORED:
4746 case X86::TILESTORED_EVEX: {
4747 // tilestored %tmm, (%sp, %idx)
4749 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4750 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4751 MachineInstr *NewMI =
4752 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4753 .addReg(Reg, getKillRegState(isKill));
4755 MO.setReg(VirtReg);
4756 MO.setIsKill(true);
4757 break;
4758 }
4759 case X86::TILELOADD:
4760 case X86::TILELOADD_EVEX: {
4761 // tileloadd (%sp, %idx), %tmm
4763 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4764 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4766 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
4768 MO.setReg(VirtReg);
4769 MO.setIsKill(true);
4770 break;
4771 }
4772 }
4773}
4774
4777 bool isKill, int FrameIdx, const TargetRegisterClass *RC,
4778 const TargetRegisterInfo *TRI, Register VReg) const {
4779 const MachineFunction &MF = *MBB.getParent();
4780 const MachineFrameInfo &MFI = MF.getFrameInfo();
4781 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4782 "Stack slot too small for store");
4783
4784 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
4785 bool isAligned =
4786 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4787 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4788
4789 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
4790 if (isAMXOpcode(Opc))
4791 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
4792 else
4793 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4794 .addReg(SrcReg, getKillRegState(isKill));
4795}
4796
4799 Register DestReg, int FrameIdx,
4800 const TargetRegisterClass *RC,
4801 const TargetRegisterInfo *TRI,
4802 Register VReg) const {
4803 const MachineFunction &MF = *MBB.getParent();
4804 const MachineFrameInfo &MFI = MF.getFrameInfo();
4805 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4806 "Load size exceeds stack slot");
4807 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
4808 bool isAligned =
4809 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4810 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4811
4812 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
4813 if (isAMXOpcode(Opc))
4814 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
4815 else
4816 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
4817 FrameIdx);
4818}
4819
4821 Register &SrcReg2, int64_t &CmpMask,
4822 int64_t &CmpValue) const {
4823 switch (MI.getOpcode()) {
4824 default:
4825 break;
4826 case X86::CMP64ri32:
4827 case X86::CMP32ri:
4828 case X86::CMP16ri:
4829 case X86::CMP8ri:
4830 SrcReg = MI.getOperand(0).getReg();
4831 SrcReg2 = 0;
4832 if (MI.getOperand(1).isImm()) {
4833 CmpMask = ~0;
4834 CmpValue = MI.getOperand(1).getImm();
4835 } else {
4836 CmpMask = CmpValue = 0;
4837 }
4838 return true;
4839 // A SUB can be used to perform a comparison.
4840 CASE_ND(SUB64rm)
4841 CASE_ND(SUB32rm)
4842 CASE_ND(SUB16rm)
4843 CASE_ND(SUB8rm)
4844 SrcReg = MI.getOperand(1).getReg();
4845 SrcReg2 = 0;
4846 CmpMask = 0;
4847 CmpValue = 0;
4848 return true;
4849 CASE_ND(SUB64rr)
4850 CASE_ND(SUB32rr)
4851 CASE_ND(SUB16rr)
4852 CASE_ND(SUB8rr)
4853 SrcReg = MI.getOperand(1).getReg();
4854 SrcReg2 = MI.getOperand(2).getReg();
4855 CmpMask = 0;
4856 CmpValue = 0;
4857 return true;
4858 CASE_ND(SUB64ri32)
4859 CASE_ND(SUB32ri)
4860 CASE_ND(SUB16ri)
4861 CASE_ND(SUB8ri)
4862 SrcReg = MI.getOperand(1).getReg();
4863 SrcReg2 = 0;
4864 if (MI.getOperand(2).isImm()) {
4865 CmpMask = ~0;
4866 CmpValue = MI.getOperand(2).getImm();
4867 } else {
4868 CmpMask = CmpValue = 0;
4869 }
4870 return true;
4871 case X86::CMP64rr:
4872 case X86::CMP32rr:
4873 case X86::CMP16rr:
4874 case X86::CMP8rr:
4875 SrcReg = MI.getOperand(0).getReg();
4876 SrcReg2 = MI.getOperand(1).getReg();
4877 CmpMask = 0;
4878 CmpValue = 0;
4879 return true;
4880 case X86::TEST8rr:
4881 case X86::TEST16rr:
4882 case X86::TEST32rr:
4883 case X86::TEST64rr:
4884 SrcReg = MI.getOperand(0).getReg();
4885 if (MI.getOperand(1).getReg() != SrcReg)
4886 return false;
4887 // Compare against zero.
4888 SrcReg2 = 0;
4889 CmpMask = ~0;
4890 CmpValue = 0;
4891 return true;
4892 }
4893 return false;
4894}
4895
4896bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4897 Register SrcReg, Register SrcReg2,
4898 int64_t ImmMask, int64_t ImmValue,
4899 const MachineInstr &OI, bool *IsSwapped,
4900 int64_t *ImmDelta) const {
4901 switch (OI.getOpcode()) {
4902 case X86::CMP64rr:
4903 case X86::CMP32rr:
4904 case X86::CMP16rr:
4905 case X86::CMP8rr:
4906 CASE_ND(SUB64rr)
4907 CASE_ND(SUB32rr)
4908 CASE_ND(SUB16rr)
4909 CASE_ND(SUB8rr) {
4910 Register OISrcReg;
4911 Register OISrcReg2;
4912 int64_t OIMask;
4913 int64_t OIValue;
4914 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4915 OIMask != ImmMask || OIValue != ImmValue)
4916 return false;
4917 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4918 *IsSwapped = false;
4919 return true;
4920 }
4921 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4922 *IsSwapped = true;
4923 return true;
4924 }
4925 return false;
4926 }
4927 case X86::CMP64ri32:
4928 case X86::CMP32ri:
4929 case X86::CMP16ri:
4930 case X86::CMP8ri:
4931 CASE_ND(SUB64ri32)
4932 CASE_ND(SUB32ri)
4933 CASE_ND(SUB16ri)
4934 CASE_ND(SUB8ri)
4935 case X86::TEST64rr:
4936 case X86::TEST32rr:
4937 case X86::TEST16rr:
4938 case X86::TEST8rr: {
4939 if (ImmMask != 0) {
4940 Register OISrcReg;
4941 Register OISrcReg2;
4942 int64_t OIMask;
4943 int64_t OIValue;
4944 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4945 SrcReg == OISrcReg && ImmMask == OIMask) {
4946 if (OIValue == ImmValue) {
4947 *ImmDelta = 0;
4948 return true;
4949 } else if (static_cast<uint64_t>(ImmValue) ==
4950 static_cast<uint64_t>(OIValue) - 1) {
4951 *ImmDelta = -1;
4952 return true;
4953 } else if (static_cast<uint64_t>(ImmValue) ==
4954 static_cast<uint64_t>(OIValue) + 1) {
4955 *ImmDelta = 1;
4956 return true;
4957 } else {
4958 return false;
4959 }
4960 }
4961 }
4962 return FlagI.isIdenticalTo(OI);
4963 }
4964 default:
4965 return false;
4966 }
4967}
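// Illustrative example (not from the upstream file) of the ImmDelta case
// handled above: an earlier `CMP32ri %r, 7` and a later `CMP32ri %r, 8`
// differ by exactly one, so the later compare can be removed and its users
// can reuse the earlier EFLAGS once their condition codes are adjusted
// (a signed `r < 8` check becomes `r <= 7`).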
4968
4969/// Check whether the definition can be converted
4970/// to remove a comparison against zero.
4971inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4972 bool &ClearsOverflowFlag) {
4973 NoSignFlag = false;
4974 ClearsOverflowFlag = false;
4975
4976 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF and
4977 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
4978 // Initial Exec to Local Exec relaxation. In these cases, we must not depend
4979 // on the EFLAGS modification of ADD actually happening in the final binary.
4980 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
4981 unsigned Flags = MI.getOperand(5).getTargetFlags();
4982 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
4983 Flags == X86II::MO_GOTNTPOFF)
4984 return false;
4985 }
4986
4987 switch (MI.getOpcode()) {
4988 default:
4989 return false;
4990
4991 // The shift instructions only modify ZF if their shift count is non-zero.
4992 // N.B.: The processor truncates the shift count depending on the encoding.
4993 CASE_ND(SAR8ri)
4994 CASE_ND(SAR16ri)
4995 CASE_ND(SAR32ri)
4996 CASE_ND(SAR64ri)
4997 CASE_ND(SHR8ri)
4998 CASE_ND(SHR16ri)
4999 CASE_ND(SHR32ri)
5000 CASE_ND(SHR64ri)
5001 return getTruncatedShiftCount(MI, 2) != 0;
5002
5003 // Some left shift instructions can be turned into LEA instructions but only
5004 // if their flags aren't used. Avoid transforming such instructions.
5005 CASE_ND(SHL8ri)
5006 CASE_ND(SHL16ri)
5007 CASE_ND(SHL32ri)
5008 CASE_ND(SHL64ri) {
5009 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
5010 if (isTruncatedShiftCountForLEA(ShAmt))
5011 return false;
5012 return ShAmt != 0;
5013 }
5014
5015 CASE_ND(SHRD16rri8)
5016 CASE_ND(SHRD32rri8)
5017 CASE_ND(SHRD64rri8)
5018 CASE_ND(SHLD16rri8)
5019 CASE_ND(SHLD32rri8)
5020 CASE_ND(SHLD64rri8)
5021 return getTruncatedShiftCount(MI, 3) != 0;
5022
5023 CASE_ND(SUB64ri32)
5024 CASE_ND(SUB32ri)
5025 CASE_ND(SUB16ri)
5026 CASE_ND(SUB8ri)
5027 CASE_ND(SUB64rr)
5028 CASE_ND(SUB32rr)
5029 CASE_ND(SUB16rr)
5030 CASE_ND(SUB8rr)
5031 CASE_ND(SUB64rm)
5032 CASE_ND(SUB32rm)
5033 CASE_ND(SUB16rm)
5034 CASE_ND(SUB8rm)
5035 CASE_ND(DEC64r)
5036 CASE_ND(DEC32r)
5037 CASE_ND(DEC16r)
5038 CASE_ND(DEC8r)
5039 CASE_ND(ADD64ri32)
5040 CASE_ND(ADD32ri)
5041 CASE_ND(ADD16ri)
5042 CASE_ND(ADD8ri)
5043 CASE_ND(ADD64rr)
5044 CASE_ND(ADD32rr)
5045 CASE_ND(ADD16rr)
5046 CASE_ND(ADD8rr)
5047 CASE_ND(ADD64rm)
5048 CASE_ND(ADD32rm)
5049 CASE_ND(ADD16rm)
5050 CASE_ND(ADD8rm)
5051 CASE_ND(INC64r)
5052 CASE_ND(INC32r)
5053 CASE_ND(INC16r)
5054 CASE_ND(INC8r)
5055 CASE_ND(ADC64ri32)
5056 CASE_ND(ADC32ri)
5057 CASE_ND(ADC16ri)
5058 CASE_ND(ADC8ri)
5059 CASE_ND(ADC64rr)
5060 CASE_ND(ADC32rr)
5061 CASE_ND(ADC16rr)
5062 CASE_ND(ADC8rr)
5063 CASE_ND(ADC64rm)
5064 CASE_ND(ADC32rm)
5065 CASE_ND(ADC16rm)
5066 CASE_ND(ADC8rm)
5067 CASE_ND(SBB64ri32)
5068 CASE_ND(SBB32ri)
5069 CASE_ND(SBB16ri)
5070 CASE_ND(SBB8ri)
5071 CASE_ND(SBB64rr)
5072 CASE_ND(SBB32rr)
5073 CASE_ND(SBB16rr)
5074 CASE_ND(SBB8rr)
5075 CASE_ND(SBB64rm)
5076 CASE_ND(SBB32rm)
5077 CASE_ND(SBB16rm)
5078 CASE_ND(SBB8rm)
5079 CASE_ND(NEG8r)
5080 CASE_ND(NEG16r)
5081 CASE_ND(NEG32r)
5082 CASE_ND(NEG64r)
5083 case X86::LZCNT16rr:
5084 case X86::LZCNT16rm:
5085 case X86::LZCNT32rr:
5086 case X86::LZCNT32rm:
5087 case X86::LZCNT64rr:
5088 case X86::LZCNT64rm:
5089 case X86::POPCNT16rr:
5090 case X86::POPCNT16rm:
5091 case X86::POPCNT32rr:
5092 case X86::POPCNT32rm:
5093 case X86::POPCNT64rr:
5094 case X86::POPCNT64rm:
5095 case X86::TZCNT16rr:
5096 case X86::TZCNT16rm:
5097 case X86::TZCNT32rr:
5098 case X86::TZCNT32rm:
5099 case X86::TZCNT64rr:
5100 case X86::TZCNT64rm:
5101 return true;
5102 CASE_ND(AND64ri32)
5103 CASE_ND(AND32ri)
5104 CASE_ND(AND16ri)
5105 CASE_ND(AND8ri)
5106 CASE_ND(AND64rr)
5107 CASE_ND(AND32rr)
5108 CASE_ND(AND16rr)
5109 CASE_ND(AND8rr)
5110 CASE_ND(AND64rm)
5111 CASE_ND(AND32rm)
5112 CASE_ND(AND16rm)
5113 CASE_ND(AND8rm)
5114 CASE_ND(XOR64ri32)
5115 CASE_ND(XOR32ri)
5116 CASE_ND(XOR16ri)
5117 CASE_ND(XOR8ri)
5118 CASE_ND(XOR64rr)
5119 CASE_ND(XOR32rr)
5120 CASE_ND(XOR16rr)
5121 CASE_ND(XOR8rr)
5122 CASE_ND(XOR64rm)
5123 CASE_ND(XOR32rm)
5124 CASE_ND(XOR16rm)
5125 CASE_ND(XOR8rm)
5126 CASE_ND(OR64ri32)
5127 CASE_ND(OR32ri)
5128 CASE_ND(OR16ri)
5129 CASE_ND(OR8ri)
5130 CASE_ND(OR64rr)
5131 CASE_ND(OR32rr)
5132 CASE_ND(OR16rr)
5133 CASE_ND(OR8rr)
5134 CASE_ND(OR64rm)
5135 CASE_ND(OR32rm)
5136 CASE_ND(OR16rm)
5137 CASE_ND(OR8rm)
5138 case X86::ANDN32rr:
5139 case X86::ANDN32rm:
5140 case X86::ANDN64rr:
5141 case X86::ANDN64rm:
5142 case X86::BLSI32rr:
5143 case X86::BLSI32rm:
5144 case X86::BLSI64rr:
5145 case X86::BLSI64rm:
5146 case X86::BLSMSK32rr:
5147 case X86::BLSMSK32rm:
5148 case X86::BLSMSK64rr:
5149 case X86::BLSMSK64rm:
5150 case X86::BLSR32rr:
5151 case X86::BLSR32rm:
5152 case X86::BLSR64rr:
5153 case X86::BLSR64rm:
5154 case X86::BLCFILL32rr:
5155 case X86::BLCFILL32rm:
5156 case X86::BLCFILL64rr:
5157 case X86::BLCFILL64rm:
5158 case X86::BLCI32rr:
5159 case X86::BLCI32rm:
5160 case X86::BLCI64rr:
5161 case X86::BLCI64rm:
5162 case X86::BLCIC32rr:
5163 case X86::BLCIC32rm:
5164 case X86::BLCIC64rr:
5165 case X86::BLCIC64rm:
5166 case X86::BLCMSK32rr:
5167 case X86::BLCMSK32rm:
5168 case X86::BLCMSK64rr:
5169 case X86::BLCMSK64rm:
5170 case X86::BLCS32rr:
5171 case X86::BLCS32rm:
5172 case X86::BLCS64rr:
5173 case X86::BLCS64rm:
5174 case X86::BLSFILL32rr:
5175 case X86::BLSFILL32rm:
5176 case X86::BLSFILL64rr:
5177 case X86::BLSFILL64rm:
5178 case X86::BLSIC32rr:
5179 case X86::BLSIC32rm:
5180 case X86::BLSIC64rr:
5181 case X86::BLSIC64rm:
5182 case X86::BZHI32rr:
5183 case X86::BZHI32rm:
5184 case X86::BZHI64rr:
5185 case X86::BZHI64rm:
5186 case X86::T1MSKC32rr:
5187 case X86::T1MSKC32rm:
5188 case X86::T1MSKC64rr:
5189 case X86::T1MSKC64rm:
5190 case X86::TZMSK32rr:
5191 case X86::TZMSK32rm:
5192 case X86::TZMSK64rr:
5193 case X86::TZMSK64rm:
5194 // These instructions clear the overflow flag just like TEST.
5195 // FIXME: These are not the only instructions in this switch that clear the
5196 // overflow flag.
5197 ClearsOverflowFlag = true;
5198 return true;
5199 case X86::BEXTR32rr:
5200 case X86::BEXTR64rr:
5201 case X86::BEXTR32rm:
5202 case X86::BEXTR64rm:
5203 case X86::BEXTRI32ri:
5204 case X86::BEXTRI32mi:
5205 case X86::BEXTRI64ri:
5206 case X86::BEXTRI64mi:
5207 // BEXTR doesn't update the sign flag so we can't use it. It does clear
5208 // the overflow flag, but that's not useful without the sign flag.
5209 NoSignFlag = true;
5210 return true;
5211 }
5212}
5213
5214/// Check whether the use can be converted to remove a comparison against zero.
5216 switch (MI.getOpcode()) {
5217 default:
5218 return X86::COND_INVALID;
5219 CASE_ND(NEG8r)
5220 CASE_ND(NEG16r)
5221 CASE_ND(NEG32r)
5222 CASE_ND(NEG64r)
5223 return X86::COND_AE;
5224 case X86::LZCNT16rr:
5225 case X86::LZCNT32rr:
5226 case X86::LZCNT64rr:
5227 return X86::COND_B;
5228 case X86::POPCNT16rr:
5229 case X86::POPCNT32rr:
5230 case X86::POPCNT64rr:
5231 return X86::COND_E;
5232 case X86::TZCNT16rr:
5233 case X86::TZCNT32rr:
5234 case X86::TZCNT64rr:
5235 return X86::COND_B;
5236 case X86::BSF16rr:
5237 case X86::BSF32rr:
5238 case X86::BSF64rr:
5239 case X86::BSR16rr:
5240 case X86::BSR32rr:
5241 case X86::BSR64rr:
5242 return X86::COND_E;
5243 case X86::BLSI32rr:
5244 case X86::BLSI64rr:
5245 return X86::COND_AE;
5246 case X86::BLSR32rr:
5247 case X86::BLSR64rr:
5248 case X86::BLSMSK32rr:
5249 case X86::BLSMSK64rr:
5250 return X86::COND_B;
5251 // TODO: TBM instructions.
5252 }
5253}
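// Illustrative example (not from the upstream file) for the table above:
// NEG32r sets CF exactly when its operand was non-zero, so a comparison of
// that operand against zero (`CMP %x, 0` + `JE`) can instead branch on
// COND_AE using the flags already produced by the NEG.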
5254
5255/// Check if there exists an earlier instruction that
5256/// operates on the same source operands and sets flags in the same way as
5257/// Compare; remove Compare if possible.
5259 Register SrcReg2, int64_t CmpMask,
5260 int64_t CmpValue,
5261 const MachineRegisterInfo *MRI) const {
5262 // Check whether we can replace SUB with CMP.
5263 switch (CmpInstr.getOpcode()) {
5264 default:
5265 break;
5266 CASE_ND(SUB64ri32)
5267 CASE_ND(SUB32ri)
5268 CASE_ND(SUB16ri)
5269 CASE_ND(SUB8ri)
5270 CASE_ND(SUB64rm)
5271 CASE_ND(SUB32rm)
5272 CASE_ND(SUB16rm)
5273 CASE_ND(SUB8rm)
5274 CASE_ND(SUB64rr)
5275 CASE_ND(SUB32rr)
5276 CASE_ND(SUB16rr)
5277 CASE_ND(SUB8rr) {
5278 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
5279 return false;
5280 // There is no use of the destination register, so we can replace SUB with CMP.
5281 unsigned NewOpcode = 0;
5282#define FROM_TO(A, B) \
5283 CASE_ND(A) NewOpcode = X86::B; \
5284 break;
5285 switch (CmpInstr.getOpcode()) {
5286 default:
5287 llvm_unreachable("Unreachable!");
5288 FROM_TO(SUB64rm, CMP64rm)
5289 FROM_TO(SUB32rm, CMP32rm)
5290 FROM_TO(SUB16rm, CMP16rm)
5291 FROM_TO(SUB8rm, CMP8rm)
5292 FROM_TO(SUB64rr, CMP64rr)
5293 FROM_TO(SUB32rr, CMP32rr)
5294 FROM_TO(SUB16rr, CMP16rr)
5295 FROM_TO(SUB8rr, CMP8rr)
5296 FROM_TO(SUB64ri32, CMP64ri32)
5297 FROM_TO(SUB32ri, CMP32ri)
5298 FROM_TO(SUB16ri, CMP16ri)
5299 FROM_TO(SUB8ri, CMP8ri)
5300 }
5301#undef FROM_TO
5302 CmpInstr.setDesc(get(NewOpcode));
5303 CmpInstr.removeOperand(0);
5304 // Mutating this instruction invalidates any debug data associated with it.
5305 CmpInstr.dropDebugNumber();
5306 // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
5307 if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
5308 NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
5309 return false;
5310 }
5311 }
5312
5313 // The following code tries to remove the comparison by re-using EFLAGS
5314 // from earlier instructions.
5315
5316 bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
5317
5318 // Transformation currently requires SSA values.
5319 if (SrcReg2.isPhysical())
5320 return false;
5321 MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
5322 assert(SrcRegDef && "Must have a definition (SSA)");
5323
5324 MachineInstr *MI = nullptr;
5325 MachineInstr *Sub = nullptr;
5326 MachineInstr *Movr0Inst = nullptr;
5327 bool NoSignFlag =