1//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the X86 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "X86InstrInfo.h"
14#include "X86.h"
15#include "X86InstrBuilder.h"
16#include "X86InstrFoldTables.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/Sequence.h"
37#include "llvm/IR/Function.h"
38#include "llvm/IR/InstrTypes.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCExpr.h"
41#include "llvm/MC/MCInst.h"
43#include "llvm/Support/Debug.h"
47#include <optional>
48
49using namespace llvm;
50
51#define DEBUG_TYPE "x86-instr-info"
52
53#define GET_INSTRINFO_CTOR_DTOR
54#include "X86GenInstrInfo.inc"
55
56static cl::opt<bool>
57 NoFusing("disable-spill-fusing",
58 cl::desc("Disable fusing of spill code into instructions"),
59 cl::init(false), cl::Hidden);
60static cl::opt<bool>
61 PrintFailedFusing("print-failed-fuse-candidates",
62 cl::desc("Print instructions that the allocator wants to"
63 " fuse, but the X86 backend currently can't"),
64 cl::init(false), cl::Hidden);
65static cl::opt<bool>
66 ReMatPICStubLoad("remat-pic-stub-load",
67 cl::desc("Re-materialize load from stub in PIC mode"),
68 cl::init(false), cl::Hidden);
69static cl::opt<unsigned>
70 PartialRegUpdateClearance("partial-reg-update-clearance",
71 cl::desc("Clearance between two register writes "
72 "for inserting XOR to avoid partial "
73 "register update"),
74 cl::init(64), cl::Hidden);
75static cl::opt<unsigned> UndefRegClearance(
76 "undef-reg-clearance",
77 cl::desc("How many idle instructions we would like before "
78 "certain undef register reads"),
79 cl::init(128), cl::Hidden);
80
81// Pin the vtable to this file.
82void X86InstrInfo::anchor() {}
83
84X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
85 : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
86 : X86::ADJCALLSTACKDOWN32),
87 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
88 : X86::ADJCALLSTACKUP32),
89 X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
90 Subtarget(STI), RI(STI.getTargetTriple()) {}
91
92const TargetRegisterClass *
93X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
94 const TargetRegisterInfo *TRI,
95 const MachineFunction &MF) const {
96 auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI, MF);
97 // If the target does not have EGPR, then r16-r31 will be reserved for all
98 // instructions.
99 if (!RC || !Subtarget.hasEGPR())
100 return RC;
101
102 if (X86II::canUseApxExtendedReg(MCID))
103 return RC;
103 return RC;
104
105 switch (RC->getID()) {
106 default:
107 return RC;
108 case X86::GR8RegClassID:
109 return &X86::GR8_NOREX2RegClass;
110 case X86::GR16RegClassID:
111 return &X86::GR16_NOREX2RegClass;
112 case X86::GR32RegClassID:
113 return &X86::GR32_NOREX2RegClass;
114 case X86::GR64RegClassID:
115 return &X86::GR64_NOREX2RegClass;
116 case X86::GR32_NOSPRegClassID:
117 return &X86::GR32_NOREX2_NOSPRegClass;
118 case X86::GR64_NOSPRegClassID:
119 return &X86::GR64_NOREX2_NOSPRegClass;
120 }
121}
122
123bool X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
124 Register &SrcReg, Register &DstReg,
125 unsigned &SubIdx) const {
126 switch (MI.getOpcode()) {
127 default:
128 break;
129 case X86::MOVSX16rr8:
130 case X86::MOVZX16rr8:
131 case X86::MOVSX32rr8:
132 case X86::MOVZX32rr8:
133 case X86::MOVSX64rr8:
134 if (!Subtarget.is64Bit())
135 // It's not always legal to reference the low 8-bit of the larger
136 // register in 32-bit mode.
137 return false;
138 [[fallthrough]];
139 case X86::MOVSX32rr16:
140 case X86::MOVZX32rr16:
141 case X86::MOVSX64rr16:
142 case X86::MOVSX64rr32: {
143 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
144 // Be conservative.
145 return false;
146 SrcReg = MI.getOperand(1).getReg();
147 DstReg = MI.getOperand(0).getReg();
148 switch (MI.getOpcode()) {
149 default:
150 llvm_unreachable("Unreachable!");
151 case X86::MOVSX16rr8:
152 case X86::MOVZX16rr8:
153 case X86::MOVSX32rr8:
154 case X86::MOVZX32rr8:
155 case X86::MOVSX64rr8:
156 SubIdx = X86::sub_8bit;
157 break;
158 case X86::MOVSX32rr16:
159 case X86::MOVZX32rr16:
160 case X86::MOVSX64rr16:
161 SubIdx = X86::sub_16bit;
162 break;
163 case X86::MOVSX64rr32:
164 SubIdx = X86::sub_32bit;
165 break;
166 }
167 return true;
168 }
169 }
170 return false;
171}
172
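// isDataInvariant: queried by passes such as speculative-load-hardening to
// decide whether an instruction's execution time is independent of the values
// in its register operands (it may still read or write EFLAGS).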
173bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
174 if (MI.mayLoad() || MI.mayStore())
175 return false;
176
177 // Some target-independent operations that trivially lower to data-invariant
178 // instructions.
179 if (MI.isCopyLike() || MI.isInsertSubreg())
180 return true;
181
182 unsigned Opcode = MI.getOpcode();
183 using namespace X86;
184 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
185 // However, they set flags and are perhaps the most surprisingly constant
186 // time operations so we call them out here separately.
187 if (isIMUL(Opcode))
188 return true;
189 // Bit scanning and counting instructions that are somewhat surprisingly
190 // constant time as they scan across bits and do other fairly complex
191 // operations like popcnt, but are believed to be constant time on x86.
192 // However, these set flags.
193 if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
194 isTZCNT(Opcode))
195 return true;
196 // Bit manipulation instructions are effectively combinations of basic
197 // arithmetic ops, and should still execute in constant time. These also
198 // set flags.
199 if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
200 isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
201 isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
202 isTZMSK(Opcode))
203 return true;
204 // Bit extracting and clearing instructions should execute in constant time,
205 // and set flags.
206 if (isBEXTR(Opcode) || isBZHI(Opcode))
207 return true;
208 // Shift and rotate.
209 if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
210 isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
211 return true;
212 // Basic arithmetic is constant time on the input but does set flags.
213 if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
214 isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
215 return true;
216 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
217 if (isANDN(Opcode))
218 return true;
219 // Unary arithmetic operations.
220 if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
221 return true;
222 // Unlike other arithmetic, NOT doesn't set EFLAGS.
223 if (isNOT(Opcode))
224 return true;
225 // Various move instructions used to zero or sign extend things. Note that we
226 // intentionally don't support the _NOREX variants as we can't handle that
227 // register constraint anyways.
228 if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
229 return true;
230 // Arithmetic instructions that are both constant time and don't set flags.
231 if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
232 return true;
233 // LEA doesn't actually access memory, and its arithmetic is constant time.
234 if (isLEA(Opcode))
235 return true;
236 // By default, assume that the instruction is not data invariant.
237 return false;
238}
239
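// isDataInvariantLoad: the same property, but for instructions with a folded
// memory operand; the loaded value must not influence execution time.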
240bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
241 switch (MI.getOpcode()) {
242 default:
243 // By default, assume that the load will immediately leak.
244 return false;
245
246 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
247 // However, they set flags and are perhaps the most surprisingly constant
248 // time operations so we call them out here separately.
249 case X86::IMUL16rm:
250 case X86::IMUL16rmi:
251 case X86::IMUL32rm:
252 case X86::IMUL32rmi:
253 case X86::IMUL64rm:
254 case X86::IMUL64rmi32:
255
256 // Bit scanning and counting instructions that are somewhat surprisingly
257 // constant time as they scan across bits and do other fairly complex
258 // operations like popcnt, but are believed to be constant time on x86.
259 // However, these set flags.
260 case X86::BSF16rm:
261 case X86::BSF32rm:
262 case X86::BSF64rm:
263 case X86::BSR16rm:
264 case X86::BSR32rm:
265 case X86::BSR64rm:
266 case X86::LZCNT16rm:
267 case X86::LZCNT32rm:
268 case X86::LZCNT64rm:
269 case X86::POPCNT16rm:
270 case X86::POPCNT32rm:
271 case X86::POPCNT64rm:
272 case X86::TZCNT16rm:
273 case X86::TZCNT32rm:
274 case X86::TZCNT64rm:
275
276 // Bit manipulation instructions are effectively combinations of basic
277 // arithmetic ops, and should still execute in constant time. These also
278 // set flags.
279 case X86::BLCFILL32rm:
280 case X86::BLCFILL64rm:
281 case X86::BLCI32rm:
282 case X86::BLCI64rm:
283 case X86::BLCIC32rm:
284 case X86::BLCIC64rm:
285 case X86::BLCMSK32rm:
286 case X86::BLCMSK64rm:
287 case X86::BLCS32rm:
288 case X86::BLCS64rm:
289 case X86::BLSFILL32rm:
290 case X86::BLSFILL64rm:
291 case X86::BLSI32rm:
292 case X86::BLSI64rm:
293 case X86::BLSIC32rm:
294 case X86::BLSIC64rm:
295 case X86::BLSMSK32rm:
296 case X86::BLSMSK64rm:
297 case X86::BLSR32rm:
298 case X86::BLSR64rm:
299 case X86::TZMSK32rm:
300 case X86::TZMSK64rm:
301
302 // Bit extracting and clearing instructions should execute in constant time,
303 // and set flags.
304 case X86::BEXTR32rm:
305 case X86::BEXTR64rm:
306 case X86::BEXTRI32mi:
307 case X86::BEXTRI64mi:
308 case X86::BZHI32rm:
309 case X86::BZHI64rm:
310
311 // Basic arithmetic is constant time on the input but does set flags.
312 case X86::ADC8rm:
313 case X86::ADC16rm:
314 case X86::ADC32rm:
315 case X86::ADC64rm:
316 case X86::ADD8rm:
317 case X86::ADD16rm:
318 case X86::ADD32rm:
319 case X86::ADD64rm:
320 case X86::AND8rm:
321 case X86::AND16rm:
322 case X86::AND32rm:
323 case X86::AND64rm:
324 case X86::ANDN32rm:
325 case X86::ANDN64rm:
326 case X86::OR8rm:
327 case X86::OR16rm:
328 case X86::OR32rm:
329 case X86::OR64rm:
330 case X86::SBB8rm:
331 case X86::SBB16rm:
332 case X86::SBB32rm:
333 case X86::SBB64rm:
334 case X86::SUB8rm:
335 case X86::SUB16rm:
336 case X86::SUB32rm:
337 case X86::SUB64rm:
338 case X86::XOR8rm:
339 case X86::XOR16rm:
340 case X86::XOR32rm:
341 case X86::XOR64rm:
342
343 // Integer multiply w/o affecting flags is still believed to be constant
344 // time on x86. Called out separately as this is among the most surprising
345 // instructions to exhibit that behavior.
346 case X86::MULX32rm:
347 case X86::MULX64rm:
348
349 // Arithmetic instructions that are both constant time and don't set flags.
350 case X86::RORX32mi:
351 case X86::RORX64mi:
352 case X86::SARX32rm:
353 case X86::SARX64rm:
354 case X86::SHLX32rm:
355 case X86::SHLX64rm:
356 case X86::SHRX32rm:
357 case X86::SHRX64rm:
358
359 // Conversions are believed to be constant time and don't set flags.
360 case X86::CVTTSD2SI64rm:
361 case X86::VCVTTSD2SI64rm:
362 case X86::VCVTTSD2SI64Zrm:
363 case X86::CVTTSD2SIrm:
364 case X86::VCVTTSD2SIrm:
365 case X86::VCVTTSD2SIZrm:
366 case X86::CVTTSS2SI64rm:
367 case X86::VCVTTSS2SI64rm:
368 case X86::VCVTTSS2SI64Zrm:
369 case X86::CVTTSS2SIrm:
370 case X86::VCVTTSS2SIrm:
371 case X86::VCVTTSS2SIZrm:
372 case X86::CVTSI2SDrm:
373 case X86::VCVTSI2SDrm:
374 case X86::VCVTSI2SDZrm:
375 case X86::CVTSI2SSrm:
376 case X86::VCVTSI2SSrm:
377 case X86::VCVTSI2SSZrm:
378 case X86::CVTSI642SDrm:
379 case X86::VCVTSI642SDrm:
380 case X86::VCVTSI642SDZrm:
381 case X86::CVTSI642SSrm:
382 case X86::VCVTSI642SSrm:
383 case X86::VCVTSI642SSZrm:
384 case X86::CVTSS2SDrm:
385 case X86::VCVTSS2SDrm:
386 case X86::VCVTSS2SDZrm:
387 case X86::CVTSD2SSrm:
388 case X86::VCVTSD2SSrm:
389 case X86::VCVTSD2SSZrm:
390 // AVX512 added unsigned integer conversions.
391 case X86::VCVTTSD2USI64Zrm:
392 case X86::VCVTTSD2USIZrm:
393 case X86::VCVTTSS2USI64Zrm:
394 case X86::VCVTTSS2USIZrm:
395 case X86::VCVTUSI2SDZrm:
396 case X86::VCVTUSI642SDZrm:
397 case X86::VCVTUSI2SSZrm:
398 case X86::VCVTUSI642SSZrm:
399
400 // Loads to register don't set flags.
401 case X86::MOV8rm:
402 case X86::MOV8rm_NOREX:
403 case X86::MOV16rm:
404 case X86::MOV32rm:
405 case X86::MOV64rm:
406 case X86::MOVSX16rm8:
407 case X86::MOVSX32rm16:
408 case X86::MOVSX32rm8:
409 case X86::MOVSX32rm8_NOREX:
410 case X86::MOVSX64rm16:
411 case X86::MOVSX64rm32:
412 case X86::MOVSX64rm8:
413 case X86::MOVZX16rm8:
414 case X86::MOVZX32rm16:
415 case X86::MOVZX32rm8:
416 case X86::MOVZX32rm8_NOREX:
417 case X86::MOVZX64rm16:
418 case X86::MOVZX64rm8:
419 return true;
420 }
421}
422
423int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
424 const MachineFunction *MF = MI.getParent()->getParent();
425 const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
426
427 if (isFrameInstr(MI)) {
428 int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
429 SPAdj -= getFrameAdjustment(MI);
430 if (!isFrameSetup(MI))
431 SPAdj = -SPAdj;
432 return SPAdj;
433 }
434
435 // To know whether a call adjusts the stack, we need information
436 // that is bound to the following ADJCALLSTACKUP pseudo.
437 // Look for the next ADJCALLSTACKUP that follows the call.
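// A lowered call sequence looks like:
//   ADJCALLSTACKDOWN <FrameSize>, ...
//   CALL ...
//   ADJCALLSTACKUP <FrameSize>, <CalleePoppedBytes>
// so the number of bytes popped by the callee is the second immediate
// (operand 1) of the frame-destroy pseudo.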
438 if (MI.isCall()) {
439 const MachineBasicBlock *MBB = MI.getParent();
440 auto I = ++MachineBasicBlock::const_iterator(MI);
441 for (auto E = MBB->end(); I != E; ++I) {
442 if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
443 break;
444 }
445
446 // If we could not find a frame destroy opcode, then it has already
447 // been simplified, so we don't care.
448 if (I->getOpcode() != getCallFrameDestroyOpcode())
449 return 0;
450
451 return -(I->getOperand(1).getImm());
452 }
453
454 // Currently handle only PUSHes we can reasonably expect to see
455 // in call sequences
456 switch (MI.getOpcode()) {
457 default:
458 return 0;
459 case X86::PUSH32r:
460 case X86::PUSH32rmm:
461 case X86::PUSH32rmr:
462 case X86::PUSH32i:
463 return 4;
464 case X86::PUSH64r:
465 case X86::PUSH64rmm:
466 case X86::PUSH64rmr:
467 case X86::PUSH64i32:
468 return 8;
469 }
470}
471
472/// Return true and the FrameIndex if the specified
473/// operand and the operands following it form a reference to the stack frame.
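/// An x86 memory reference is described by five consecutive operands: base
/// register, scale, index register, displacement and segment
/// (X86::AddrBaseReg .. X86::AddrSegmentReg); the checks below only accept the
/// simple "fi#N + 0" form.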
474bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
475 int &FrameIndex) const {
476 if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
477 MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
478 MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
479 MI.getOperand(Op + X86::AddrDisp).isImm() &&
480 MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
481 MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
482 MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
483 FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
484 return true;
485 }
486 return false;
487}
488
489static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
490 switch (Opcode) {
491 default:
492 return false;
493 case X86::MOV8rm:
494 case X86::KMOVBkm:
495 case X86::KMOVBkm_EVEX:
496 MemBytes = 1;
497 return true;
498 case X86::MOV16rm:
499 case X86::KMOVWkm:
500 case X86::KMOVWkm_EVEX:
501 case X86::VMOVSHZrm:
502 case X86::VMOVSHZrm_alt:
503 MemBytes = 2;
504 return true;
505 case X86::MOV32rm:
506 case X86::MOVSSrm:
507 case X86::MOVSSrm_alt:
508 case X86::VMOVSSrm:
509 case X86::VMOVSSrm_alt:
510 case X86::VMOVSSZrm:
511 case X86::VMOVSSZrm_alt:
512 case X86::KMOVDkm:
513 case X86::KMOVDkm_EVEX:
514 MemBytes = 4;
515 return true;
516 case X86::MOV64rm:
517 case X86::LD_Fp64m:
518 case X86::MOVSDrm:
519 case X86::MOVSDrm_alt:
520 case X86::VMOVSDrm:
521 case X86::VMOVSDrm_alt:
522 case X86::VMOVSDZrm:
523 case X86::VMOVSDZrm_alt:
524 case X86::MMX_MOVD64rm:
525 case X86::MMX_MOVQ64rm:
526 case X86::KMOVQkm:
527 case X86::KMOVQkm_EVEX:
528 MemBytes = 8;
529 return true;
530 case X86::MOVAPSrm:
531 case X86::MOVUPSrm:
532 case X86::MOVAPDrm:
533 case X86::MOVUPDrm:
534 case X86::MOVDQArm:
535 case X86::MOVDQUrm:
536 case X86::VMOVAPSrm:
537 case X86::VMOVUPSrm:
538 case X86::VMOVAPDrm:
539 case X86::VMOVUPDrm:
540 case X86::VMOVDQArm:
541 case X86::VMOVDQUrm:
542 case X86::VMOVAPSZ128rm:
543 case X86::VMOVUPSZ128rm:
544 case X86::VMOVAPSZ128rm_NOVLX:
545 case X86::VMOVUPSZ128rm_NOVLX:
546 case X86::VMOVAPDZ128rm:
547 case X86::VMOVUPDZ128rm:
548 case X86::VMOVDQU8Z128rm:
549 case X86::VMOVDQU16Z128rm:
550 case X86::VMOVDQA32Z128rm:
551 case X86::VMOVDQU32Z128rm:
552 case X86::VMOVDQA64Z128rm:
553 case X86::VMOVDQU64Z128rm:
554 MemBytes = 16;
555 return true;
556 case X86::VMOVAPSYrm:
557 case X86::VMOVUPSYrm:
558 case X86::VMOVAPDYrm:
559 case X86::VMOVUPDYrm:
560 case X86::VMOVDQAYrm:
561 case X86::VMOVDQUYrm:
562 case X86::VMOVAPSZ256rm:
563 case X86::VMOVUPSZ256rm:
564 case X86::VMOVAPSZ256rm_NOVLX:
565 case X86::VMOVUPSZ256rm_NOVLX:
566 case X86::VMOVAPDZ256rm:
567 case X86::VMOVUPDZ256rm:
568 case X86::VMOVDQU8Z256rm:
569 case X86::VMOVDQU16Z256rm:
570 case X86::VMOVDQA32Z256rm:
571 case X86::VMOVDQU32Z256rm:
572 case X86::VMOVDQA64Z256rm:
573 case X86::VMOVDQU64Z256rm:
574 MemBytes = 32;
575 return true;
576 case X86::VMOVAPSZrm:
577 case X86::VMOVUPSZrm:
578 case X86::VMOVAPDZrm:
579 case X86::VMOVUPDZrm:
580 case X86::VMOVDQU8Zrm:
581 case X86::VMOVDQU16Zrm:
582 case X86::VMOVDQA32Zrm:
583 case X86::VMOVDQU32Zrm:
584 case X86::VMOVDQA64Zrm:
585 case X86::VMOVDQU64Zrm:
586 MemBytes = 64;
587 return true;
588 }
589}
590
591static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
592 switch (Opcode) {
593 default:
594 return false;
595 case X86::MOV8mr:
596 case X86::KMOVBmk:
597 case X86::KMOVBmk_EVEX:
598 MemBytes = 1;
599 return true;
600 case X86::MOV16mr:
601 case X86::KMOVWmk:
602 case X86::KMOVWmk_EVEX:
603 case X86::VMOVSHZmr:
604 MemBytes = 2;
605 return true;
606 case X86::MOV32mr:
607 case X86::MOVSSmr:
608 case X86::VMOVSSmr:
609 case X86::VMOVSSZmr:
610 case X86::KMOVDmk:
611 case X86::KMOVDmk_EVEX:
612 MemBytes = 4;
613 return true;
614 case X86::MOV64mr:
615 case X86::ST_FpP64m:
616 case X86::MOVSDmr:
617 case X86::VMOVSDmr:
618 case X86::VMOVSDZmr:
619 case X86::MMX_MOVD64mr:
620 case X86::MMX_MOVQ64mr:
621 case X86::MMX_MOVNTQmr:
622 case X86::KMOVQmk:
623 case X86::KMOVQmk_EVEX:
624 MemBytes = 8;
625 return true;
626 case X86::MOVAPSmr:
627 case X86::MOVUPSmr:
628 case X86::MOVAPDmr:
629 case X86::MOVUPDmr:
630 case X86::MOVDQAmr:
631 case X86::MOVDQUmr:
632 case X86::VMOVAPSmr:
633 case X86::VMOVUPSmr:
634 case X86::VMOVAPDmr:
635 case X86::VMOVUPDmr:
636 case X86::VMOVDQAmr:
637 case X86::VMOVDQUmr:
638 case X86::VMOVUPSZ128mr:
639 case X86::VMOVAPSZ128mr:
640 case X86::VMOVUPSZ128mr_NOVLX:
641 case X86::VMOVAPSZ128mr_NOVLX:
642 case X86::VMOVUPDZ128mr:
643 case X86::VMOVAPDZ128mr:
644 case X86::VMOVDQA32Z128mr:
645 case X86::VMOVDQU32Z128mr:
646 case X86::VMOVDQA64Z128mr:
647 case X86::VMOVDQU64Z128mr:
648 case X86::VMOVDQU8Z128mr:
649 case X86::VMOVDQU16Z128mr:
650 MemBytes = 16;
651 return true;
652 case X86::VMOVUPSYmr:
653 case X86::VMOVAPSYmr:
654 case X86::VMOVUPDYmr:
655 case X86::VMOVAPDYmr:
656 case X86::VMOVDQUYmr:
657 case X86::VMOVDQAYmr:
658 case X86::VMOVUPSZ256mr:
659 case X86::VMOVAPSZ256mr:
660 case X86::VMOVUPSZ256mr_NOVLX:
661 case X86::VMOVAPSZ256mr_NOVLX:
662 case X86::VMOVUPDZ256mr:
663 case X86::VMOVAPDZ256mr:
664 case X86::VMOVDQU8Z256mr:
665 case X86::VMOVDQU16Z256mr:
666 case X86::VMOVDQA32Z256mr:
667 case X86::VMOVDQU32Z256mr:
668 case X86::VMOVDQA64Z256mr:
669 case X86::VMOVDQU64Z256mr:
670 MemBytes = 32;
671 return true;
672 case X86::VMOVUPSZmr:
673 case X86::VMOVAPSZmr:
674 case X86::VMOVUPDZmr:
675 case X86::VMOVAPDZmr:
676 case X86::VMOVDQU8Zmr:
677 case X86::VMOVDQU16Zmr:
678 case X86::VMOVDQA32Zmr:
679 case X86::VMOVDQU32Zmr:
680 case X86::VMOVDQA64Zmr:
681 case X86::VMOVDQU64Zmr:
682 MemBytes = 64;
683 return true;
684 }
685 return false;
686}
687
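// The stack-slot helpers below return the spilled/reloaded register when MI is
// a plain load from or store to a frame index and 0 otherwise; the *PostFE
// variants also recognize instructions whose frame index has already been
// eliminated, by inspecting their memory operands.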
688Register X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
689 int &FrameIndex) const {
690 unsigned Dummy;
691 return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
692}
693
694Register X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
695 int &FrameIndex,
696 unsigned &MemBytes) const {
697 if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
698 if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
699 return MI.getOperand(0).getReg();
700 return 0;
701}
702
703Register X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
704 int &FrameIndex) const {
705 unsigned Dummy;
706 if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
707 unsigned Reg;
708 if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
709 return Reg;
710 // Check for post-frame index elimination operations.
711 SmallVector<const MachineMemOperand *, 1> Accesses;
712 if (hasLoadFromStackSlot(MI, Accesses)) {
713 FrameIndex =
714 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
715 ->getFrameIndex();
716 return MI.getOperand(0).getReg();
717 }
718 }
719 return 0;
720}
721
722Register X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
723 int &FrameIndex) const {
724 unsigned Dummy;
725 return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
726}
727
728Register X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
729 int &FrameIndex,
730 unsigned &MemBytes) const {
731 if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
732 if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
733 isFrameOperand(MI, 0, FrameIndex))
734 return MI.getOperand(X86::AddrNumOperands).getReg();
735 return 0;
736}
737
738Register X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
739 int &FrameIndex) const {
740 unsigned Dummy;
741 if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
742 unsigned Reg;
743 if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
744 return Reg;
745 // Check for post-frame index elimination operations.
746 SmallVector<const MachineMemOperand *, 1> Accesses;
747 if (hasStoreToStackSlot(MI, Accesses)) {
748 FrameIndex =
749 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
750 ->getFrameIndex();
751 return MI.getOperand(X86::AddrNumOperands).getReg();
752 }
753 }
754 return 0;
755}
756
757/// Return true if the register is a PIC base, i.e. it is defined by X86::MOVPC32r.
758static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
759 // Don't waste compile time scanning use-def chains of physregs.
760 if (!BaseReg.isVirtual())
761 return false;
762 bool isPICBase = false;
763 for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) {
764 if (DefMI.getOpcode() != X86::MOVPC32r)
765 return false;
766 assert(!isPICBase && "More than one PIC base?");
767 isPICBase = true;
768 }
769 return isPICBase;
770}
771
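// An instruction is trivially rematerializable when re-executing it at the
// point of use cannot observe any state that may have changed since the
// original definition, e.g. constant materializations and invariant loads.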
772bool X86InstrInfo::isReallyTriviallyReMaterializable(
773 const MachineInstr &MI) const {
774 switch (MI.getOpcode()) {
775 default:
776 // This function should only be called for opcodes with the ReMaterializable
777 // flag set.
778 llvm_unreachable("Unknown rematerializable operation!");
779 break;
780 case X86::IMPLICIT_DEF:
781 // Defer to generic logic.
782 break;
783 case X86::LOAD_STACK_GUARD:
784 case X86::LD_Fp032:
785 case X86::LD_Fp064:
786 case X86::LD_Fp080:
787 case X86::LD_Fp132:
788 case X86::LD_Fp164:
789 case X86::LD_Fp180:
790 case X86::AVX1_SETALLONES:
791 case X86::AVX2_SETALLONES:
792 case X86::AVX512_128_SET0:
793 case X86::AVX512_256_SET0:
794 case X86::AVX512_512_SET0:
795 case X86::AVX512_512_SETALLONES:
796 case X86::AVX512_FsFLD0SD:
797 case X86::AVX512_FsFLD0SH:
798 case X86::AVX512_FsFLD0SS:
799 case X86::AVX512_FsFLD0F128:
800 case X86::AVX_SET0:
801 case X86::FsFLD0SD:
802 case X86::FsFLD0SS:
803 case X86::FsFLD0SH:
804 case X86::FsFLD0F128:
805 case X86::KSET0D:
806 case X86::KSET0Q:
807 case X86::KSET0W:
808 case X86::KSET1D:
809 case X86::KSET1Q:
810 case X86::KSET1W:
811 case X86::MMX_SET0:
812 case X86::MOV32ImmSExti8:
813 case X86::MOV32r0:
814 case X86::MOV32r1:
815 case X86::MOV32r_1:
816 case X86::MOV32ri64:
817 case X86::MOV64ImmSExti8:
818 case X86::V_SET0:
819 case X86::V_SETALLONES:
820 case X86::MOV16ri:
821 case X86::MOV32ri:
822 case X86::MOV64ri:
823 case X86::MOV64ri32:
824 case X86::MOV8ri:
825 case X86::PTILEZEROV:
826 return true;
827
828 case X86::MOV8rm:
829 case X86::MOV8rm_NOREX:
830 case X86::MOV16rm:
831 case X86::MOV32rm:
832 case X86::MOV64rm:
833 case X86::MOVSSrm:
834 case X86::MOVSSrm_alt:
835 case X86::MOVSDrm:
836 case X86::MOVSDrm_alt:
837 case X86::MOVAPSrm:
838 case X86::MOVUPSrm:
839 case X86::MOVAPDrm:
840 case X86::MOVUPDrm:
841 case X86::MOVDQArm:
842 case X86::MOVDQUrm:
843 case X86::VMOVSSrm:
844 case X86::VMOVSSrm_alt:
845 case X86::VMOVSDrm:
846 case X86::VMOVSDrm_alt:
847 case X86::VMOVAPSrm:
848 case X86::VMOVUPSrm:
849 case X86::VMOVAPDrm:
850 case X86::VMOVUPDrm:
851 case X86::VMOVDQArm:
852 case X86::VMOVDQUrm:
853 case X86::VMOVAPSYrm:
854 case X86::VMOVUPSYrm:
855 case X86::VMOVAPDYrm:
856 case X86::VMOVUPDYrm:
857 case X86::VMOVDQAYrm:
858 case X86::VMOVDQUYrm:
859 case X86::MMX_MOVD64rm:
860 case X86::MMX_MOVQ64rm:
861 case X86::VBROADCASTSSrm:
862 case X86::VBROADCASTSSYrm:
863 case X86::VBROADCASTSDYrm:
864 // AVX-512
865 case X86::VPBROADCASTBZ128rm:
866 case X86::VPBROADCASTBZ256rm:
867 case X86::VPBROADCASTBZrm:
868 case X86::VBROADCASTF32X2Z256rm:
869 case X86::VBROADCASTF32X2Zrm:
870 case X86::VBROADCASTI32X2Z128rm:
871 case X86::VBROADCASTI32X2Z256rm:
872 case X86::VBROADCASTI32X2Zrm:
873 case X86::VPBROADCASTWZ128rm:
874 case X86::VPBROADCASTWZ256rm:
875 case X86::VPBROADCASTWZrm:
876 case X86::VPBROADCASTDZ128rm:
877 case X86::VPBROADCASTDZ256rm:
878 case X86::VPBROADCASTDZrm:
879 case X86::VBROADCASTSSZ128rm:
880 case X86::VBROADCASTSSZ256rm:
881 case X86::VBROADCASTSSZrm:
882 case X86::VPBROADCASTQZ128rm:
883 case X86::VPBROADCASTQZ256rm:
884 case X86::VPBROADCASTQZrm:
885 case X86::VBROADCASTSDZ256rm:
886 case X86::VBROADCASTSDZrm:
887 case X86::VMOVSSZrm:
888 case X86::VMOVSSZrm_alt:
889 case X86::VMOVSDZrm:
890 case X86::VMOVSDZrm_alt:
891 case X86::VMOVSHZrm:
892 case X86::VMOVSHZrm_alt:
893 case X86::VMOVAPDZ128rm:
894 case X86::VMOVAPDZ256rm:
895 case X86::VMOVAPDZrm:
896 case X86::VMOVAPSZ128rm:
897 case X86::VMOVAPSZ256rm:
898 case X86::VMOVAPSZ128rm_NOVLX:
899 case X86::VMOVAPSZ256rm_NOVLX:
900 case X86::VMOVAPSZrm:
901 case X86::VMOVDQA32Z128rm:
902 case X86::VMOVDQA32Z256rm:
903 case X86::VMOVDQA32Zrm:
904 case X86::VMOVDQA64Z128rm:
905 case X86::VMOVDQA64Z256rm:
906 case X86::VMOVDQA64Zrm:
907 case X86::VMOVDQU16Z128rm:
908 case X86::VMOVDQU16Z256rm:
909 case X86::VMOVDQU16Zrm:
910 case X86::VMOVDQU32Z128rm:
911 case X86::VMOVDQU32Z256rm:
912 case X86::VMOVDQU32Zrm:
913 case X86::VMOVDQU64Z128rm:
914 case X86::VMOVDQU64Z256rm:
915 case X86::VMOVDQU64Zrm:
916 case X86::VMOVDQU8Z128rm:
917 case X86::VMOVDQU8Z256rm:
918 case X86::VMOVDQU8Zrm:
919 case X86::VMOVUPDZ128rm:
920 case X86::VMOVUPDZ256rm:
921 case X86::VMOVUPDZrm:
922 case X86::VMOVUPSZ128rm:
923 case X86::VMOVUPSZ256rm:
924 case X86::VMOVUPSZ128rm_NOVLX:
925 case X86::VMOVUPSZ256rm_NOVLX:
926 case X86::VMOVUPSZrm: {
927 // Loads from constant pools are trivially rematerializable.
928 if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
929 MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
930 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
931 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
932 MI.isDereferenceableInvariantLoad()) {
933 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
934 if (BaseReg == 0 || BaseReg == X86::RIP)
935 return true;
936 // Allow re-materialization of PIC load.
937 if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) {
938 const MachineFunction &MF = *MI.getParent()->getParent();
939 const MachineRegisterInfo &MRI = MF.getRegInfo();
940 if (regIsPICBase(BaseReg, MRI))
941 return true;
942 }
943 }
944 break;
945 }
946
947 case X86::LEA32r:
948 case X86::LEA64r: {
949 if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
950 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
951 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
952 !MI.getOperand(1 + X86::AddrDisp).isReg()) {
953 // lea fi#, lea GV, etc. are all rematerializable.
954 if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
955 return true;
956 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
957 if (BaseReg == 0)
958 return true;
959 // Allow re-materialization of lea PICBase + x.
960 const MachineFunction &MF = *MI.getParent()->getParent();
961 const MachineRegisterInfo &MRI = MF.getRegInfo();
962 if (regIsPICBase(BaseReg, MRI))
963 return true;
964 }
965 break;
966 }
967 }
968 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
969}
970
971void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
972 MachineBasicBlock::iterator I,
973 Register DestReg, unsigned SubIdx,
974 const MachineInstr &Orig,
975 const TargetRegisterInfo &TRI) const {
976 bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
977 if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
978 MachineBasicBlock::LQR_Dead) {
979 // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
980 // effects.
981 int Value;
982 switch (Orig.getOpcode()) {
983 case X86::MOV32r0:
984 Value = 0;
985 break;
986 case X86::MOV32r1:
987 Value = 1;
988 break;
989 case X86::MOV32r_1:
990 Value = -1;
991 break;
992 default:
993 llvm_unreachable("Unexpected instruction!");
994 }
995
996 const DebugLoc &DL = Orig.getDebugLoc();
997 BuildMI(MBB, I, DL, get(X86::MOV32ri))
998 .add(Orig.getOperand(0))
999 .addImm(Value);
1000 } else {
1001 MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
1002 MBB.insert(I, MI);
1003 }
1004
1005 MachineInstr &NewMI = *std::prev(I);
1006 NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
1007}
1008
1009/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
1010static bool hasLiveCondCodeDef(MachineInstr &MI) {
1011 for (const MachineOperand &MO : MI.operands()) {
1012 if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
1013 !MO.isDead()) {
1014 return true;
1015 }
1016 }
1017 return false;
1018}
1019
1020/// Return the shift count of a machine operand, truncated as the hardware truncates it.
1021inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1022 unsigned ShiftAmtOperandIdx) {
1023 // The shift count is six bits with the REX.W prefix and five bits without.
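// For example, a 64-bit shift encoded with an immediate of 65 shifts by 1.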
1024 unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1025 unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1026 return Imm & ShiftCountMask;
1027}
1028
1029/// Check whether the given shift count can be represented
1030/// by the scale factor of a LEA instruction.
1031inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
1032 // Left shift instructions can be transformed into load-effective-address
1033 // instructions if we can encode them appropriately.
1034 // A LEA instruction utilizes a SIB byte to encode its scale factor.
1035 // The SIB.scale field is two bits wide which means that we can encode any
1036 // shift amount less than 4.
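// For example, "shl $3, %reg" can become "lea (,%reg,8), %reg", while a shift
// of 4 (scale 16) cannot be encoded.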
1037 return ShAmt < 4 && ShAmt > 0;
1038}
1039
1040static bool findRedundantFlagInstr(MachineInstr &CmpInstr,
1041 MachineInstr &CmpValDefInstr,
1042 const MachineRegisterInfo *MRI,
1043 MachineInstr **AndInstr,
1044 const TargetRegisterInfo *TRI,
1045 bool &NoSignFlag, bool &ClearsOverflowFlag) {
1046 if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG &&
1047 CmpInstr.getOpcode() == X86::TEST64rr) &&
1048 !(CmpValDefInstr.getOpcode() == X86::COPY &&
1049 CmpInstr.getOpcode() == X86::TEST16rr))
1050 return false;
1051
1052 // CmpInstr is a TEST16rr/TEST64rr instruction, and
1053 // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if two
1054 // registers are identical.
1055 assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
1056 "CmpInstr is an analyzable TEST16rr/TEST64rr, and "
1057 "`X86InstrInfo::analyzeCompare` requires two reg operands are the"
1058 "same.");
1059
1060 // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
1061 // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
1062 // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
1063 // redundant.
1064 assert(
1065 (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
1066 "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr "
1067 "is a user of COPY sub16bit.");
1068 MachineInstr *VregDefInstr = nullptr;
1069 if (CmpInstr.getOpcode() == X86::TEST16rr) {
1070 if (!CmpValDefInstr.getOperand(1).getReg().isVirtual())
1071 return false;
1072 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
1073 if (!VregDefInstr)
1074 return false;
1075 // We can only remove the TEST when the AND is an AND32ri or AND64ri32
1076 // whose immediate fits in 16 bits; other 32/64-bit ops would test higher
1077 // bits that TEST16rr does not look at.
1078 if (!((VregDefInstr->getOpcode() == X86::AND32ri ||
1079 VregDefInstr->getOpcode() == X86::AND64ri32) &&
1080 isUInt<16>(VregDefInstr->getOperand(2).getImm())))
1081 return false;
1082 }
1083
1084 if (CmpInstr.getOpcode() == X86::TEST64rr) {
1085 // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is
1086 // typically 0.
1087 if (CmpValDefInstr.getOperand(1).getImm() != 0)
1088 return false;
1089
1090 // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
1091 // sub_32bit or sub_xmm.
1092 if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
1093 return false;
1094
1095 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
1096 }
1097
1098 assert(VregDefInstr && "Must have a definition (SSA)");
1099
1100 // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
1101 // to simplify the subsequent analysis.
1102 //
1103 // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
1104 // `CmpValDefInstr.getParent()`, this could be handled.
1105 if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
1106 return false;
1107
1108 if (X86::isAND(VregDefInstr->getOpcode())) {
1109 // Get a sequence of instructions like
1110 // %reg = and* ... // Set EFLAGS
1111 // ... // EFLAGS not changed
1112 // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
1113 // test64rr %extended_reg, %extended_reg, implicit-def $eflags
1114 // or
1115 // %reg = and32* ...
1116 // ... // EFLAGS not changed.
1117 // %src_reg = copy %reg.sub_16bit:gr32
1118 // test16rr %src_reg, %src_reg, implicit-def $eflags
1119 //
1120 // If subsequent readers use a subset of bits that don't change
1121 // after `and*` instructions, it's likely that the test64rr could
1122 // be optimized away.
1123 for (const MachineInstr &Instr :
1124 make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
1125 MachineBasicBlock::iterator(CmpValDefInstr))) {
1126 // There are instructions between 'VregDefInstr' and
1127 // 'CmpValDefInstr' that modifies EFLAGS.
1128 if (Instr.modifiesRegister(X86::EFLAGS, TRI))
1129 return false;
1130 }
1131
1132 *AndInstr = VregDefInstr;
1133
1134 // AND instruction will essentially update SF and clear OF, so
1135 // NoSignFlag should be false in the sense that SF is modified by `AND`.
1136 //
1137 // However, the implementation artificially sets `NoSignFlag` to true
1138 // to poison the SF bit; that is to say, if SF is looked at later, the
1139 // optimization (to erase TEST64rr) will be disabled.
1140 //
1141 // The reason to poison SF bit is that SF bit value could be different
1142 // in the `AND` and `TEST` operation; signed bit is not known for `AND`,
1143 // and is known to be 0 as a result of `TEST64rr`.
1144 //
1145 // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
1146 // the AND instruction and using the static information to guide peephole
1147 // optimization if possible. For example, it's possible to fold a
1148 // conditional move into a copy if the relevant EFLAG bits could be deduced
1149 // from an immediate operand of and operation.
1150 //
1151 NoSignFlag = true;
1152 // ClearsOverflowFlag is true for AND operation (no surprise).
1153 ClearsOverflowFlag = true;
1154 return true;
1155 }
1156 return false;
1157}
1158
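// classifyLEAReg massages a source operand so that it can feed the given LEA
// opcode: for LEA64_32r it materializes a 64-bit super-register copy of a
// 32-bit virtual register, updating LiveVariables/LiveIntervals as it goes.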
1159bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
1160 unsigned Opc, bool AllowSP, Register &NewSrc,
1161 bool &isKill, MachineOperand &ImplicitOp,
1162 LiveVariables *LV, LiveIntervals *LIS) const {
1163 MachineFunction &MF = *MI.getParent()->getParent();
1164 const TargetRegisterClass *RC;
1165 if (AllowSP) {
1166 RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1167 } else {
1168 RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1169 }
1170 Register SrcReg = Src.getReg();
1171 isKill = MI.killsRegister(SrcReg);
1172
1173 // For both LEA64 and LEA32 the register already has essentially the right
1174 // type (32-bit or 64-bit) we may just need to forbid SP.
1175 if (Opc != X86::LEA64_32r) {
1176 NewSrc = SrcReg;
1177 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1178
1179 if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1180 return false;
1181
1182 return true;
1183 }
1184
1185 // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1186 // another we need to add 64-bit registers to the final MI.
1187 if (SrcReg.isPhysical()) {
1188 ImplicitOp = Src;
1189 ImplicitOp.setImplicit();
1190
1191 NewSrc = getX86SubSuperRegister(SrcReg, 64);
1192 assert(NewSrc.isValid() && "Invalid Operand");
1193 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1194 } else {
1195 // Virtual register of the wrong class, we have to create a temporary 64-bit
1196 // vreg to feed into the LEA.
1197 NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1198 MachineInstr *Copy =
1199 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1200 .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1201 .addReg(SrcReg, getKillRegState(isKill));
1202
1203 // Which is obviously going to be dead after we're done with it.
1204 isKill = true;
1205
1206 if (LV)
1207 LV->replaceKillInstruction(SrcReg, MI, *Copy);
1208
1209 if (LIS) {
1210 SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1211 SlotIndex Idx = LIS->getInstructionIndex(MI);
1212 LiveInterval &LI = LIS->getInterval(SrcReg);
1213 LiveRange::Segment *S = LI.getSegmentContaining(Idx);
1214 if (S->end.getBaseIndex() == Idx)
1215 S->end = CopyIdx.getRegSlot();
1216 }
1217 }
1218
1219 // We've set all the parameters without issue.
1220 return true;
1221}
1222
1223MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1224 MachineInstr &MI,
1225 LiveVariables *LV,
1226 LiveIntervals *LIS,
1227 bool Is8BitOp) const {
1228 // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1229 MachineBasicBlock &MBB = *MI.getParent();
1230 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
1231 assert((Is8BitOp ||
1232 RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1233 *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1234 "Unexpected type for LEA transform");
1235
1236 // TODO: For a 32-bit target, we need to adjust the LEA variables with
1237 // something like this:
1238 // Opcode = X86::LEA32r;
1239 // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1240 // OutRegLEA =
1241 // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1242 // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1243 if (!Subtarget.is64Bit())
1244 return nullptr;
1245
1246 unsigned Opcode = X86::LEA64_32r;
1247 Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1248 Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1249 Register InRegLEA2;
1250
1251 // Build and insert into an implicit UNDEF value. This is OK because
1252 // we will be shifting and then extracting the lower 8/16-bits.
1253 // This has the potential to cause partial register stall. e.g.
1254 // movw (%rbp,%rcx,2), %dx
1255 // leal -65(%rdx), %esi
1256 // But testing has shown this *does* help performance in 64-bit mode (at
1257 // least on modern x86 machines).
1258 MachineBasicBlock::iterator MBBI = MI.getIterator();
1259 Register Dest = MI.getOperand(0).getReg();
1260 Register Src = MI.getOperand(1).getReg();
1261 Register Src2;
1262 bool IsDead = MI.getOperand(0).isDead();
1263 bool IsKill = MI.getOperand(1).isKill();
1264 unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1265 assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1266 MachineInstr *ImpDef =
1267 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1268 MachineInstr *InsMI =
1269 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1270 .addReg(InRegLEA, RegState::Define, SubReg)
1271 .addReg(Src, getKillRegState(IsKill));
1272 MachineInstr *ImpDef2 = nullptr;
1273 MachineInstr *InsMI2 = nullptr;
1274
1275 MachineInstrBuilder MIB =
1276 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1277 switch (MIOpc) {
1278 default:
1279 llvm_unreachable("Unreachable!");
1280 case X86::SHL8ri:
1281 case X86::SHL16ri: {
1282 unsigned ShAmt = MI.getOperand(2).getImm();
1283 MIB.addReg(0)
1284 .addImm(1LL << ShAmt)
1285 .addReg(InRegLEA, RegState::Kill)
1286 .addImm(0)
1287 .addReg(0);
1288 break;
1289 }
1290 case X86::INC8r:
1291 case X86::INC16r:
1292 addRegOffset(MIB, InRegLEA, true, 1);
1293 break;
1294 case X86::DEC8r:
1295 case X86::DEC16r:
1296 addRegOffset(MIB, InRegLEA, true, -1);
1297 break;
1298 case X86::ADD8ri:
1299 case X86::ADD8ri_DB:
1300 case X86::ADD16ri:
1301 case X86::ADD16ri_DB:
1302 addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1303 break;
1304 case X86::ADD8rr:
1305 case X86::ADD8rr_DB:
1306 case X86::ADD16rr:
1307 case X86::ADD16rr_DB: {
1308 Src2 = MI.getOperand(2).getReg();
1309 bool IsKill2 = MI.getOperand(2).isKill();
1310 assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1311 if (Src == Src2) {
1312 // ADD8rr/ADD16rr killed %reg1028, %reg1028
1313 // just a single insert_subreg.
1314 addRegReg(MIB, InRegLEA, true, InRegLEA, false);
1315 } else {
1316 if (Subtarget.is64Bit())
1317 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1318 else
1319 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1320 // Build and insert into an implicit UNDEF value. This is OK because
1321 // we will be shifting and then extracting the lower 8/16-bits.
1322 ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
1323 InRegLEA2);
1324 InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1325 .addReg(InRegLEA2, RegState::Define, SubReg)
1326 .addReg(Src2, getKillRegState(IsKill2));
1327 addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
1328 }
1329 if (LV && IsKill2 && InsMI2)
1330 LV->replaceKillInstruction(Src2, MI, *InsMI2);
1331 break;
1332 }
1333 }
1334
1335 MachineInstr *NewMI = MIB;
1336 MachineInstr *ExtMI =
1337 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1338 .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
1339 .addReg(OutRegLEA, RegState::Kill, SubReg);
1340
1341 if (LV) {
1342 // Update live variables.
1343 LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1344 if (InRegLEA2)
1345 LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
1346 LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1347 if (IsKill)
1348 LV->replaceKillInstruction(Src, MI, *InsMI);
1349 if (IsDead)
1350 LV->replaceKillInstruction(Dest, MI, *ExtMI);
1351 }
1352
1353 if (LIS) {
1354 LIS->InsertMachineInstrInMaps(*ImpDef);
1355 SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
1356 if (ImpDef2)
1357 LIS->InsertMachineInstrInMaps(*ImpDef2);
1358 SlotIndex Ins2Idx;
1359 if (InsMI2)
1360 Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
1361 SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1362 SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
1363 LIS->getInterval(InRegLEA);
1364 LIS->getInterval(OutRegLEA);
1365 if (InRegLEA2)
1366 LIS->getInterval(InRegLEA2);
1367
1368 // Move the use of Src up to InsMI.
1369 LiveInterval &SrcLI = LIS->getInterval(Src);
1370 LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
1371 if (SrcSeg->end == NewIdx.getRegSlot())
1372 SrcSeg->end = InsIdx.getRegSlot();
1373
1374 if (InsMI2) {
1375 // Move the use of Src2 up to InsMI2.
1376 LiveInterval &Src2LI = LIS->getInterval(Src2);
1377 LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
1378 if (Src2Seg->end == NewIdx.getRegSlot())
1379 Src2Seg->end = Ins2Idx.getRegSlot();
1380 }
1381
1382 // Move the definition of Dest down to ExtMI.
1383 LiveInterval &DestLI = LIS->getInterval(Dest);
1384 LiveRange::Segment *DestSeg =
1385 DestLI.getSegmentContaining(NewIdx.getRegSlot());
1386 assert(DestSeg->start == NewIdx.getRegSlot() &&
1387 DestSeg->valno->def == NewIdx.getRegSlot());
1388 DestSeg->start = ExtIdx.getRegSlot();
1389 DestSeg->valno->def = ExtIdx.getRegSlot();
1390 }
1391
1392 return ExtMI;
1393}
1394
1395/// This method must be implemented by targets that
1396/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1397/// may be able to convert a two-address instruction into a true
1398/// three-address instruction on demand. This allows the X86 target (for
1399/// example) to convert ADD and SHL instructions into LEA instructions if they
1400/// would require register copies due to two-addressness.
1401///
1402/// This method returns a null pointer if the transformation cannot be
1403/// performed, otherwise it returns the new instruction.
1404///
1405MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
1406 LiveVariables *LV,
1407 LiveIntervals *LIS) const {
1408 // The following opcodes also set the condition code register(s). Only
1409 // convert them to an equivalent LEA if the condition code register defs
1410 // are dead!
1411 if (hasLiveCondCodeDef(MI))
1412 return nullptr;
1413
1414 MachineFunction &MF = *MI.getParent()->getParent();
1415 // All instructions input are two-addr instructions. Get the known operands.
1416 const MachineOperand &Dest = MI.getOperand(0);
1417 const MachineOperand &Src = MI.getOperand(1);
1418
1419 // Ideally, operations with undef should be folded before we get here, but we
1420 // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1421 // Without this, we have to forward undef state to new register operands to
1422 // avoid machine verifier errors.
1423 if (Src.isUndef())
1424 return nullptr;
1425 if (MI.getNumOperands() > 2)
1426 if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1427 return nullptr;
1428
1429 MachineInstr *NewMI = nullptr;
1430 Register SrcReg, SrcReg2;
1431 bool Is64Bit = Subtarget.is64Bit();
1432
1433 bool Is8BitOp = false;
1434 unsigned NumRegOperands = 2;
1435 unsigned MIOpc = MI.getOpcode();
1436 switch (MIOpc) {
1437 default:
1438 llvm_unreachable("Unreachable!");
1439 case X86::SHL64ri: {
1440 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1441 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1442 if (!isTruncatedShiftCountForLEA(ShAmt))
1443 return nullptr;
1444
1445 // LEA can't handle RSP.
1446 if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1447 Src.getReg(), &X86::GR64_NOSPRegClass))
1448 return nullptr;
1449
1450 NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1451 .add(Dest)
1452 .addReg(0)
1453 .addImm(1LL << ShAmt)
1454 .add(Src)
1455 .addImm(0)
1456 .addReg(0);
1457 break;
1458 }
1459 case X86::SHL32ri: {
1460 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1461 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1462 if (!isTruncatedShiftCountForLEA(ShAmt))
1463 return nullptr;
1464
1465 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1466
1467 // LEA can't handle ESP.
1468 bool isKill;
1469 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1470 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1471 ImplicitOp, LV, LIS))
1472 return nullptr;
1473
1474 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1475 .add(Dest)
1476 .addReg(0)
1477 .addImm(1LL << ShAmt)
1478 .addReg(SrcReg, getKillRegState(isKill))
1479 .addImm(0)
1480 .addReg(0);
1481 if (ImplicitOp.getReg() != 0)
1482 MIB.add(ImplicitOp);
1483 NewMI = MIB;
1484
1485 // Add kills if classifyLEAReg created a new register.
1486 if (LV && SrcReg != Src.getReg())
1487 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1488 break;
1489 }
1490 case X86::SHL8ri:
1491 Is8BitOp = true;
1492 [[fallthrough]];
1493 case X86::SHL16ri: {
1494 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1495 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1496 if (!isTruncatedShiftCountForLEA(ShAmt))
1497 return nullptr;
1498 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1499 }
1500 case X86::INC64r:
1501 case X86::INC32r: {
1502 assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1503 unsigned Opc = MIOpc == X86::INC64r
1504 ? X86::LEA64r
1505 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1506 bool isKill;
1507 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1508 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1509 ImplicitOp, LV, LIS))
1510 return nullptr;
1511
1512 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1513 .add(Dest)
1514 .addReg(SrcReg, getKillRegState(isKill));
1515 if (ImplicitOp.getReg() != 0)
1516 MIB.add(ImplicitOp);
1517
1518 NewMI = addOffset(MIB, 1);
1519
1520 // Add kills if classifyLEAReg created a new register.
1521 if (LV && SrcReg != Src.getReg())
1522 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1523 break;
1524 }
1525 case X86::DEC64r:
1526 case X86::DEC32r: {
1527 assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1528 unsigned Opc = MIOpc == X86::DEC64r
1529 ? X86::LEA64r
1530 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1531
1532 bool isKill;
1533 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1534 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1535 ImplicitOp, LV, LIS))
1536 return nullptr;
1537
1538 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1539 .add(Dest)
1540 .addReg(SrcReg, getKillRegState(isKill));
1541 if (ImplicitOp.getReg() != 0)
1542 MIB.add(ImplicitOp);
1543
1544 NewMI = addOffset(MIB, -1);
1545
1546 // Add kills if classifyLEAReg created a new register.
1547 if (LV && SrcReg != Src.getReg())
1548 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1549 break;
1550 }
1551 case X86::DEC8r:
1552 case X86::INC8r:
1553 Is8BitOp = true;
1554 [[fallthrough]];
1555 case X86::DEC16r:
1556 case X86::INC16r:
1557 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1558 case X86::ADD64rr:
1559 case X86::ADD64rr_DB:
1560 case X86::ADD32rr:
1561 case X86::ADD32rr_DB: {
1562 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1563 unsigned Opc;
1564 if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1565 Opc = X86::LEA64r;
1566 else
1567 Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1568
1569 const MachineOperand &Src2 = MI.getOperand(2);
1570 bool isKill2;
1571 MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1572 if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2,
1573 ImplicitOp2, LV, LIS))
1574 return nullptr;
1575
1576 bool isKill;
1577 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1578 if (Src.getReg() == Src2.getReg()) {
1579 // Don't call classifyLEAReg a second time on the same register, in case
1580 // the first call inserted a COPY from Src2 and marked it as killed.
1581 isKill = isKill2;
1582 SrcReg = SrcReg2;
1583 } else {
1584 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1585 ImplicitOp, LV, LIS))
1586 return nullptr;
1587 }
1588
1589 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1590 if (ImplicitOp.getReg() != 0)
1591 MIB.add(ImplicitOp);
1592 if (ImplicitOp2.getReg() != 0)
1593 MIB.add(ImplicitOp2);
1594
1595 NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1596
1597 // Add kills if classifyLEAReg created a new register.
1598 if (LV) {
1599 if (SrcReg2 != Src2.getReg())
1600 LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
1601 if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
1602 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1603 }
1604 NumRegOperands = 3;
1605 break;
1606 }
1607 case X86::ADD8rr:
1608 case X86::ADD8rr_DB:
1609 Is8BitOp = true;
1610 [[fallthrough]];
1611 case X86::ADD16rr:
1612 case X86::ADD16rr_DB:
1613 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1614 case X86::ADD64ri32:
1615 case X86::ADD64ri32_DB:
1616 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1617 NewMI = addOffset(
1618 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1619 MI.getOperand(2));
1620 break;
1621 case X86::ADD32ri:
1622 case X86::ADD32ri_DB: {
1623 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1624 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1625
1626 bool isKill;
1627 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1628 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1629 ImplicitOp, LV, LIS))
1630 return nullptr;
1631
1632 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1633 .add(Dest)
1634 .addReg(SrcReg, getKillRegState(isKill));
1635 if (ImplicitOp.getReg() != 0)
1636 MIB.add(ImplicitOp);
1637
1638 NewMI = addOffset(MIB, MI.getOperand(2));
1639
1640 // Add kills if classifyLEAReg created a new register.
1641 if (LV && SrcReg != Src.getReg())
1642 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1643 break;
1644 }
1645 case X86::ADD8ri:
1646 case X86::ADD8ri_DB:
1647 Is8BitOp = true;
1648 [[fallthrough]];
1649 case X86::ADD16ri:
1650 case X86::ADD16ri_DB:
1651 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1652 case X86::SUB8ri:
1653 case X86::SUB16ri:
1654 /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1655 return nullptr;
1656 case X86::SUB32ri: {
1657 if (!MI.getOperand(2).isImm())
1658 return nullptr;
1659 int64_t Imm = MI.getOperand(2).getImm();
1660 if (!isInt<32>(-Imm))
1661 return nullptr;
1662
1663 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1664 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1665
1666 bool isKill;
1667 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1668 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1669 ImplicitOp, LV, LIS))
1670 return nullptr;
1671
1672 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1673 .add(Dest)
1674 .addReg(SrcReg, getKillRegState(isKill));
1675 if (ImplicitOp.getReg() != 0)
1676 MIB.add(ImplicitOp);
1677
1678 NewMI = addOffset(MIB, -Imm);
1679
1680 // Add kills if classifyLEAReg created a new register.
1681 if (LV && SrcReg != Src.getReg())
1682 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1683 break;
1684 }
1685
1686 case X86::SUB64ri32: {
1687 if (!MI.getOperand(2).isImm())
1688 return nullptr;
1689 int64_t Imm = MI.getOperand(2).getImm();
1690 if (!isInt<32>(-Imm))
1691 return nullptr;
1692
1693 assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1694
1696 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
1697 NewMI = addOffset(MIB, -Imm);
1698 break;
1699 }
1700
1701 case X86::VMOVDQU8Z128rmk:
1702 case X86::VMOVDQU8Z256rmk:
1703 case X86::VMOVDQU8Zrmk:
1704 case X86::VMOVDQU16Z128rmk:
1705 case X86::VMOVDQU16Z256rmk:
1706 case X86::VMOVDQU16Zrmk:
1707 case X86::VMOVDQU32Z128rmk:
1708 case X86::VMOVDQA32Z128rmk:
1709 case X86::VMOVDQU32Z256rmk:
1710 case X86::VMOVDQA32Z256rmk:
1711 case X86::VMOVDQU32Zrmk:
1712 case X86::VMOVDQA32Zrmk:
1713 case X86::VMOVDQU64Z128rmk:
1714 case X86::VMOVDQA64Z128rmk:
1715 case X86::VMOVDQU64Z256rmk:
1716 case X86::VMOVDQA64Z256rmk:
1717 case X86::VMOVDQU64Zrmk:
1718 case X86::VMOVDQA64Zrmk:
1719 case X86::VMOVUPDZ128rmk:
1720 case X86::VMOVAPDZ128rmk:
1721 case X86::VMOVUPDZ256rmk:
1722 case X86::VMOVAPDZ256rmk:
1723 case X86::VMOVUPDZrmk:
1724 case X86::VMOVAPDZrmk:
1725 case X86::VMOVUPSZ128rmk:
1726 case X86::VMOVAPSZ128rmk:
1727 case X86::VMOVUPSZ256rmk:
1728 case X86::VMOVAPSZ256rmk:
1729 case X86::VMOVUPSZrmk:
1730 case X86::VMOVAPSZrmk:
1731 case X86::VBROADCASTSDZ256rmk:
1732 case X86::VBROADCASTSDZrmk:
1733 case X86::VBROADCASTSSZ128rmk:
1734 case X86::VBROADCASTSSZ256rmk:
1735 case X86::VBROADCASTSSZrmk:
1736 case X86::VPBROADCASTDZ128rmk:
1737 case X86::VPBROADCASTDZ256rmk:
1738 case X86::VPBROADCASTDZrmk:
1739 case X86::VPBROADCASTQZ128rmk:
1740 case X86::VPBROADCASTQZ256rmk:
1741 case X86::VPBROADCASTQZrmk: {
1742 unsigned Opc;
1743 switch (MIOpc) {
1744 default:
1745 llvm_unreachable("Unreachable!");
1746 case X86::VMOVDQU8Z128rmk:
1747 Opc = X86::VPBLENDMBZ128rmk;
1748 break;
1749 case X86::VMOVDQU8Z256rmk:
1750 Opc = X86::VPBLENDMBZ256rmk;
1751 break;
1752 case X86::VMOVDQU8Zrmk:
1753 Opc = X86::VPBLENDMBZrmk;
1754 break;
1755 case X86::VMOVDQU16Z128rmk:
1756 Opc = X86::VPBLENDMWZ128rmk;
1757 break;
1758 case X86::VMOVDQU16Z256rmk:
1759 Opc = X86::VPBLENDMWZ256rmk;
1760 break;
1761 case X86::VMOVDQU16Zrmk:
1762 Opc = X86::VPBLENDMWZrmk;
1763 break;
1764 case X86::VMOVDQU32Z128rmk:
1765 Opc = X86::VPBLENDMDZ128rmk;
1766 break;
1767 case X86::VMOVDQU32Z256rmk:
1768 Opc = X86::VPBLENDMDZ256rmk;
1769 break;
1770 case X86::VMOVDQU32Zrmk:
1771 Opc = X86::VPBLENDMDZrmk;
1772 break;
1773 case X86::VMOVDQU64Z128rmk:
1774 Opc = X86::VPBLENDMQZ128rmk;
1775 break;
1776 case X86::VMOVDQU64Z256rmk:
1777 Opc = X86::VPBLENDMQZ256rmk;
1778 break;
1779 case X86::VMOVDQU64Zrmk:
1780 Opc = X86::VPBLENDMQZrmk;
1781 break;
1782 case X86::VMOVUPDZ128rmk:
1783 Opc = X86::VBLENDMPDZ128rmk;
1784 break;
1785 case X86::VMOVUPDZ256rmk:
1786 Opc = X86::VBLENDMPDZ256rmk;
1787 break;
1788 case X86::VMOVUPDZrmk:
1789 Opc = X86::VBLENDMPDZrmk;
1790 break;
1791 case X86::VMOVUPSZ128rmk:
1792 Opc = X86::VBLENDMPSZ128rmk;
1793 break;
1794 case X86::VMOVUPSZ256rmk:
1795 Opc = X86::VBLENDMPSZ256rmk;
1796 break;
1797 case X86::VMOVUPSZrmk:
1798 Opc = X86::VBLENDMPSZrmk;
1799 break;
1800 case X86::VMOVDQA32Z128rmk:
1801 Opc = X86::VPBLENDMDZ128rmk;
1802 break;
1803 case X86::VMOVDQA32Z256rmk:
1804 Opc = X86::VPBLENDMDZ256rmk;
1805 break;
1806 case X86::VMOVDQA32Zrmk:
1807 Opc = X86::VPBLENDMDZrmk;
1808 break;
1809 case X86::VMOVDQA64Z128rmk:
1810 Opc = X86::VPBLENDMQZ128rmk;
1811 break;
1812 case X86::VMOVDQA64Z256rmk:
1813 Opc = X86::VPBLENDMQZ256rmk;
1814 break;
1815 case X86::VMOVDQA64Zrmk:
1816 Opc = X86::VPBLENDMQZrmk;
1817 break;
1818 case X86::VMOVAPDZ128rmk:
1819 Opc = X86::VBLENDMPDZ128rmk;
1820 break;
1821 case X86::VMOVAPDZ256rmk:
1822 Opc = X86::VBLENDMPDZ256rmk;
1823 break;
1824 case X86::VMOVAPDZrmk:
1825 Opc = X86::VBLENDMPDZrmk;
1826 break;
1827 case X86::VMOVAPSZ128rmk:
1828 Opc = X86::VBLENDMPSZ128rmk;
1829 break;
1830 case X86::VMOVAPSZ256rmk:
1831 Opc = X86::VBLENDMPSZ256rmk;
1832 break;
1833 case X86::VMOVAPSZrmk:
1834 Opc = X86::VBLENDMPSZrmk;
1835 break;
1836 case X86::VBROADCASTSDZ256rmk:
1837 Opc = X86::VBLENDMPDZ256rmbk;
1838 break;
1839 case X86::VBROADCASTSDZrmk:
1840 Opc = X86::VBLENDMPDZrmbk;
1841 break;
1842 case X86::VBROADCASTSSZ128rmk:
1843 Opc = X86::VBLENDMPSZ128rmbk;
1844 break;
1845 case X86::VBROADCASTSSZ256rmk:
1846 Opc = X86::VBLENDMPSZ256rmbk;
1847 break;
1848 case X86::VBROADCASTSSZrmk:
1849 Opc = X86::VBLENDMPSZrmbk;
1850 break;
1851 case X86::VPBROADCASTDZ128rmk:
1852 Opc = X86::VPBLENDMDZ128rmbk;
1853 break;
1854 case X86::VPBROADCASTDZ256rmk:
1855 Opc = X86::VPBLENDMDZ256rmbk;
1856 break;
1857 case X86::VPBROADCASTDZrmk:
1858 Opc = X86::VPBLENDMDZrmbk;
1859 break;
1860 case X86::VPBROADCASTQZ128rmk:
1861 Opc = X86::VPBLENDMQZ128rmbk;
1862 break;
1863 case X86::VPBROADCASTQZ256rmk:
1864 Opc = X86::VPBLENDMQZ256rmbk;
1865 break;
1866 case X86::VPBROADCASTQZrmk:
1867 Opc = X86::VPBLENDMQZrmbk;
1868 break;
1869 }
1870
1871 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1872 .add(Dest)
1873 .add(MI.getOperand(2))
1874 .add(Src)
1875 .add(MI.getOperand(3))
1876 .add(MI.getOperand(4))
1877 .add(MI.getOperand(5))
1878 .add(MI.getOperand(6))
1879 .add(MI.getOperand(7));
1880 NumRegOperands = 4;
1881 break;
1882 }
1883
1884 case X86::VMOVDQU8Z128rrk:
1885 case X86::VMOVDQU8Z256rrk:
1886 case X86::VMOVDQU8Zrrk:
1887 case X86::VMOVDQU16Z128rrk:
1888 case X86::VMOVDQU16Z256rrk:
1889 case X86::VMOVDQU16Zrrk:
1890 case X86::VMOVDQU32Z128rrk:
1891 case X86::VMOVDQA32Z128rrk:
1892 case X86::VMOVDQU32Z256rrk:
1893 case X86::VMOVDQA32Z256rrk:
1894 case X86::VMOVDQU32Zrrk:
1895 case X86::VMOVDQA32Zrrk:
1896 case X86::VMOVDQU64Z128rrk:
1897 case X86::VMOVDQA64Z128rrk:
1898 case X86::VMOVDQU64Z256rrk:
1899 case X86::VMOVDQA64Z256rrk:
1900 case X86::VMOVDQU64Zrrk:
1901 case X86::VMOVDQA64Zrrk:
1902 case X86::VMOVUPDZ128rrk:
1903 case X86::VMOVAPDZ128rrk:
1904 case X86::VMOVUPDZ256rrk:
1905 case X86::VMOVAPDZ256rrk:
1906 case X86::VMOVUPDZrrk:
1907 case X86::VMOVAPDZrrk:
1908 case X86::VMOVUPSZ128rrk:
1909 case X86::VMOVAPSZ128rrk:
1910 case X86::VMOVUPSZ256rrk:
1911 case X86::VMOVAPSZ256rrk:
1912 case X86::VMOVUPSZrrk:
1913 case X86::VMOVAPSZrrk: {
1914 unsigned Opc;
1915 switch (MIOpc) {
1916 default:
1917 llvm_unreachable("Unreachable!");
1918 case X86::VMOVDQU8Z128rrk:
1919 Opc = X86::VPBLENDMBZ128rrk;
1920 break;
1921 case X86::VMOVDQU8Z256rrk:
1922 Opc = X86::VPBLENDMBZ256rrk;
1923 break;
1924 case X86::VMOVDQU8Zrrk:
1925 Opc = X86::VPBLENDMBZrrk;
1926 break;
1927 case X86::VMOVDQU16Z128rrk:
1928 Opc = X86::VPBLENDMWZ128rrk;
1929 break;
1930 case X86::VMOVDQU16Z256rrk:
1931 Opc = X86::VPBLENDMWZ256rrk;
1932 break;
1933 case X86::VMOVDQU16Zrrk:
1934 Opc = X86::VPBLENDMWZrrk;
1935 break;
1936 case X86::VMOVDQU32Z128rrk:
1937 Opc = X86::VPBLENDMDZ128rrk;
1938 break;
1939 case X86::VMOVDQU32Z256rrk:
1940 Opc = X86::VPBLENDMDZ256rrk;
1941 break;
1942 case X86::VMOVDQU32Zrrk:
1943 Opc = X86::VPBLENDMDZrrk;
1944 break;
1945 case X86::VMOVDQU64Z128rrk:
1946 Opc = X86::VPBLENDMQZ128rrk;
1947 break;
1948 case X86::VMOVDQU64Z256rrk:
1949 Opc = X86::VPBLENDMQZ256rrk;
1950 break;
1951 case X86::VMOVDQU64Zrrk:
1952 Opc = X86::VPBLENDMQZrrk;
1953 break;
1954 case X86::VMOVUPDZ128rrk:
1955 Opc = X86::VBLENDMPDZ128rrk;
1956 break;
1957 case X86::VMOVUPDZ256rrk:
1958 Opc = X86::VBLENDMPDZ256rrk;
1959 break;
1960 case X86::VMOVUPDZrrk:
1961 Opc = X86::VBLENDMPDZrrk;
1962 break;
1963 case X86::VMOVUPSZ128rrk:
1964 Opc = X86::VBLENDMPSZ128rrk;
1965 break;
1966 case X86::VMOVUPSZ256rrk:
1967 Opc = X86::VBLENDMPSZ256rrk;
1968 break;
1969 case X86::VMOVUPSZrrk:
1970 Opc = X86::VBLENDMPSZrrk;
1971 break;
1972 case X86::VMOVDQA32Z128rrk:
1973 Opc = X86::VPBLENDMDZ128rrk;
1974 break;
1975 case X86::VMOVDQA32Z256rrk:
1976 Opc = X86::VPBLENDMDZ256rrk;
1977 break;
1978 case X86::VMOVDQA32Zrrk:
1979 Opc = X86::VPBLENDMDZrrk;
1980 break;
1981 case X86::VMOVDQA64Z128rrk:
1982 Opc = X86::VPBLENDMQZ128rrk;
1983 break;
1984 case X86::VMOVDQA64Z256rrk:
1985 Opc = X86::VPBLENDMQZ256rrk;
1986 break;
1987 case X86::VMOVDQA64Zrrk:
1988 Opc = X86::VPBLENDMQZrrk;
1989 break;
1990 case X86::VMOVAPDZ128rrk:
1991 Opc = X86::VBLENDMPDZ128rrk;
1992 break;
1993 case X86::VMOVAPDZ256rrk:
1994 Opc = X86::VBLENDMPDZ256rrk;
1995 break;
1996 case X86::VMOVAPDZrrk:
1997 Opc = X86::VBLENDMPDZrrk;
1998 break;
1999 case X86::VMOVAPSZ128rrk:
2000 Opc = X86::VBLENDMPSZ128rrk;
2001 break;
2002 case X86::VMOVAPSZ256rrk:
2003 Opc = X86::VBLENDMPSZ256rrk;
2004 break;
2005 case X86::VMOVAPSZrrk:
2006 Opc = X86::VBLENDMPSZrrk;
2007 break;
2008 }
2009
2010 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
2011 .add(Dest)
2012 .add(MI.getOperand(2))
2013 .add(Src)
2014 .add(MI.getOperand(3));
2015 NumRegOperands = 4;
2016 break;
2017 }
2018 }
2019
2020 if (!NewMI)
2021 return nullptr;
2022
2023 if (LV) { // Update live variables
2024 for (unsigned I = 0; I < NumRegOperands; ++I) {
2025 MachineOperand &Op = MI.getOperand(I);
2026 if (Op.isReg() && (Op.isDead() || Op.isKill()))
2027 LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
2028 }
2029 }
2030
2031 MachineBasicBlock &MBB = *MI.getParent();
2032 MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
2033
2034 if (LIS) {
2035 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
2036 if (SrcReg)
2037 LIS->getInterval(SrcReg);
2038 if (SrcReg2)
2039 LIS->getInterval(SrcReg2);
2040 }
2041
2042 return NewMI;
2043}
2044
2045/// Determine which of the three possible cases of a three-source commute the
2046/// given source indexes correspond to, taking any mask operands into account.
2047/// No case permits commuting the passthru operand; an index pair that matches
2048/// no case is a programming error and hits llvm_unreachable.
2049/// Case 0 - Possible to commute the first and second operands.
2050/// Case 1 - Possible to commute the first and third operands.
2051/// Case 2 - Possible to commute the second and third operands.
2052static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
2053 unsigned SrcOpIdx2) {
2054 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
2055 if (SrcOpIdx1 > SrcOpIdx2)
2056 std::swap(SrcOpIdx1, SrcOpIdx2);
2057
2058 unsigned Op1 = 1, Op2 = 2, Op3 = 3;
2059 if (X86II::isKMasked(TSFlags)) {
2060 Op2++;
2061 Op3++;
2062 }
2063
2064 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
2065 return 0;
2066 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
2067 return 1;
2068 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
2069 return 2;
2070 llvm_unreachable("Unknown three src commute case.");
2071}
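// Illustrative example, assuming TSFlags describes a k-masked instruction
// whose operands are laid out as Dst, Src1, KMask, Src2, Src3 (so the
// commutable sources sit at indices 1, 3 and 4):
//
//   getThreeSrcCommuteCase(TSFlags, 1, 3) == 0   // commute Src1 and Src2
//   getThreeSrcCommuteCase(TSFlags, 1, 4) == 1   // commute Src1 and Src3
//   getThreeSrcCommuteCase(TSFlags, 3, 4) == 2   // commute Src2 and Src3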
2072
2073unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
2074    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
2075 const X86InstrFMA3Group &FMA3Group) const {
2076
2077 unsigned Opc = MI.getOpcode();
2078
2079  // TODO: Commuting the 1st operand of FMA*_Int requires additional
2080  // analysis: the commute optimization is legal only if all users of the
2081  // FMA*_Int instruction read just its lowest element. Such an analysis is
2082  // not implemented yet, so that commute must not be requested (see the
2083  // assert below). Once the analysis is available, this will be the right
2084  // place to call it.
2085 assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
2086 "Intrinsic instructions can't commute operand 1");
2087
2088 // Determine which case this commute is or if it can't be done.
2089 unsigned Case =
2090 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2091 assert(Case < 3 && "Unexpected case number!");
2092
2093 // Define the FMA forms mapping array that helps to map input FMA form
2094 // to output FMA form to preserve the operation semantics after
2095 // commuting the operands.
2096 const unsigned Form132Index = 0;
2097 const unsigned Form213Index = 1;
2098 const unsigned Form231Index = 2;
2099 static const unsigned FormMapping[][3] = {
2100 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
2101 // FMA132 A, C, b; ==> FMA231 C, A, b;
2102 // FMA213 B, A, c; ==> FMA213 A, B, c;
2103 // FMA231 C, A, b; ==> FMA132 A, C, b;
2104 {Form231Index, Form213Index, Form132Index},
2105 // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
2106 // FMA132 A, c, B; ==> FMA132 B, c, A;
2107 // FMA213 B, a, C; ==> FMA231 C, a, B;
2108 // FMA231 C, a, B; ==> FMA213 B, a, C;
2109 {Form132Index, Form231Index, Form213Index},
2110 // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
2111 // FMA132 a, C, B; ==> FMA213 a, B, C;
2112 // FMA213 b, A, C; ==> FMA132 b, C, A;
2113 // FMA231 c, A, B; ==> FMA231 c, B, A;
2114 {Form213Index, Form132Index, Form231Index}};
2115
2116 unsigned FMAForms[3];
2117 FMAForms[0] = FMA3Group.get132Opcode();
2118 FMAForms[1] = FMA3Group.get213Opcode();
2119 FMAForms[2] = FMA3Group.get231Opcode();
2120
2121 // Everything is ready, just adjust the FMA opcode and return it.
2122 for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
2123 if (Opc == FMAForms[FormIndex])
2124 return FMAForms[FormMapping[Case][FormIndex]];
2125
2126 llvm_unreachable("Illegal FMA3 format");
2127}
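// Illustrative example (names assume the usual FMA3 register forms such as
// X86::VFMADD132PSr): a 132-form instruction computes dst = dst * src3 + src2.
// Commuting operands 1 and 2 (Case 0) therefore has to select the 231 form,
// since FormMapping[0][Form132Index] == Form231Index, so calling
// getFMA3OpcodeToCommuteOperands on a VFMADD132PSr with SrcOpIdx1 = 1 and
// SrcOpIdx2 = 2 would return X86::VFMADD231PSr; the generic commute code then
// swaps the register operands themselves.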
2128
2129static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
2130 unsigned SrcOpIdx2) {
2131 // Determine which case this commute is or if it can't be done.
2132 unsigned Case =
2133 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2134 assert(Case < 3 && "Unexpected case value!");
2135
2136 // For each case we need to swap two pairs of bits in the final immediate.
2137 static const uint8_t SwapMasks[3][4] = {
2138 {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
2139 {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
2140 {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
2141 };
2142
2143 uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2144 // Clear out the bits we are swapping.
2145 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
2146 SwapMasks[Case][2] | SwapMasks[Case][3]);
2147 // If the immediate had a bit of the pair set, then set the opposite bit.
2148 if (Imm & SwapMasks[Case][0])
2149 NewImm |= SwapMasks[Case][1];
2150 if (Imm & SwapMasks[Case][1])
2151 NewImm |= SwapMasks[Case][0];
2152 if (Imm & SwapMasks[Case][2])
2153 NewImm |= SwapMasks[Case][3];
2154 if (Imm & SwapMasks[Case][3])
2155 NewImm |= SwapMasks[Case][2];
2156 MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
2157}
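// Illustrative example: for Case 0 (exchanging sources 1 and 2) bits 2<->4 and
// 3<->5 of the truth-table immediate trade places, so an immediate of 0xD8
// would be rewritten to 0xE4:
//
//   uint8_t Imm = 0xD8;                  // 1101'1000
//   uint8_t New = Imm & ~uint8_t(0x3C);  // clear bits 2..5        -> 0xC0
//   New |= (Imm & 0x10) ? 0x04 : 0;      // old bit 4 -> new bit 2 -> 0xC4
//   New |= (Imm & 0x08) ? 0x20 : 0;      // old bit 3 -> new bit 5 -> 0xE4
//   assert(New == 0xE4);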
2158
2159// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
2160// commuted.
2161static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
2162#define VPERM_CASES(Suffix) \
2163 case X86::VPERMI2##Suffix##Z128rr: \
2164 case X86::VPERMT2##Suffix##Z128rr: \
2165 case X86::VPERMI2##Suffix##Z256rr: \
2166 case X86::VPERMT2##Suffix##Z256rr: \
2167 case X86::VPERMI2##Suffix##Zrr: \
2168 case X86::VPERMT2##Suffix##Zrr: \
2169 case X86::VPERMI2##Suffix##Z128rm: \
2170 case X86::VPERMT2##Suffix##Z128rm: \
2171 case X86::VPERMI2##Suffix##Z256rm: \
2172 case X86::VPERMT2##Suffix##Z256rm: \
2173 case X86::VPERMI2##Suffix##Zrm: \
2174 case X86::VPERMT2##Suffix##Zrm: \
2175 case X86::VPERMI2##Suffix##Z128rrkz: \
2176 case X86::VPERMT2##Suffix##Z128rrkz: \
2177 case X86::VPERMI2##Suffix##Z256rrkz: \
2178 case X86::VPERMT2##Suffix##Z256rrkz: \
2179 case X86::VPERMI2##Suffix##Zrrkz: \
2180 case X86::VPERMT2##Suffix##Zrrkz: \
2181 case X86::VPERMI2##Suffix##Z128rmkz: \
2182 case X86::VPERMT2##Suffix##Z128rmkz: \
2183 case X86::VPERMI2##Suffix##Z256rmkz: \
2184 case X86::VPERMT2##Suffix##Z256rmkz: \
2185 case X86::VPERMI2##Suffix##Zrmkz: \
2186 case X86::VPERMT2##Suffix##Zrmkz:
2187
2188#define VPERM_CASES_BROADCAST(Suffix) \
2189 VPERM_CASES(Suffix) \
2190 case X86::VPERMI2##Suffix##Z128rmb: \
2191 case X86::VPERMT2##Suffix##Z128rmb: \
2192 case X86::VPERMI2##Suffix##Z256rmb: \
2193 case X86::VPERMT2##Suffix##Z256rmb: \
2194 case X86::VPERMI2##Suffix##Zrmb: \
2195 case X86::VPERMT2##Suffix##Zrmb: \
2196 case X86::VPERMI2##Suffix##Z128rmbkz: \
2197 case X86::VPERMT2##Suffix##Z128rmbkz: \
2198 case X86::VPERMI2##Suffix##Z256rmbkz: \
2199 case X86::VPERMT2##Suffix##Z256rmbkz: \
2200 case X86::VPERMI2##Suffix##Zrmbkz: \
2201 case X86::VPERMT2##Suffix##Zrmbkz:
2202
2203 switch (Opcode) {
2204 default:
2205 return false;
2206  VPERM_CASES(B)
2207  VPERM_CASES_BROADCAST(D)
2208  VPERM_CASES_BROADCAST(PD)
2209  VPERM_CASES_BROADCAST(PS)
2210  VPERM_CASES_BROADCAST(Q)
2211 VPERM_CASES(W)
2212 return true;
2213 }
2214#undef VPERM_CASES_BROADCAST
2215#undef VPERM_CASES
2216}
2217
2218// Returns the commuted opcode for VPERMI2 and VPERMT2 instructions by
2219// switching from the I opcode to the T opcode and vice versa.
2220static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
2221#define VPERM_CASES(Orig, New) \
2222 case X86::Orig##Z128rr: \
2223 return X86::New##Z128rr; \
2224 case X86::Orig##Z128rrkz: \
2225 return X86::New##Z128rrkz; \
2226 case X86::Orig##Z128rm: \
2227 return X86::New##Z128rm; \
2228 case X86::Orig##Z128rmkz: \
2229 return X86::New##Z128rmkz; \
2230 case X86::Orig##Z256rr: \
2231 return X86::New##Z256rr; \
2232 case X86::Orig##Z256rrkz: \
2233 return X86::New##Z256rrkz; \
2234 case X86::Orig##Z256rm: \
2235 return X86::New##Z256rm; \
2236 case X86::Orig##Z256rmkz: \
2237 return X86::New##Z256rmkz; \
2238 case X86::Orig##Zrr: \
2239 return X86::New##Zrr; \
2240 case X86::Orig##Zrrkz: \
2241 return X86::New##Zrrkz; \
2242 case X86::Orig##Zrm: \
2243 return X86::New##Zrm; \
2244 case X86::Orig##Zrmkz: \
2245 return X86::New##Zrmkz;
2246
2247#define VPERM_CASES_BROADCAST(Orig, New) \
2248 VPERM_CASES(Orig, New) \
2249 case X86::Orig##Z128rmb: \
2250 return X86::New##Z128rmb; \
2251 case X86::Orig##Z128rmbkz: \
2252 return X86::New##Z128rmbkz; \
2253 case X86::Orig##Z256rmb: \
2254 return X86::New##Z256rmb; \
2255 case X86::Orig##Z256rmbkz: \
2256 return X86::New##Z256rmbkz; \
2257 case X86::Orig##Zrmb: \
2258 return X86::New##Zrmb; \
2259 case X86::Orig##Zrmbkz: \
2260 return X86::New##Zrmbkz;
2261
2262 switch (Opcode) {
2263 VPERM_CASES(VPERMI2B, VPERMT2B)
2264 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2265 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2266 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2267 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2268 VPERM_CASES(VPERMI2W, VPERMT2W)
2269 VPERM_CASES(VPERMT2B, VPERMI2B)
2270 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2271 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2272 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2273 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2274 VPERM_CASES(VPERMT2W, VPERMI2W)
2275 }
2276
2277 llvm_unreachable("Unreachable!");
2278#undef VPERM_CASES_BROADCAST
2279#undef VPERM_CASES
2280}
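// Illustrative note: VPERMI2* reads the index vector from its tied first
// source while VPERMT2* reads it from the second source, so once the generic
// commute code swaps those two registers the opcode must flip between the I
// and T flavors to keep the semantics, e.g. an X86::VPERMI2DZ128rr would be
// rewritten to X86::VPERMT2DZ128rr and vice versa.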
2281
2282MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2283                                                   unsigned OpIdx1,
2284 unsigned OpIdx2) const {
2285 auto CloneIfNew = [&](MachineInstr &MI) {
2286 return std::exchange(NewMI, false)
2287 ? MI.getParent()->getParent()->CloneMachineInstr(&MI)
2288 : &MI;
2289 };
2290 MachineInstr *WorkingMI = nullptr;
2291 unsigned Opc = MI.getOpcode();
2292
2293#define CASE_ND(OP) \
2294 case X86::OP: \
2295 case X86::OP##_ND:
2296
2297 switch (Opc) {
2298 // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
2299 CASE_ND(SHRD16rri8)
2300 CASE_ND(SHLD16rri8)
2301 CASE_ND(SHRD32rri8)
2302 CASE_ND(SHLD32rri8)
2303 CASE_ND(SHRD64rri8)
2304 CASE_ND(SHLD64rri8) {
2305 unsigned Size;
2306 switch (Opc) {
2307 default:
2308 llvm_unreachable("Unreachable!");
2309#define FROM_TO_SIZE(A, B, S) \
2310 case X86::A: \
2311 Opc = X86::B; \
2312 Size = S; \
2313 break; \
2314 case X86::A##_ND: \
2315 Opc = X86::B##_ND; \
2316 Size = S; \
2317 break; \
2318 case X86::B: \
2319 Opc = X86::A; \
2320 Size = S; \
2321 break; \
2322 case X86::B##_ND: \
2323 Opc = X86::A##_ND; \
2324 Size = S; \
2325 break;
2326
2327 FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
2328 FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
2329 FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
2330#undef FROM_TO_SIZE
2331 }
2332 WorkingMI = CloneIfNew(MI);
2333 WorkingMI->setDesc(get(Opc));
2334 WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
2335 break;
2336 }
2337 case X86::PFSUBrr:
2338 case X86::PFSUBRrr:
2339 // PFSUB x, y: x = x - y
2340 // PFSUBR x, y: x = y - x
2341 WorkingMI = CloneIfNew(MI);
2342 WorkingMI->setDesc(
2343 get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr));
2344 break;
2345 case X86::BLENDPDrri:
2346 case X86::BLENDPSrri:
2347 case X86::VBLENDPDrri:
2348 case X86::VBLENDPSrri:
2349 // If we're optimizing for size, try to use MOVSD/MOVSS.
2350 if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
2351      unsigned Mask = (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 0x03 : 0x0F;
2352 if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
2353#define FROM_TO(FROM, TO) \
2354 case X86::FROM: \
2355 Opc = X86::TO; \
2356 break;
2357 switch (Opc) {
2358 default:
2359 llvm_unreachable("Unreachable!");
2360 FROM_TO(BLENDPDrri, MOVSDrr)
2361 FROM_TO(BLENDPSrri, MOVSSrr)
2362 FROM_TO(VBLENDPDrri, VMOVSDrr)
2363 FROM_TO(VBLENDPSrri, VMOVSSrr)
2364 }
2365 WorkingMI = CloneIfNew(MI);
2366 WorkingMI->setDesc(get(Opc));
2367 WorkingMI->removeOperand(3);
2368 break;
2369 }
2370#undef FROM_TO
2371 }
2372 [[fallthrough]];
2373 case X86::PBLENDWrri:
2374 case X86::VBLENDPDYrri:
2375 case X86::VBLENDPSYrri:
2376 case X86::VPBLENDDrri:
2377 case X86::VPBLENDWrri:
2378 case X86::VPBLENDDYrri:
2379 case X86::VPBLENDWYrri: {
2380 int8_t Mask;
2381 switch (Opc) {
2382 default:
2383 llvm_unreachable("Unreachable!");
2384 case X86::BLENDPDrri:
2385 Mask = (int8_t)0x03;
2386 break;
2387 case X86::BLENDPSrri:
2388 Mask = (int8_t)0x0F;
2389 break;
2390 case X86::PBLENDWrri:
2391 Mask = (int8_t)0xFF;
2392 break;
2393 case X86::VBLENDPDrri:
2394 Mask = (int8_t)0x03;
2395 break;
2396 case X86::VBLENDPSrri:
2397 Mask = (int8_t)0x0F;
2398 break;
2399 case X86::VBLENDPDYrri:
2400 Mask = (int8_t)0x0F;
2401 break;
2402 case X86::VBLENDPSYrri:
2403 Mask = (int8_t)0xFF;
2404 break;
2405 case X86::VPBLENDDrri:
2406 Mask = (int8_t)0x0F;
2407 break;
2408 case X86::VPBLENDWrri:
2409 Mask = (int8_t)0xFF;
2410 break;
2411 case X86::VPBLENDDYrri:
2412 Mask = (int8_t)0xFF;
2413 break;
2414 case X86::VPBLENDWYrri:
2415 Mask = (int8_t)0xFF;
2416 break;
2417 }
2418 // Only the least significant bits of Imm are used.
2419 // Using int8_t to ensure it will be sign extended to the int64_t that
2420 // setImm takes in order to match isel behavior.
2421 int8_t Imm = MI.getOperand(3).getImm() & Mask;
2422 WorkingMI = CloneIfNew(MI);
2423 WorkingMI->getOperand(3).setImm(Mask ^ Imm);
2424 break;
2425 }
2426 case X86::INSERTPSrr:
2427 case X86::VINSERTPSrr:
2428 case X86::VINSERTPSZrr: {
2429 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2430 unsigned ZMask = Imm & 15;
2431 unsigned DstIdx = (Imm >> 4) & 3;
2432 unsigned SrcIdx = (Imm >> 6) & 3;
2433
2434 // We can commute insertps if we zero 2 of the elements, the insertion is
2435 // "inline" and we don't override the insertion with a zero.
2436 if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2437 llvm::popcount(ZMask) == 2) {
2438 unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
2439 assert(AltIdx < 4 && "Illegal insertion index");
2440 unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2441 WorkingMI = CloneIfNew(MI);
2442 WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2443 break;
2444 }
2445 return nullptr;
2446 }
2447 case X86::MOVSDrr:
2448 case X86::MOVSSrr:
2449 case X86::VMOVSDrr:
2450 case X86::VMOVSSrr: {
2451 // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2452 if (Subtarget.hasSSE41()) {
2453 unsigned Mask;
2454 switch (Opc) {
2455 default:
2456 llvm_unreachable("Unreachable!");
2457 case X86::MOVSDrr:
2458 Opc = X86::BLENDPDrri;
2459 Mask = 0x02;
2460 break;
2461 case X86::MOVSSrr:
2462 Opc = X86::BLENDPSrri;
2463 Mask = 0x0E;
2464 break;
2465 case X86::VMOVSDrr:
2466 Opc = X86::VBLENDPDrri;
2467 Mask = 0x02;
2468 break;
2469 case X86::VMOVSSrr:
2470 Opc = X86::VBLENDPSrri;
2471 Mask = 0x0E;
2472 break;
2473 }
2474
2475 WorkingMI = CloneIfNew(MI);
2476 WorkingMI->setDesc(get(Opc));
2477 WorkingMI->addOperand(MachineOperand::CreateImm(Mask));
2478 break;
2479 }
2480
2481 WorkingMI = CloneIfNew(MI);
2482 WorkingMI->setDesc(get(X86::SHUFPDrri));
2483 WorkingMI->addOperand(MachineOperand::CreateImm(0x02));
2484 break;
2485 }
2486 case X86::SHUFPDrri: {
2487 // Commute to MOVSD.
2488 assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2489 WorkingMI = CloneIfNew(MI);
2490 WorkingMI->setDesc(get(X86::MOVSDrr));
2491 WorkingMI->removeOperand(3);
2492 break;
2493 }
2494 case X86::PCLMULQDQrri:
2495 case X86::VPCLMULQDQrri:
2496 case X86::VPCLMULQDQYrri:
2497 case X86::VPCLMULQDQZrri:
2498 case X86::VPCLMULQDQZ128rri:
2499 case X86::VPCLMULQDQZ256rri: {
2500 // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2501 // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2502 unsigned Imm = MI.getOperand(3).getImm();
2503 unsigned Src1Hi = Imm & 0x01;
2504 unsigned Src2Hi = Imm & 0x10;
2505 WorkingMI = CloneIfNew(MI);
2506 WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2507 break;
2508 }
2509 case X86::VPCMPBZ128rri:
2510 case X86::VPCMPUBZ128rri:
2511 case X86::VPCMPBZ256rri:
2512 case X86::VPCMPUBZ256rri:
2513 case X86::VPCMPBZrri:
2514 case X86::VPCMPUBZrri:
2515 case X86::VPCMPDZ128rri:
2516 case X86::VPCMPUDZ128rri:
2517 case X86::VPCMPDZ256rri:
2518 case X86::VPCMPUDZ256rri:
2519 case X86::VPCMPDZrri:
2520 case X86::VPCMPUDZrri:
2521 case X86::VPCMPQZ128rri:
2522 case X86::VPCMPUQZ128rri:
2523 case X86::VPCMPQZ256rri:
2524 case X86::VPCMPUQZ256rri:
2525 case X86::VPCMPQZrri:
2526 case X86::VPCMPUQZrri:
2527 case X86::VPCMPWZ128rri:
2528 case X86::VPCMPUWZ128rri:
2529 case X86::VPCMPWZ256rri:
2530 case X86::VPCMPUWZ256rri:
2531 case X86::VPCMPWZrri:
2532 case X86::VPCMPUWZrri:
2533 case X86::VPCMPBZ128rrik:
2534 case X86::VPCMPUBZ128rrik:
2535 case X86::VPCMPBZ256rrik:
2536 case X86::VPCMPUBZ256rrik:
2537 case X86::VPCMPBZrrik:
2538 case X86::VPCMPUBZrrik:
2539 case X86::VPCMPDZ128rrik:
2540 case X86::VPCMPUDZ128rrik:
2541 case X86::VPCMPDZ256rrik:
2542 case X86::VPCMPUDZ256rrik:
2543 case X86::VPCMPDZrrik:
2544 case X86::VPCMPUDZrrik:
2545 case X86::VPCMPQZ128rrik:
2546 case X86::VPCMPUQZ128rrik:
2547 case X86::VPCMPQZ256rrik:
2548 case X86::VPCMPUQZ256rrik:
2549 case X86::VPCMPQZrrik:
2550 case X86::VPCMPUQZrrik:
2551 case X86::VPCMPWZ128rrik:
2552 case X86::VPCMPUWZ128rrik:
2553 case X86::VPCMPWZ256rrik:
2554 case X86::VPCMPUWZ256rrik:
2555 case X86::VPCMPWZrrik:
2556 case X86::VPCMPUWZrrik:
2557 WorkingMI = CloneIfNew(MI);
2558 // Flip comparison mode immediate (if necessary).
2559 WorkingMI->getOperand(MI.getNumOperands() - 1)
2560        .setImm(X86::getSwappedVPCMPImm(
2561            MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7));
2562 break;
2563 case X86::VPCOMBri:
2564 case X86::VPCOMUBri:
2565 case X86::VPCOMDri:
2566 case X86::VPCOMUDri:
2567 case X86::VPCOMQri:
2568 case X86::VPCOMUQri:
2569 case X86::VPCOMWri:
2570 case X86::VPCOMUWri:
2571 WorkingMI = CloneIfNew(MI);
2572 // Flip comparison mode immediate (if necessary).
2573 WorkingMI->getOperand(3).setImm(
2574 X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7));
2575 break;
2576 case X86::VCMPSDZrri:
2577 case X86::VCMPSSZrri:
2578 case X86::VCMPPDZrri:
2579 case X86::VCMPPSZrri:
2580 case X86::VCMPSHZrri:
2581 case X86::VCMPPHZrri:
2582 case X86::VCMPPHZ128rri:
2583 case X86::VCMPPHZ256rri:
2584 case X86::VCMPPDZ128rri:
2585 case X86::VCMPPSZ128rri:
2586 case X86::VCMPPDZ256rri:
2587 case X86::VCMPPSZ256rri:
2588 case X86::VCMPPDZrrik:
2589 case X86::VCMPPSZrrik:
2590 case X86::VCMPPDZ128rrik:
2591 case X86::VCMPPSZ128rrik:
2592 case X86::VCMPPDZ256rrik:
2593 case X86::VCMPPSZ256rrik:
2594 WorkingMI = CloneIfNew(MI);
2595 WorkingMI->getOperand(MI.getNumExplicitOperands() - 1)
2596        .setImm(X86::getSwappedVCMPImm(
2597            MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f));
2598 break;
2599 case X86::VPERM2F128rr:
2600 case X86::VPERM2I128rr:
2601 // Flip permute source immediate.
2602 // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2603 // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2604 WorkingMI = CloneIfNew(MI);
2605 WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22);
2606 break;
2607 case X86::MOVHLPSrr:
2608 case X86::UNPCKHPDrr:
2609 case X86::VMOVHLPSrr:
2610 case X86::VUNPCKHPDrr:
2611 case X86::VMOVHLPSZrr:
2612 case X86::VUNPCKHPDZ128rr:
2613 assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2614
2615 switch (Opc) {
2616 default:
2617 llvm_unreachable("Unreachable!");
2618 case X86::MOVHLPSrr:
2619 Opc = X86::UNPCKHPDrr;
2620 break;
2621 case X86::UNPCKHPDrr:
2622 Opc = X86::MOVHLPSrr;
2623 break;
2624 case X86::VMOVHLPSrr:
2625 Opc = X86::VUNPCKHPDrr;
2626 break;
2627 case X86::VUNPCKHPDrr:
2628 Opc = X86::VMOVHLPSrr;
2629 break;
2630 case X86::VMOVHLPSZrr:
2631 Opc = X86::VUNPCKHPDZ128rr;
2632 break;
2633 case X86::VUNPCKHPDZ128rr:
2634 Opc = X86::VMOVHLPSZrr;
2635 break;
2636 }
2637 WorkingMI = CloneIfNew(MI);
2638 WorkingMI->setDesc(get(Opc));
2639 break;
2640 CASE_ND(CMOV16rr)
2641 CASE_ND(CMOV32rr)
2642 CASE_ND(CMOV64rr) {
2643 WorkingMI = CloneIfNew(MI);
2644 unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2645 X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2646    WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2647    break;
2648 }
2649 case X86::VPTERNLOGDZrri:
2650 case X86::VPTERNLOGDZrmi:
2651 case X86::VPTERNLOGDZ128rri:
2652 case X86::VPTERNLOGDZ128rmi:
2653 case X86::VPTERNLOGDZ256rri:
2654 case X86::VPTERNLOGDZ256rmi:
2655 case X86::VPTERNLOGQZrri:
2656 case X86::VPTERNLOGQZrmi:
2657 case X86::VPTERNLOGQZ128rri:
2658 case X86::VPTERNLOGQZ128rmi:
2659 case X86::VPTERNLOGQZ256rri:
2660 case X86::VPTERNLOGQZ256rmi:
2661 case X86::VPTERNLOGDZrrik:
2662 case X86::VPTERNLOGDZ128rrik:
2663 case X86::VPTERNLOGDZ256rrik:
2664 case X86::VPTERNLOGQZrrik:
2665 case X86::VPTERNLOGQZ128rrik:
2666 case X86::VPTERNLOGQZ256rrik:
2667 case X86::VPTERNLOGDZrrikz:
2668 case X86::VPTERNLOGDZrmikz:
2669 case X86::VPTERNLOGDZ128rrikz:
2670 case X86::VPTERNLOGDZ128rmikz:
2671 case X86::VPTERNLOGDZ256rrikz:
2672 case X86::VPTERNLOGDZ256rmikz:
2673 case X86::VPTERNLOGQZrrikz:
2674 case X86::VPTERNLOGQZrmikz:
2675 case X86::VPTERNLOGQZ128rrikz:
2676 case X86::VPTERNLOGQZ128rmikz:
2677 case X86::VPTERNLOGQZ256rrikz:
2678 case X86::VPTERNLOGQZ256rmikz:
2679 case X86::VPTERNLOGDZ128rmbi:
2680 case X86::VPTERNLOGDZ256rmbi:
2681 case X86::VPTERNLOGDZrmbi:
2682 case X86::VPTERNLOGQZ128rmbi:
2683 case X86::VPTERNLOGQZ256rmbi:
2684 case X86::VPTERNLOGQZrmbi:
2685 case X86::VPTERNLOGDZ128rmbikz:
2686 case X86::VPTERNLOGDZ256rmbikz:
2687 case X86::VPTERNLOGDZrmbikz:
2688 case X86::VPTERNLOGQZ128rmbikz:
2689 case X86::VPTERNLOGQZ256rmbikz:
2690 case X86::VPTERNLOGQZrmbikz: {
2691 WorkingMI = CloneIfNew(MI);
2692 commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2);
2693 break;
2694 }
2695 default:
2696    if (isCommutableVPERMV3Instruction(Opc)) {
2697      WorkingMI = CloneIfNew(MI);
2698 WorkingMI->setDesc(get(getCommutedVPERMV3Opcode(Opc)));
2699 break;
2700 }
2701
2702 if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) {
2703 WorkingMI = CloneIfNew(MI);
2704 WorkingMI->setDesc(
2705 get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group)));
2706 break;
2707 }
2708 }
2709 return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2710}
2711
2712bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2713 unsigned &SrcOpIdx1,
2714 unsigned &SrcOpIdx2,
2715 bool IsIntrinsic) const {
2716 uint64_t TSFlags = MI.getDesc().TSFlags;
2717
2718 unsigned FirstCommutableVecOp = 1;
2719 unsigned LastCommutableVecOp = 3;
2720 unsigned KMaskOp = -1U;
2721 if (X86II::isKMasked(TSFlags)) {
2722    // For k-zero-masked operations it is OK to commute the first vector
2723    // operand, unless this is an intrinsic instruction.
2724    // For regular k-masked operations a conservative choice is made, as the
2725 // elements of the first vector operand, for which the corresponding bit
2726 // in the k-mask operand is set to 0, are copied to the result of the
2727 // instruction.
2728 // TODO/FIXME: The commute still may be legal if it is known that the
2729 // k-mask operand is set to either all ones or all zeroes.
2730    // It is also OK to commute the 1st operand if all users of MI use only
2731 // the elements enabled by the k-mask operand. For example,
2732 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2733 // : v1[i];
2734 // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2735 // // Ok, to commute v1 in FMADD213PSZrk.
2736
2737 // The k-mask operand has index = 2 for masked and zero-masked operations.
2738 KMaskOp = 2;
2739
2740 // The operand with index = 1 is used as a source for those elements for
2741 // which the corresponding bit in the k-mask is set to 0.
2742 if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2743 FirstCommutableVecOp = 3;
2744
2745 LastCommutableVecOp++;
2746 } else if (IsIntrinsic) {
2747 // Commuting the first operand of an intrinsic instruction isn't possible
2748 // unless we can prove that only the lowest element of the result is used.
2749 FirstCommutableVecOp = 2;
2750 }
2751
2752 if (isMem(MI, LastCommutableVecOp))
2753 LastCommutableVecOp--;
2754
2755  // Only operands in the [FirstCommutableVecOp, LastCommutableVecOp] range are commutable.
2756 // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2757 // that the operand is not specified/fixed.
2758 if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2759 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2760 SrcOpIdx1 == KMaskOp))
2761 return false;
2762 if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2763 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2764 SrcOpIdx2 == KMaskOp))
2765 return false;
2766
2767 // Look for two different register operands assumed to be commutable
2768 // regardless of the FMA opcode. The FMA opcode is adjusted later.
2769 if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2770 SrcOpIdx2 == CommuteAnyOperandIndex) {
2771 unsigned CommutableOpIdx2 = SrcOpIdx2;
2772
2773    // At least one of the operands to be commuted is not specified, and
2774 // this method is free to choose appropriate commutable operands.
2775 if (SrcOpIdx1 == SrcOpIdx2)
2776      // Neither operand is fixed. By default set one of the commutable
2777 // operands to the last register operand of the instruction.
2778 CommutableOpIdx2 = LastCommutableVecOp;
2779 else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2780 // Only one of operands is not fixed.
2781 CommutableOpIdx2 = SrcOpIdx1;
2782
2783 // CommutableOpIdx2 is well defined now. Let's choose another commutable
2784 // operand and assign its index to CommutableOpIdx1.
2785 Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2786
2787 unsigned CommutableOpIdx1;
2788 for (CommutableOpIdx1 = LastCommutableVecOp;
2789 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2790 // Just ignore and skip the k-mask operand.
2791 if (CommutableOpIdx1 == KMaskOp)
2792 continue;
2793
2794 // The commuted operands must have different registers.
2795 // Otherwise, the commute transformation does not change anything and
2796 // is useless then.
2797 if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2798 break;
2799 }
2800
2801 // No appropriate commutable operands were found.
2802 if (CommutableOpIdx1 < FirstCommutableVecOp)
2803 return false;
2804
2805    // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
2806 // to return those values.
2807 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
2808 CommutableOpIdx2))
2809 return false;
2810 }
2811
2812 return true;
2813}
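// Illustrative example, assuming a merge-masked 3-source instruction such as
// X86::VFMADD213PSZrk with the operand layout
//
//   dst, src1 (tied passthru), kmask, src2, src3
//
// Here KMaskOp = 2, FirstCommutableVecOp = 3 and LastCommutableVecOp = 4, so
// only src2 and src3 may be exchanged; the passthru source and the mask are
// never offered as commutable operands.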
2814
2815bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2816                                         unsigned &SrcOpIdx1,
2817 unsigned &SrcOpIdx2) const {
2818 const MCInstrDesc &Desc = MI.getDesc();
2819 if (!Desc.isCommutable())
2820 return false;
2821
2822 switch (MI.getOpcode()) {
2823 case X86::CMPSDrri:
2824 case X86::CMPSSrri:
2825 case X86::CMPPDrri:
2826 case X86::CMPPSrri:
2827 case X86::VCMPSDrri:
2828 case X86::VCMPSSrri:
2829 case X86::VCMPPDrri:
2830 case X86::VCMPPSrri:
2831 case X86::VCMPPDYrri:
2832 case X86::VCMPPSYrri:
2833 case X86::VCMPSDZrri:
2834 case X86::VCMPSSZrri:
2835 case X86::VCMPPDZrri:
2836 case X86::VCMPPSZrri:
2837 case X86::VCMPSHZrri:
2838 case X86::VCMPPHZrri:
2839 case X86::VCMPPHZ128rri:
2840 case X86::VCMPPHZ256rri:
2841 case X86::VCMPPDZ128rri:
2842 case X86::VCMPPSZ128rri:
2843 case X86::VCMPPDZ256rri:
2844 case X86::VCMPPSZ256rri:
2845 case X86::VCMPPDZrrik:
2846 case X86::VCMPPSZrrik:
2847 case X86::VCMPPDZ128rrik:
2848 case X86::VCMPPSZ128rrik:
2849 case X86::VCMPPDZ256rrik:
2850 case X86::VCMPPSZ256rrik: {
2851 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2852
2853 // Float comparison can be safely commuted for
2854 // Ordered/Unordered/Equal/NotEqual tests
2855 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2856 switch (Imm) {
2857 default:
2858 // EVEX versions can be commuted.
2859 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2860 break;
2861 return false;
2862 case 0x00: // EQUAL
2863 case 0x03: // UNORDERED
2864 case 0x04: // NOT EQUAL
2865 case 0x07: // ORDERED
2866 break;
2867 }
2868
2869 // The indices of the commutable operands are 1 and 2 (or 2 and 3
2870 // when masked).
2871 // Assign them to the returned operand indices here.
2872 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2873 2 + OpOffset);
2874 }
2875 case X86::MOVSSrr:
2876    // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2877    // form an SSE4.1 blend. We assume VMOVSSrr/VMOVSDrr are always commutable
2878    // since AVX implies SSE4.1.
2879 if (Subtarget.hasSSE41())
2880 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2881 return false;
2882 case X86::SHUFPDrri:
2883 // We can commute this to MOVSD.
2884 if (MI.getOperand(3).getImm() == 0x02)
2885 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2886 return false;
2887 case X86::MOVHLPSrr:
2888 case X86::UNPCKHPDrr:
2889 case X86::VMOVHLPSrr:
2890 case X86::VUNPCKHPDrr:
2891 case X86::VMOVHLPSZrr:
2892 case X86::VUNPCKHPDZ128rr:
2893 if (Subtarget.hasSSE2())
2894 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2895 return false;
2896 case X86::VPTERNLOGDZrri:
2897 case X86::VPTERNLOGDZrmi:
2898 case X86::VPTERNLOGDZ128rri:
2899 case X86::VPTERNLOGDZ128rmi:
2900 case X86::VPTERNLOGDZ256rri:
2901 case X86::VPTERNLOGDZ256rmi:
2902 case X86::VPTERNLOGQZrri:
2903 case X86::VPTERNLOGQZrmi:
2904 case X86::VPTERNLOGQZ128rri:
2905 case X86::VPTERNLOGQZ128rmi:
2906 case X86::VPTERNLOGQZ256rri:
2907 case X86::VPTERNLOGQZ256rmi:
2908 case X86::VPTERNLOGDZrrik:
2909 case X86::VPTERNLOGDZ128rrik:
2910 case X86::VPTERNLOGDZ256rrik:
2911 case X86::VPTERNLOGQZrrik:
2912 case X86::VPTERNLOGQZ128rrik:
2913 case X86::VPTERNLOGQZ256rrik:
2914 case X86::VPTERNLOGDZrrikz:
2915 case X86::VPTERNLOGDZrmikz:
2916 case X86::VPTERNLOGDZ128rrikz:
2917 case X86::VPTERNLOGDZ128rmikz:
2918 case X86::VPTERNLOGDZ256rrikz:
2919 case X86::VPTERNLOGDZ256rmikz:
2920 case X86::VPTERNLOGQZrrikz:
2921 case X86::VPTERNLOGQZrmikz:
2922 case X86::VPTERNLOGQZ128rrikz:
2923 case X86::VPTERNLOGQZ128rmikz:
2924 case X86::VPTERNLOGQZ256rrikz:
2925 case X86::VPTERNLOGQZ256rmikz:
2926 case X86::VPTERNLOGDZ128rmbi:
2927 case X86::VPTERNLOGDZ256rmbi:
2928 case X86::VPTERNLOGDZrmbi:
2929 case X86::VPTERNLOGQZ128rmbi:
2930 case X86::VPTERNLOGQZ256rmbi:
2931 case X86::VPTERNLOGQZrmbi:
2932 case X86::VPTERNLOGDZ128rmbikz:
2933 case X86::VPTERNLOGDZ256rmbikz:
2934 case X86::VPTERNLOGDZrmbikz:
2935 case X86::VPTERNLOGQZ128rmbikz:
2936 case X86::VPTERNLOGQZ256rmbikz:
2937 case X86::VPTERNLOGQZrmbikz:
2938 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2939 case X86::VPDPWSSDYrr:
2940 case X86::VPDPWSSDrr:
2941 case X86::VPDPWSSDSYrr:
2942 case X86::VPDPWSSDSrr:
2943 case X86::VPDPWUUDrr:
2944 case X86::VPDPWUUDYrr:
2945 case X86::VPDPWUUDSrr:
2946 case X86::VPDPWUUDSYrr:
2947 case X86::VPDPBSSDSrr:
2948 case X86::VPDPBSSDSYrr:
2949 case X86::VPDPBSSDrr:
2950 case X86::VPDPBSSDYrr:
2951 case X86::VPDPBUUDSrr:
2952 case X86::VPDPBUUDSYrr:
2953 case X86::VPDPBUUDrr:
2954 case X86::VPDPBUUDYrr:
2955 case X86::VPDPWSSDZ128r:
2956 case X86::VPDPWSSDZ128rk:
2957 case X86::VPDPWSSDZ128rkz:
2958 case X86::VPDPWSSDZ256r:
2959 case X86::VPDPWSSDZ256rk:
2960 case X86::VPDPWSSDZ256rkz:
2961 case X86::VPDPWSSDZr:
2962 case X86::VPDPWSSDZrk:
2963 case X86::VPDPWSSDZrkz:
2964 case X86::VPDPWSSDSZ128r:
2965 case X86::VPDPWSSDSZ128rk:
2966 case X86::VPDPWSSDSZ128rkz:
2967 case X86::VPDPWSSDSZ256r:
2968 case X86::VPDPWSSDSZ256rk:
2969 case X86::VPDPWSSDSZ256rkz:
2970 case X86::VPDPWSSDSZr:
2971 case X86::VPDPWSSDSZrk:
2972 case X86::VPDPWSSDSZrkz:
2973 case X86::VPMADD52HUQrr:
2974 case X86::VPMADD52HUQYrr:
2975 case X86::VPMADD52HUQZ128r:
2976 case X86::VPMADD52HUQZ128rk:
2977 case X86::VPMADD52HUQZ128rkz:
2978 case X86::VPMADD52HUQZ256r:
2979 case X86::VPMADD52HUQZ256rk:
2980 case X86::VPMADD52HUQZ256rkz:
2981 case X86::VPMADD52HUQZr:
2982 case X86::VPMADD52HUQZrk:
2983 case X86::VPMADD52HUQZrkz:
2984 case X86::VPMADD52LUQrr:
2985 case X86::VPMADD52LUQYrr:
2986 case X86::VPMADD52LUQZ128r:
2987 case X86::VPMADD52LUQZ128rk:
2988 case X86::VPMADD52LUQZ128rkz:
2989 case X86::VPMADD52LUQZ256r:
2990 case X86::VPMADD52LUQZ256rk:
2991 case X86::VPMADD52LUQZ256rkz:
2992 case X86::VPMADD52LUQZr:
2993 case X86::VPMADD52LUQZrk:
2994 case X86::VPMADD52LUQZrkz:
2995 case X86::VFMADDCPHZr:
2996 case X86::VFMADDCPHZrk:
2997 case X86::VFMADDCPHZrkz:
2998 case X86::VFMADDCPHZ128r:
2999 case X86::VFMADDCPHZ128rk:
3000 case X86::VFMADDCPHZ128rkz:
3001 case X86::VFMADDCPHZ256r:
3002 case X86::VFMADDCPHZ256rk:
3003 case X86::VFMADDCPHZ256rkz:
3004 case X86::VFMADDCSHZr:
3005 case X86::VFMADDCSHZrk:
3006 case X86::VFMADDCSHZrkz: {
3007 unsigned CommutableOpIdx1 = 2;
3008 unsigned CommutableOpIdx2 = 3;
3009 if (X86II::isKMasked(Desc.TSFlags)) {
3010 // Skip the mask register.
3011 ++CommutableOpIdx1;
3012 ++CommutableOpIdx2;
3013 }
3014 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3015 CommutableOpIdx2))
3016 return false;
3017 if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
3018      // The chosen operands are not both registers; give up.
3019 return false;
3020 return true;
3021 }
3022
3023 default:
3024 const X86InstrFMA3Group *FMA3Group =
3025 getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
3026 if (FMA3Group)
3027 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
3028 FMA3Group->isIntrinsic());
3029
3030    // Handle masked instructions since we need to skip over the mask input
3031 // and the preserved input.
3032 if (X86II::isKMasked(Desc.TSFlags)) {
3033 // First assume that the first input is the mask operand and skip past it.
3034 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
3035 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
3036 // Check if the first input is tied. If there isn't one then we only
3037 // need to skip the mask operand which we did above.
3038 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
3039 MCOI::TIED_TO) != -1)) {
3040 // If this is zero masking instruction with a tied operand, we need to
3041 // move the first index back to the first input since this must
3042 // be a 3 input instruction and we want the first two non-mask inputs.
3043 // Otherwise this is a 2 input instruction with a preserved input and
3044 // mask, so we need to move the indices to skip one more input.
3045 if (X86II::isKMergeMasked(Desc.TSFlags)) {
3046 ++CommutableOpIdx1;
3047 ++CommutableOpIdx2;
3048 } else {
3049 --CommutableOpIdx1;
3050 }
3051 }
3052
3053 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3054 CommutableOpIdx2))
3055 return false;
3056
3057 if (!MI.getOperand(SrcOpIdx1).isReg() ||
3058 !MI.getOperand(SrcOpIdx2).isReg())
3059        // The chosen operands are not both registers; give up.
3060 return false;
3061 return true;
3062 }
3063
3064 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
3065 }
3066 return false;
3067}
3068
3069static bool isConvertibleLEA(MachineInstr *MI) {
3070  unsigned Opcode = MI->getOpcode();
3071 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
3072 Opcode != X86::LEA64_32r)
3073 return false;
3074
3075 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
3076 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
3077 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
3078
3079 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
3080 Scale.getImm() > 1)
3081 return false;
3082
3083 return true;
3084}
3085
3086bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
3087  // Currently we're interested in the following sequence only:
3088  // r3 = lea r1, r2
3089  // r5 = add r3, r4
3090  // Both r3 and r4 are killed in the add; we hope the add instruction has the
3091  // operand order
3092  // r5 = add r4, r3
3093  // so that later in X86FixupLEAs the lea instruction can be rewritten as add.
3094 unsigned Opcode = MI.getOpcode();
3095 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
3096 return false;
3097
3098 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3099 Register Reg1 = MI.getOperand(1).getReg();
3100 Register Reg2 = MI.getOperand(2).getReg();
3101
3102 // Check if Reg1 comes from LEA in the same MBB.
3103 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
3104 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3105 Commute = true;
3106 return true;
3107 }
3108 }
3109
3110 // Check if Reg2 comes from LEA in the same MBB.
3111 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
3112 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3113 Commute = false;
3114 return true;
3115 }
3116 }
3117
3118 return false;
3119}
3120
3121int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
3122  unsigned Opcode = MCID.getOpcode();
3123 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode) ||
3124 X86::isCFCMOVCC(Opcode)))
3125 return -1;
3126  // Assume that the condition code is always the last use operand.
3127 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
3128 return NumUses - 1;
3129}
3130
3131X86::CondCode X86::getCondFromMI(const MachineInstr &MI) {
3132  const MCInstrDesc &MCID = MI.getDesc();
3133 int CondNo = getCondSrcNoFromDesc(MCID);
3134 if (CondNo < 0)
3135 return X86::COND_INVALID;
3136 CondNo += MCID.getNumDefs();
3137 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
3138}
3139
3140X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
3141  return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3142                                    : X86::COND_INVALID;
3143}
3144
3145X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
3146  return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3147                                      : X86::COND_INVALID;
3148}
3149
3150X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
3151  return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3152                                       : X86::COND_INVALID;
3153}
3154
3155X86::CondCode X86::getCondFromCFCMov(const MachineInstr &MI) {
3156  return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3157                                         : X86::COND_INVALID;
3158}
3159
3160/// Return the inverse of the specified condition,
3161/// e.g. turning COND_E to COND_NE.
3162X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
3163  switch (CC) {
3164 default:
3165 llvm_unreachable("Illegal condition code!");
3166 case X86::COND_E:
3167 return X86::COND_NE;
3168 case X86::COND_NE:
3169 return X86::COND_E;
3170 case X86::COND_L:
3171 return X86::COND_GE;
3172 case X86::COND_LE:
3173 return X86::COND_G;
3174 case X86::COND_G:
3175 return X86::COND_LE;
3176 case X86::COND_GE:
3177 return X86::COND_L;
3178 case X86::COND_B:
3179 return X86::COND_AE;
3180 case X86::COND_BE:
3181 return X86::COND_A;
3182 case X86::COND_A:
3183 return X86::COND_BE;
3184 case X86::COND_AE:
3185 return X86::COND_B;
3186 case X86::COND_S:
3187 return X86::COND_NS;
3188 case X86::COND_NS:
3189 return X86::COND_S;
3190 case X86::COND_P:
3191 return X86::COND_NP;
3192 case X86::COND_NP:
3193 return X86::COND_P;
3194 case X86::COND_O:
3195 return X86::COND_NO;
3196 case X86::COND_NO:
3197 return X86::COND_O;
3198 case X86::COND_NE_OR_P:
3199 return X86::COND_E_AND_NP;
3200 case X86::COND_E_AND_NP:
3201 return X86::COND_NE_OR_P;
3202 }
3203}
3204
3205/// Assuming the flags are set by MI(a,b), return the condition code if we
3206/// modify the instructions such that flags are set by MI(b,a).
3207X86::CondCode X86::getSwappedCondition(X86::CondCode CC) {
3208  switch (CC) {
3209 default:
3210 return X86::COND_INVALID;
3211 case X86::COND_E:
3212 return X86::COND_E;
3213 case X86::COND_NE:
3214 return X86::COND_NE;
3215 case X86::COND_L:
3216 return X86::COND_G;
3217 case X86::COND_LE:
3218 return X86::COND_GE;
3219 case X86::COND_G:
3220 return X86::COND_L;
3221 case X86::COND_GE:
3222 return X86::COND_LE;
3223 case X86::COND_B:
3224 return X86::COND_A;
3225 case X86::COND_BE:
3226 return X86::COND_AE;
3227 case X86::COND_A:
3228 return X86::COND_B;
3229 case X86::COND_AE:
3230 return X86::COND_BE;
3231 }
3232}
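// Illustrative example: if the flags were produced by MI(a, b) (e.g. a
// subtract computing a - b) and a consumer tests COND_L ("a < b"), then after
// the operands are swapped to MI(b, a) the same consumer must test
// getSwappedCondition(COND_L) == COND_G, since "a < b" is "b > a".  Codes such
// as COND_S or COND_P have no swapped equivalent and map to COND_INVALID.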
3233
3234std::pair<X86::CondCode, bool>
3235X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
3236  X86::CondCode CC = X86::COND_INVALID;
3237  bool NeedSwap = false;
3238 switch (Predicate) {
3239 default:
3240 break;
3241 // Floating-point Predicates
3242 case CmpInst::FCMP_UEQ:
3243 CC = X86::COND_E;
3244 break;
3245 case CmpInst::FCMP_OLT:
3246 NeedSwap = true;
3247 [[fallthrough]];
3248 case CmpInst::FCMP_OGT:
3249 CC = X86::COND_A;
3250 break;
3251 case CmpInst::FCMP_OLE:
3252 NeedSwap = true;
3253 [[fallthrough]];
3254 case CmpInst::FCMP_OGE:
3255 CC = X86::COND_AE;
3256 break;
3257 case CmpInst::FCMP_UGT:
3258 NeedSwap = true;
3259 [[fallthrough]];
3260 case CmpInst::FCMP_ULT:
3261 CC = X86::COND_B;
3262 break;
3263 case CmpInst::FCMP_UGE:
3264 NeedSwap = true;
3265 [[fallthrough]];
3266 case CmpInst::FCMP_ULE:
3267 CC = X86::COND_BE;
3268 break;
3269 case CmpInst::FCMP_ONE:
3270 CC = X86::COND_NE;
3271 break;
3272 case CmpInst::FCMP_UNO:
3273 CC = X86::COND_P;
3274 break;
3275 case CmpInst::FCMP_ORD:
3276 CC = X86::COND_NP;
3277 break;
3278 case CmpInst::FCMP_OEQ:
3279 [[fallthrough]];
3280 case CmpInst::FCMP_UNE:
3281    CC = X86::COND_INVALID;
3282    break;
3283
3284 // Integer Predicates
3285 case CmpInst::ICMP_EQ:
3286 CC = X86::COND_E;
3287 break;
3288 case CmpInst::ICMP_NE:
3289 CC = X86::COND_NE;
3290 break;
3291 case CmpInst::ICMP_UGT:
3292 CC = X86::COND_A;
3293 break;
3294 case CmpInst::ICMP_UGE:
3295 CC = X86::COND_AE;
3296 break;
3297 case CmpInst::ICMP_ULT:
3298 CC = X86::COND_B;
3299 break;
3300 case CmpInst::ICMP_ULE:
3301 CC = X86::COND_BE;
3302 break;
3303 case CmpInst::ICMP_SGT:
3304 CC = X86::COND_G;
3305 break;
3306 case CmpInst::ICMP_SGE:
3307 CC = X86::COND_GE;
3308 break;
3309 case CmpInst::ICMP_SLT:
3310 CC = X86::COND_L;
3311 break;
3312 case CmpInst::ICMP_SLE:
3313 CC = X86::COND_LE;
3314 break;
3315 }
3316
3317 return std::make_pair(CC, NeedSwap);
3318}
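// Illustrative example: callers are expected to honor the NeedSwap flag by
// exchanging the comparison operands before emitting the compare.  For
// instance getX86ConditionCode(CmpInst::FCMP_OLT) returns {X86::COND_A, true}:
// "a < b" is evaluated as "b > a" on the swapped operands, using the
// unsigned-above condition that ordered FP compares produce on UCOMIS*/COMIS*
// flags.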
3319
3320/// Return a cmov opcode for the given register size in bytes, and operand type.
3321unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
3322 bool HasNDD) {
3323 switch (RegBytes) {
3324 default:
3325 llvm_unreachable("Illegal register size!");
3326#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
3327 case 2:
3328 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
3329 : GET_ND_IF_ENABLED(X86::CMOV16rr);
3330 case 4:
3331 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
3332 : GET_ND_IF_ENABLED(X86::CMOV32rr);
3333 case 8:
3334 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
3335 : GET_ND_IF_ENABLED(X86::CMOV64rr);
3336 }
3337}
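// Illustrative example: X86::getCMovOpcode(/*RegBytes=*/4,
// /*HasMemoryOperand=*/false, /*HasNDD=*/false) yields X86::CMOV32rr, while
// passing HasNDD selects the APX new-data-destination form.  There is no
// 1-byte CMOV, which is why RegBytes == 1 falls into the unreachable default.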
3338
3339/// Get the VPCMP immediate for the given condition.
3340unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
3341  switch (CC) {
3342 default:
3343 llvm_unreachable("Unexpected SETCC condition");
3344 case ISD::SETNE:
3345 return 4;
3346 case ISD::SETEQ:
3347 return 0;
3348 case ISD::SETULT:
3349 case ISD::SETLT:
3350 return 1;
3351 case ISD::SETUGT:
3352 case ISD::SETGT:
3353 return 6;
3354 case ISD::SETUGE:
3355 case ISD::SETGE:
3356 return 5;
3357 case ISD::SETULE:
3358 case ISD::SETLE:
3359 return 2;
3360 }
3361}
3362
3363/// Get the VPCMP immediate if the operands are swapped.
3364unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
3365 switch (Imm) {
3366 default:
3367 llvm_unreachable("Unreachable!");
3368 case 0x01:
3369 Imm = 0x06;
3370 break; // LT -> NLE
3371 case 0x02:
3372 Imm = 0x05;
3373 break; // LE -> NLT
3374 case 0x05:
3375 Imm = 0x02;
3376 break; // NLT -> LE
3377 case 0x06:
3378 Imm = 0x01;
3379 break; // NLE -> LT
3380 case 0x00: // EQ
3381 case 0x03: // FALSE
3382 case 0x04: // NE
3383 case 0x07: // TRUE
3384 break;
3385 }
3386
3387 return Imm;
3388}
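// Illustrative example: the VPCMP immediates encode EQ, LT, LE, FALSE, NE,
// NLT, NLE, TRUE as 0..7, so swapping the compare operands only exchanges the
// strict predicates with their negated non-strict counterparts:
//
//   X86::getSwappedVPCMPImm(0x01) == 0x06   // x <  y  <=>  y >  x
//   X86::getSwappedVPCMPImm(0x05) == 0x02   // x >= y  <=>  y <= x
//   X86::getSwappedVPCMPImm(0x00) == 0x00   // equality is symmetric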
3389
3390/// Get the VPCOM immediate if the operands are swapped.
3391unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
3392 switch (Imm) {
3393 default:
3394 llvm_unreachable("Unreachable!");
3395 case 0x00:
3396 Imm = 0x02;
3397 break; // LT -> GT
3398 case 0x01:
3399 Imm = 0x03;
3400 break; // LE -> GE
3401 case 0x02:
3402 Imm = 0x00;
3403 break; // GT -> LT
3404 case 0x03:
3405 Imm = 0x01;
3406 break; // GE -> LE
3407 case 0x04: // EQ
3408 case 0x05: // NE
3409 case 0x06: // FALSE
3410 case 0x07: // TRUE
3411 break;
3412 }
3413
3414 return Imm;
3415}
3416
3417/// Get the VCMP immediate if the operands are swapped.
3418unsigned X86::getSwappedVCMPImm(unsigned Imm) {
3419  // Only need the lower 2 bits to distinguish.
3420 switch (Imm & 0x3) {
3421 default:
3422 llvm_unreachable("Unreachable!");
3423 case 0x00:
3424 case 0x03:
3425 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
3426 break;
3427 case 0x01:
3428 case 0x02:
3429 // Need to toggle bits 3:0. Bit 4 stays the same.
3430 Imm ^= 0xf;
3431 break;
3432 }
3433
3434 return Imm;
3435}
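// Illustrative example: in the 5-bit VCMP encoding the LT-like (low bits 01)
// and LE-like (low bits 10) predicates become their GT/GE counterparts when
// the operands swap, which is exactly a toggle of bits 3:0:
//
//   X86::getSwappedVCMPImm(0x01) == 0x0E   // LT_OS -> GT_OS
//   X86::getSwappedVCMPImm(0x12) == 0x1D   // LE_OQ -> GE_OQ
//   X86::getSwappedVCMPImm(0x00) == 0x00   // EQ_OQ is symmetric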
3436
3437static unsigned getRegisterWidth(const MCOperandInfo &Info) {
3438  if (Info.RegClass == X86::VR128RegClassID ||
3439 Info.RegClass == X86::VR128XRegClassID)
3440 return 128;
3441 if (Info.RegClass == X86::VR256RegClassID ||
3442 Info.RegClass == X86::VR256XRegClassID)
3443 return 256;
3444 if (Info.RegClass == X86::VR512RegClassID)
3445 return 512;
3446 llvm_unreachable("Unknown register class!");
3447}
3448
3449/// Return true if Reg is an X87 register.
3450static bool isX87Reg(unsigned Reg) {
3451 return (Reg == X86::FPCW || Reg == X86::FPSW ||
3452 (Reg >= X86::ST0 && Reg <= X86::ST7));
3453}
3454
3455/// Check whether the instruction is an X87 instruction.
3456bool X86::isX87Instruction(MachineInstr &MI) {
3457  // A call's operands may include X87 registers, so special-case calls here;
3458  // otherwise calls would be incorrectly flagged as X87 instructions as a
3459  // result.
3460 if (MI.isCall())
3461 return false;
3462 for (const MachineOperand &MO : MI.operands()) {
3463 if (!MO.isReg())
3464 continue;
3465 if (isX87Reg(MO.getReg()))
3466 return true;
3467 }
3468 return false;
3469}
3470
3471int X86::getFirstAddrOperandIdx(const MachineInstr &MI) {
3472  auto IsMemOp = [](const MCOperandInfo &OpInfo) {
3473 return OpInfo.OperandType == MCOI::OPERAND_MEMORY;
3474 };
3475
3476 const MCInstrDesc &Desc = MI.getDesc();
3477
3478 // Directly invoke the MC-layer routine for real (i.e., non-pseudo)
3479 // instructions (fast case).
3480 if (!X86II::isPseudo(Desc.TSFlags)) {
3481 int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
3482 if (MemRefIdx >= 0)
3483 return MemRefIdx + X86II::getOperandBias(Desc);
3484#ifdef EXPENSIVE_CHECKS
3485 assert(none_of(Desc.operands(), IsMemOp) &&
3486 "Got false negative from X86II::getMemoryOperandNo()!");
3487#endif
3488 return -1;
3489 }
3490
3491 // Otherwise, handle pseudo instructions by examining the type of their
3492 // operands (slow case). An instruction cannot have a memory reference if it
3493 // has fewer than AddrNumOperands (= 5) explicit operands.
3494 unsigned NumOps = Desc.getNumOperands();
3495 if (NumOps < X86::AddrNumOperands) {
3496#ifdef EXPENSIVE_CHECKS
3497 assert(none_of(Desc.operands(), IsMemOp) &&
3498 "Expected no operands to have OPERAND_MEMORY type!");
3499#endif
3500 return -1;
3501 }
3502
3503 // The first operand with type OPERAND_MEMORY indicates the start of a memory
3504  // reference. We expect the following AddrNumOperands-1 operands to also have
3505 // OPERAND_MEMORY type.
3506 for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) {
3507 if (IsMemOp(Desc.operands()[I])) {
3508#ifdef EXPENSIVE_CHECKS
3509 assert(std::all_of(Desc.operands().begin() + I,
3510 Desc.operands().begin() + I + X86::AddrNumOperands,
3511 IsMemOp) &&
3512 "Expected all five operands in the memory reference to have "
3513 "OPERAND_MEMORY type!");
3514#endif
3515 return I;
3516 }
3517 }
3518
3519 return -1;
3520}
3521
3522const Constant *X86::getConstantFromPool(const MachineInstr &MI,
3523                                         unsigned OpNo) {
3524 assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) &&
3525 "Unexpected number of operands!");
3526
3527 const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg);
3528 if (!Index.isReg() || Index.getReg() != X86::NoRegister)
3529 return nullptr;
3530
3531 const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp);
3532 if (!Disp.isCPI() || Disp.getOffset() != 0)
3533 return nullptr;
3534
3535  ArrayRef<MachineConstantPoolEntry> Constants =
3536      MI.getParent()->getParent()->getConstantPool()->getConstants();
3537 const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()];
3538
3539 // Bail if this is a machine constant pool entry, we won't be able to dig out
3540 // anything useful.
3541 if (ConstantEntry.isMachineConstantPoolEntry())
3542 return nullptr;
3543
3544 return ConstantEntry.Val.ConstVal;
3545}
3546
3547bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
3548  switch (MI.getOpcode()) {
3549 case X86::TCRETURNdi:
3550 case X86::TCRETURNri:
3551 case X86::TCRETURNmi:
3552 case X86::TCRETURNdi64:
3553 case X86::TCRETURNri64:
3554 case X86::TCRETURNmi64:
3555 return true;
3556 default:
3557 return false;
3558 }
3559}
3560
3561bool X86InstrInfo::canMakeTailCallConditional(
3562    SmallVectorImpl<MachineOperand> &BranchCond,
3563    const MachineInstr &TailCall) const {
3564
3565 const MachineFunction *MF = TailCall.getMF();
3566
3567 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3568    // The kernel patches thunk calls at runtime; these should never be conditional.
3569 const MachineOperand &Target = TailCall.getOperand(0);
3570 if (Target.isSymbol()) {
3571 StringRef Symbol(Target.getSymbolName());
3572      // This is currently only relevant to the r11/kernel indirect thunk.
3573 if (Symbol.equals("__x86_indirect_thunk_r11"))
3574 return false;
3575 }
3576 }
3577
3578 if (TailCall.getOpcode() != X86::TCRETURNdi &&
3579 TailCall.getOpcode() != X86::TCRETURNdi64) {
3580 // Only direct calls can be done with a conditional branch.
3581 return false;
3582 }
3583
3584 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3585 // Conditional tail calls confuse the Win64 unwinder.
3586 return false;
3587 }
3588
3589 assert(BranchCond.size() == 1);
3590 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3591 // Can't make a conditional tail call with this condition.
3592 return false;
3593 }
3594
3595  const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
3596  if (X86FI->getTCReturnAddrDelta() != 0 ||
3597 TailCall.getOperand(1).getImm() != 0) {
3598 // A conditional tail call cannot do any stack adjustment.
3599 return false;
3600 }
3601
3602 return true;
3603}
3604
3605void X86InstrInfo::replaceBranchWithTailCall(
3606    MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
3607    const MachineInstr &TailCall) const {
3608 assert(canMakeTailCallConditional(BranchCond, TailCall));
3609
3611 while (I != MBB.begin()) {
3612 --I;
3613 if (I->isDebugInstr())
3614 continue;
3615 if (!I->isBranch())
3616 assert(0 && "Can't find the branch to replace!");
3617
3619 assert(BranchCond.size() == 1);
3620 if (CC != BranchCond[0].getImm())
3621 continue;
3622
3623 break;
3624 }
3625
3626 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3627 : X86::TCRETURNdi64cc;
3628
3629 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3630 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3631 MIB.addImm(0); // Stack offset (not used).
3632 MIB->addOperand(BranchCond[0]); // Condition.
3633 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3634
3635 // Add implicit uses and defs of all live regs potentially clobbered by the
3636 // call. This way they still appear live across the call.
3637 LivePhysRegs LiveRegs(getRegisterInfo());
3638 LiveRegs.addLiveOuts(MBB);
3639  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
3640  LiveRegs.stepForward(*MIB, Clobbers);
3641 for (const auto &C : Clobbers) {
3642 MIB.addReg(C.first, RegState::Implicit);
3643    MIB.addReg(C.first, RegState::Implicit | RegState::Define);
3644  }
3645
3646 I->eraseFromParent();
3647}
3648
3649// Given an MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3650// not be a fallthrough MBB now due to layout changes). Return nullptr if the
3651// fallthrough MBB cannot be identified.
3652static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
3653                                            MachineBasicBlock *TBB) {
3654 // Look for non-EHPad successors other than TBB. If we find exactly one, it
3655 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3656 // and fallthrough MBB. If we find more than one, we cannot identify the
3657 // fallthrough MBB and should return nullptr.
3658 MachineBasicBlock *FallthroughBB = nullptr;
3659 for (MachineBasicBlock *Succ : MBB->successors()) {
3660 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3661 continue;
3662 // Return a nullptr if we found more than one fallthrough successor.
3663 if (FallthroughBB && FallthroughBB != TBB)
3664 return nullptr;
3665 FallthroughBB = Succ;
3666 }
3667 return FallthroughBB;
3668}
3669
3670bool X86InstrInfo::analyzeBranchImpl(
3671    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
3672    SmallVectorImpl<MachineOperand> &Cond,
3673    SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3674
3675 // Start from the bottom of the block and work up, examining the
3676 // terminator instructions.
3677  MachineBasicBlock::iterator I = MBB.end();
3678  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3679 while (I != MBB.begin()) {
3680 --I;
3681 if (I->isDebugInstr())
3682 continue;
3683
3684 // Working from the bottom, when we see a non-terminator instruction, we're
3685 // done.
3686 if (!isUnpredicatedTerminator(*I))
3687 break;
3688
3689 // A terminator that isn't a branch can't easily be handled by this
3690 // analysis.
3691 if (!I->isBranch())
3692 return true;
3693
3694 // Handle unconditional branches.
3695 if (I->getOpcode() == X86::JMP_1) {
3696 UnCondBrIter = I;
3697
3698 if (!AllowModify) {
3699 TBB = I->getOperand(0).getMBB();
3700 continue;
3701 }
3702
3703 // If the block has any instructions after a JMP, delete them.
3704 MBB.erase(std::next(I), MBB.end());
3705
3706 Cond.clear();
3707 FBB = nullptr;
3708
3709 // Delete the JMP if it's equivalent to a fall-through.
3710 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3711 TBB = nullptr;
3712 I->eraseFromParent();
3713 I = MBB.end();
3714 UnCondBrIter = MBB.end();
3715 continue;
3716 }
3717
3718 // TBB is used to indicate the unconditional destination.
3719 TBB = I->getOperand(0).getMBB();
3720 continue;
3721 }
3722
3723 // Handle conditional branches.
3724 X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3725 if (BranchCode == X86::COND_INVALID)
3726 return true; // Can't handle indirect branch.
3727
3728 // In practice we should never have an undef EFLAGS operand; if we do,
3729 // abort here as we are not prepared to preserve the flag.
3730 if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
3731 return true;
3732
3733 // Working from the bottom, handle the first conditional branch.
3734 if (Cond.empty()) {
3735 FBB = TBB;
3736 TBB = I->getOperand(0).getMBB();
3737 Cond.push_back(MachineOperand::CreateImm(BranchCode));
3738 CondBranches.push_back(&*I);
3739 continue;
3740 }
3741
3742 // Handle subsequent conditional branches. Only handle the case where all
3743 // conditional branches branch to the same destination and their condition
3744 // opcodes fit one of the special multi-branch idioms.
3745 assert(Cond.size() == 1);
3746 assert(TBB);
3747
3748 // If the conditions are the same, we can leave them alone.
3749 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3750 auto NewTBB = I->getOperand(0).getMBB();
3751 if (OldBranchCode == BranchCode && TBB == NewTBB)
3752 continue;
3753
3754 // If they differ, see if they fit one of the known patterns. Theoretically,
3755 // we could handle more patterns here, but we shouldn't expect to see them
3756 // if instruction selection has done a reasonable job.
3757 if (TBB == NewTBB &&
3758 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3759 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3760 BranchCode = X86::COND_NE_OR_P;
3761 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3762 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3763 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3764 return true;
3765
3766 // X86::COND_E_AND_NP usually has two different branch destinations.
3767 //
3768 // JP B1
3769 // JE B2
3770 // JMP B1
3771 // B1:
3772 // B2:
3773 //
3774 // Here this condition branches to B2 only if NP && E. It has another
3775 // equivalent form:
3776 //
3777 // JNE B1
3778 // JNP B2
3779 // JMP B1
3780 // B1:
3781 // B2:
3782 //
3783 // Similarly, it branches to B2 only if E && NP. That is why this condition
3784 // is named COND_E_AND_NP.
3785 BranchCode = X86::COND_E_AND_NP;
3786 } else
3787 return true;
3788
3789 // Update the MachineOperand.
3790 Cond[0].setImm(BranchCode);
3791 CondBranches.push_back(&*I);
3792 }
3793
3794 return false;
3795}
3796
3799 MachineBasicBlock *&FBB,
3801 bool AllowModify) const {
3802 SmallVector<MachineInstr *, 4> CondBranches;
3803 return analyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3804}
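// A sketch of the analyzeBranch contract as implemented above, assuming
// hypothetical block names; the MIR is simplified. For a block ending in
//   JCC_1 %bb.true, 4                ; 4 == X86::COND_E
//   JMP_1 %bb.false
// the expected outputs are TBB = %bb.true, FBB = %bb.false and
// Cond = { COND_E }, with the function returning false (analysis succeeded).
// A JMP_1 that is equivalent to a fall-through is deleted when AllowModify is
// true, and any terminator whose condition cannot be represented makes the
// analysis return true (failure).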
3805
3807 const MCInstrDesc &Desc = MI.getDesc();
3808 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3809 assert(MemRefBegin >= 0 && "instr should have memory operand");
3810 MemRefBegin += X86II::getOperandBias(Desc);
3811
3812 const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp);
3813 if (!MO.isJTI())
3814 return -1;
3815
3816 return MO.getIndex();
3817}
3818
3820 Register Reg) {
3821 if (!Reg.isVirtual())
3822 return -1;
3823 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
3824 if (MI == nullptr)
3825 return -1;
3826 unsigned Opcode = MI->getOpcode();
3827 if (Opcode != X86::LEA64r && Opcode != X86::LEA32r)
3828 return -1;
3830}
3831
3833 unsigned Opcode = MI.getOpcode();
3834 // Switch-jump pattern for non-PIC code looks like:
3835 // JMP64m $noreg, 8, %X, %jump-table.X, $noreg
3836 if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) {
3838 }
3839 // The pattern for PIC code looks like:
3840 // %0 = LEA64r $rip, 1, $noreg, %jump-table.X
3841 // %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg
3842 // %2 = ADD64rr %1, %0
3843 // JMP64r %2
3844 if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) {
3845 Register Reg = MI.getOperand(0).getReg();
3846 if (!Reg.isVirtual())
3847 return -1;
3848 const MachineFunction &MF = *MI.getParent()->getParent();
3849 const MachineRegisterInfo &MRI = MF.getRegInfo();
3850 MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
3851 if (Add == nullptr)
3852 return -1;
3853 if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr)
3854 return -1;
3855 int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg());
3856 if (JTI1 >= 0)
3857 return JTI1;
3858 int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg());
3859 if (JTI2 >= 0)
3860 return JTI2;
3861 }
3862 return -1;
3863}
3864
3866 MachineBranchPredicate &MBP,
3867 bool AllowModify) const {
3868 using namespace std::placeholders;
3869
3871 SmallVector<MachineInstr *, 4> CondBranches;
3872 if (analyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
3873 AllowModify))
3874 return true;
3875
3876 if (Cond.size() != 1)
3877 return true;
3878
3879 assert(MBP.TrueDest && "expected!");
3880
3881 if (!MBP.FalseDest)
3882 MBP.FalseDest = MBB.getNextNode();
3883
3885
3886 MachineInstr *ConditionDef = nullptr;
3887 bool SingleUseCondition = true;
3888
3890 if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
3891 ConditionDef = &MI;
3892 break;
3893 }
3894
3895 if (MI.readsRegister(X86::EFLAGS, TRI))
3896 SingleUseCondition = false;
3897 }
3898
3899 if (!ConditionDef)
3900 return true;
3901
3902 if (SingleUseCondition) {
3903 for (auto *Succ : MBB.successors())
3904 if (Succ->isLiveIn(X86::EFLAGS))
3905 SingleUseCondition = false;
3906 }
3907
3908 MBP.ConditionDef = ConditionDef;
3909 MBP.SingleUseCondition = SingleUseCondition;
3910
3911 // Currently we only recognize the simple pattern:
3912 //
3913 // test %reg, %reg
3914 // je %label
3915 //
3916 const unsigned TestOpcode =
3917 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
3918
3919 if (ConditionDef->getOpcode() == TestOpcode &&
3920 ConditionDef->getNumOperands() == 3 &&
3921 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
3922 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
3923 MBP.LHS = ConditionDef->getOperand(0);
3924 MBP.RHS = MachineOperand::CreateImm(0);
3925 MBP.Predicate = Cond[0].getImm() == X86::COND_NE
3926 ? MachineBranchPredicate::PRED_NE
3927 : MachineBranchPredicate::PRED_EQ;
3928 return false;
3929 }
3930
3931 return true;
3932}
3933
3935 int *BytesRemoved) const {
3936 assert(!BytesRemoved && "code size not handled");
3937
3939 unsigned Count = 0;
3940
3941 while (I != MBB.begin()) {
3942 --I;
3943 if (I->isDebugInstr())
3944 continue;
3945 if (I->getOpcode() != X86::JMP_1 &&
3947 break;
3948 // Remove the branch.
3949 I->eraseFromParent();
3950 I = MBB.end();
3951 ++Count;
3952 }
3953
3954 return Count;
3955}
3956
3959 MachineBasicBlock *FBB,
3961 const DebugLoc &DL, int *BytesAdded) const {
3962 // Shouldn't be a fall through.
3963 assert(TBB && "insertBranch must not be told to insert a fallthrough");
3964 assert((Cond.size() == 1 || Cond.size() == 0) &&
3965 "X86 branch conditions have one component!");
3966 assert(!BytesAdded && "code size not handled");
3967
3968 if (Cond.empty()) {
3969 // Unconditional branch?
3970 assert(!FBB && "Unconditional branch with multiple successors!");
3971 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
3972 return 1;
3973 }
3974
3975 // If FBB is null, it is implied to be a fall-through block.
3976 bool FallThru = FBB == nullptr;
3977
3978 // Conditional branch.
3979 unsigned Count = 0;
3980 X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
3981 switch (CC) {
3982 case X86::COND_NE_OR_P:
3983 // Synthesize NE_OR_P with two branches.
3984 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
3985 ++Count;
3986 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
3987 ++Count;
3988 break;
3989 case X86::COND_E_AND_NP:
3990 // Use the next block of MBB as FBB if it is null.
3991 if (FBB == nullptr) {
3992 FBB = getFallThroughMBB(&MBB, TBB);
3993 assert(FBB && "MBB cannot be the last block in function when the false "
3994 "body is a fall-through.");
3995 }
3996 // Synthesize COND_E_AND_NP with two branches.
3997 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
3998 ++Count;
3999 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
4000 ++Count;
4001 break;
4002 default: {
4003 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
4004 ++Count;
4005 }
4006 }
4007 if (!FallThru) {
4008 // Two-way conditional branch. Insert the second branch.
4009 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
4010 ++Count;
4011 }
4012 return Count;
4013}
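// An illustrative sketch of the composite-condition path above, assuming
// hypothetical block names. With Cond = { COND_NE_OR_P }, TBB = %bb.target
// and FBB = %bb.fallthru, the expected terminators are
//   JCC_1 %bb.target, 5              ; X86::COND_NE
//   JCC_1 %bb.target, 10             ; X86::COND_P
//   JMP_1 %bb.fallthru
// and insertBranch returns 3, the number of instructions it added.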
4014
4017 Register DstReg, Register TrueReg,
4018 Register FalseReg, int &CondCycles,
4019 int &TrueCycles, int &FalseCycles) const {
4020 // Not all subtargets have cmov instructions.
4021 if (!Subtarget.canUseCMOV())
4022 return false;
4023 if (Cond.size() != 1)
4024 return false;
4025 // We cannot do the composite conditions, at least not in SSA form.
4026 if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
4027 return false;
4028
4029 // Check register classes.
4031 const TargetRegisterClass *RC =
4032 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
4033 if (!RC)
4034 return false;
4035
4036 // We have cmov instructions for 16-, 32-, and 64-bit general-purpose registers.
4037 if (X86::GR16RegClass.hasSubClassEq(RC) ||
4038 X86::GR32RegClass.hasSubClassEq(RC) ||
4039 X86::GR64RegClass.hasSubClassEq(RC)) {
4040 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
4041 // Bridge. Probably Ivy Bridge as well.
4042 CondCycles = 2;
4043 TrueCycles = 2;
4044 FalseCycles = 2;
4045 return true;
4046 }
4047
4048 // Can't do vectors.
4049 return false;
4050}
4051
4054 const DebugLoc &DL, Register DstReg,
4056 Register FalseReg) const {
4058 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
4059 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
4060 assert(Cond.size() == 1 && "Invalid Cond array");
4061 unsigned Opc =
4062 X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
4063 false /*HasMemoryOperand*/, Subtarget.hasNDD());
4064 BuildMI(MBB, I, DL, get(Opc), DstReg)
4065 .addReg(FalseReg)
4066 .addReg(TrueReg)
4067 .addImm(Cond[0].getImm());
4068}
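// A minimal sketch of the result, assuming a 32-bit select on a subtarget
// without NDD and Cond = { COND_NE }; register names are hypothetical:
//   %dst:gr32 = CMOV32rr %false:gr32, %true:gr32, 5   ; 5 == X86::COND_NE
// The false value is the first (tied) source and the true value the second,
// so %dst receives %true when the condition holds and %false otherwise.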
4069
4070/// Test if the given register is a physical h register.
4071static bool isHReg(unsigned Reg) {
4072 return X86::GR8_ABCD_HRegClass.contains(Reg);
4073}
4074
4075// Try to copy between VR128/VR64 and GR64 registers.
4076static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
4077 const X86Subtarget &Subtarget) {
4078 bool HasAVX = Subtarget.hasAVX();
4079 bool HasAVX512 = Subtarget.hasAVX512();
4080 bool HasEGPR = Subtarget.hasEGPR();
4081
4082 // SrcReg(MaskReg) -> DestReg(GR64)
4083 // SrcReg(MaskReg) -> DestReg(GR32)
4084
4085 // All KMASK RegClasses hold the same k registers; they can be tested
4086 // against any one of them.
4087 if (X86::VK16RegClass.contains(SrcReg)) {
4088 if (X86::GR64RegClass.contains(DestReg)) {
4089 assert(Subtarget.hasBWI());
4090 return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk;
4091 }
4092 if (X86::GR32RegClass.contains(DestReg))
4093 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk)
4094 : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk);
4095 }
4096
4097 // SrcReg(GR64) -> DestReg(MaskReg)
4098 // SrcReg(GR32) -> DestReg(MaskReg)
4099
4100 // All KMASK RegClasses hold the same k registers; they can be tested
4101 // against any one of them.
4102 if (X86::VK16RegClass.contains(DestReg)) {
4103 if (X86::GR64RegClass.contains(SrcReg)) {
4104 assert(Subtarget.hasBWI());
4105 return HasEGPR ? X86::KMOVQkr_EVEX : X86::KMOVQkr;
4106 }
4107 if (X86::GR32RegClass.contains(SrcReg))
4108 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4109 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4110 }
4111
4112 // SrcReg(VR128) -> DestReg(GR64)
4113 // SrcReg(VR64) -> DestReg(GR64)
4114 // SrcReg(GR64) -> DestReg(VR128)
4115 // SrcReg(GR64) -> DestReg(VR64)
4116
4117 if (X86::GR64RegClass.contains(DestReg)) {
4118 if (X86::VR128XRegClass.contains(SrcReg))
4119 // Copy from a VR128 register to a GR64 register.
4120 return HasAVX512 ? X86::VMOVPQIto64Zrr
4121 : HasAVX ? X86::VMOVPQIto64rr
4122 : X86::MOVPQIto64rr;
4123 if (X86::VR64RegClass.contains(SrcReg))
4124 // Copy from a VR64 register to a GR64 register.
4125 return X86::MMX_MOVD64from64rr;
4126 } else if (X86::GR64RegClass.contains(SrcReg)) {
4127 // Copy from a GR64 register to a VR128 register.
4128 if (X86::VR128XRegClass.contains(DestReg))
4129 return HasAVX512 ? X86::VMOV64toPQIZrr
4130 : HasAVX ? X86::VMOV64toPQIrr
4131 : X86::MOV64toPQIrr;
4132 // Copy from a GR64 register to a VR64 register.
4133 if (X86::VR64RegClass.contains(DestReg))
4134 return X86::MMX_MOVD64to64rr;
4135 }
4136
4137 // SrcReg(VR128) -> DestReg(GR32)
4138 // SrcReg(GR32) -> DestReg(VR128)
4139
4140 if (X86::GR32RegClass.contains(DestReg) &&
4141 X86::VR128XRegClass.contains(SrcReg))
4142 // Copy from a VR128 register to a GR32 register.
4143 return HasAVX512 ? X86::VMOVPDI2DIZrr
4144 : HasAVX ? X86::VMOVPDI2DIrr
4145 : X86::MOVPDI2DIrr;
4146
4147 if (X86::VR128XRegClass.contains(DestReg) &&
4148 X86::GR32RegClass.contains(SrcReg))
4149 // Copy from a GR32 register to a VR128 register.
4150 return HasAVX512 ? X86::VMOVDI2PDIZrr
4151 : HasAVX ? X86::VMOVDI2PDIrr
4152 : X86::MOVDI2PDIrr;
4153 return 0;
4154}
4155
4158 const DebugLoc &DL, MCRegister DestReg,
4159 MCRegister SrcReg, bool KillSrc) const {
4160 // First deal with the normal symmetric copies.
4161 bool HasAVX = Subtarget.hasAVX();
4162 bool HasVLX = Subtarget.hasVLX();
4163 bool HasEGPR = Subtarget.hasEGPR();
4164 unsigned Opc = 0;
4165 if (X86::GR64RegClass.contains(DestReg, SrcReg))
4166 Opc = X86::MOV64rr;
4167 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4168 Opc = X86::MOV32rr;
4169 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4170 Opc = X86::MOV16rr;
4171 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4172 // Copying to or from a physical H register on x86-64 requires a NOREX
4173 // move. Otherwise use a normal move.
4174 if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4175 Opc = X86::MOV8rr_NOREX;
4176 // Both operands must be encodable without a REX prefix.
4177 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4178 "8-bit H register can not be copied outside GR8_NOREX");
4179 } else
4180 Opc = X86::MOV8rr;
4181 } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4182 Opc = X86::MMX_MOVQ64rr;
4183 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4184 if (HasVLX)
4185 Opc = X86::VMOVAPSZ128rr;
4186 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4187 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4188 else {
4189 // If this is an extended register and we don't have VLX we need to use a
4190 // 512-bit move.
4191 Opc = X86::VMOVAPSZrr;
4193 DestReg =
4194 TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
4195 SrcReg =
4196 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4197 }
4198 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
4199 if (HasVLX)
4200 Opc = X86::VMOVAPSZ256rr;
4201 else if (X86::VR256RegClass.contains(DestReg, SrcReg))
4202 Opc = X86::VMOVAPSYrr;
4203 else {
4204 // If this is an extended register and we don't have VLX we need to use a
4205 // 512-bit move.
4206 Opc = X86::VMOVAPSZrr;
4208 DestReg =
4209 TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
4210 SrcReg =
4211 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4212 }
4213 } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
4214 Opc = X86::VMOVAPSZrr;
4215 // All KMASK RegClasses hold the same k registers; they can be tested
4216 // against any one of them.
4217 else if (X86::VK16RegClass.contains(DestReg, SrcReg))
4218 Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
4219 : (HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk);
4220 if (!Opc)
4221 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
4222
4223 if (Opc) {
4224 BuildMI(MBB, MI, DL, get(Opc), DestReg)
4225 .addReg(SrcReg, getKillRegState(KillSrc));
4226 return;
4227 }
4228
4229 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
4230 // FIXME: We use a fatal error here because historically LLVM has tried to
4231 // lower some of these physreg copies and we want to ensure we get
4232 // reasonable bug reports if someone encounters a case no other testing
4233 // found. This path should be removed after the LLVM 7 release.
4234 report_fatal_error("Unable to copy EFLAGS physical register!");
4235 }
4236
4237 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
4238 << RI.getName(DestReg) << '\n');
4239 report_fatal_error("Cannot emit physreg copy instruction");
4240}
4241
4242std::optional<DestSourcePair>
4244 if (MI.isMoveReg()) {
4245 // FIXME: Dirty hack for apparent invariant that doesn't hold when
4246 // subreg_to_reg is coalesced with ordinary copies, such that the bits that
4247 // were asserted as 0 are now undef.
4248 if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg())
4249 return std::nullopt;
4250
4251 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
4252 }
4253 return std::nullopt;
4254}
4255
4256static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
4257 if (STI.hasFP16())
4258 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
4259 if (Load)
4260 return STI.hasAVX512() ? X86::VMOVSSZrm
4261 : STI.hasAVX() ? X86::VMOVSSrm
4262 : X86::MOVSSrm;
4263 else
4264 return STI.hasAVX512() ? X86::VMOVSSZmr
4265 : STI.hasAVX() ? X86::VMOVSSmr
4266 : X86::MOVSSmr;
4267}
4268
4270 const TargetRegisterClass *RC,
4271 bool IsStackAligned,
4272 const X86Subtarget &STI, bool Load) {
4273 bool HasAVX = STI.hasAVX();
4274 bool HasAVX512 = STI.hasAVX512();
4275 bool HasVLX = STI.hasVLX();
4276 bool HasEGPR = STI.hasEGPR();
4277
4278 assert(RC != nullptr && "Invalid target register class");
4279 switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
4280 default:
4281 llvm_unreachable("Unknown spill size");
4282 case 1:
4283 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
4284 if (STI.is64Bit())
4285 // Copying to or from a physical H register on x86-64 requires a NOREX
4286 // move. Otherwise use a normal move.
4287 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
4288 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
4289 return Load ? X86::MOV8rm : X86::MOV8mr;
4290 case 2:
4291 if (X86::VK16RegClass.hasSubClassEq(RC))
4292 return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm)
4293 : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
4294 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
4295 return Load ? X86::MOV16rm : X86::MOV16mr;
4296 case 4:
4297 if (X86::GR32RegClass.hasSubClassEq(RC))
4298 return Load ? X86::MOV32rm : X86::MOV32mr;
4299 if (X86::FR32XRegClass.hasSubClassEq(RC))
4300 return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
4301 : HasAVX ? X86::VMOVSSrm_alt
4302 : X86::MOVSSrm_alt)
4303 : (HasAVX512 ? X86::VMOVSSZmr
4304 : HasAVX ? X86::VMOVSSmr
4305 : X86::MOVSSmr);
4306 if (X86::RFP32RegClass.hasSubClassEq(RC))
4307 return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
4308 if (X86::VK32RegClass.hasSubClassEq(RC)) {
4309 assert(STI.hasBWI() && "KMOVD requires BWI");
4310 return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm)
4311 : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
4312 }
4313 // All of these mask pair classes have the same spill size, so the same kind
4314 // of kmov instructions can be used with all of them.
4315 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
4316 X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
4317 X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
4318 X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
4319 X86::VK16PAIRRegClass.hasSubClassEq(RC))
4320 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
4321 if (X86::FR16RegClass.hasSubClassEq(RC) ||
4322 X86::FR16XRegClass.hasSubClassEq(RC))
4323 return getLoadStoreOpcodeForFP16(Load, STI);
4324 llvm_unreachable("Unknown 4-byte regclass");
4325 case 8:
4326 if (X86::GR64RegClass.hasSubClassEq(RC))
4327 return Load ? X86::MOV64rm : X86::MOV64mr;
4328 if (X86::FR64XRegClass.hasSubClassEq(RC))
4329 return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
4330 : HasAVX ? X86::VMOVSDrm_alt
4331 : X86::MOVSDrm_alt)
4332 : (HasAVX512 ? X86::VMOVSDZmr
4333 : HasAVX ? X86::VMOVSDmr
4334 : X86::MOVSDmr);
4335 if (X86::VR64RegClass.hasSubClassEq(RC))
4336 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
4337 if (X86::RFP64RegClass.hasSubClassEq(RC))
4338 return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
4339 if (X86::VK64RegClass.hasSubClassEq(RC)) {
4340 assert(STI.hasBWI() && "KMOVQ requires BWI");
4341 return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm)
4342 : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
4343 }
4344 llvm_unreachable("Unknown 8-byte regclass");
4345 case 10:
4346 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
4347 return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
4348 case 16: {
4349 if (X86::VR128XRegClass.hasSubClassEq(RC)) {
4350 // If the stack is realigned we can use aligned loads and stores.
4351 if (IsStackAligned)
4352 return Load ? (HasVLX ? X86::VMOVAPSZ128rm
4353 : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
4354 : HasAVX ? X86::VMOVAPSrm
4355 : X86::MOVAPSrm)
4356 : (HasVLX ? X86::VMOVAPSZ128mr
4357 : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
4358 : HasAVX ? X86::VMOVAPSmr
4359 : X86::MOVAPSmr);
4360 else
4361 return Load ? (HasVLX ? X86::VMOVUPSZ128rm
4362 : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
4363 : HasAVX ? X86::VMOVUPSrm
4364 : X86::MOVUPSrm)
4365 : (HasVLX ? X86::VMOVUPSZ128mr
4366 : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
4367 : HasAVX ? X86::VMOVUPSmr
4368 : X86::MOVUPSmr);
4369 }
4370 llvm_unreachable("Unknown 16-byte regclass");
4371 }
4372 case 32:
4373 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
4374 // If the stack is realigned we can use aligned loads and stores.
4375 if (IsStackAligned)
4376 return Load ? (HasVLX ? X86::VMOVAPSZ256rm
4377 : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
4378 : X86::VMOVAPSYrm)
4379 : (HasVLX ? X86::VMOVAPSZ256mr
4380 : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
4381 : X86::VMOVAPSYmr);
4382 else
4383 return Load ? (HasVLX ? X86::VMOVUPSZ256rm
4384 : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
4385 : X86::VMOVUPSYrm)
4386 : (HasVLX ? X86::VMOVUPSZ256mr
4387 : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
4388 : X86::VMOVUPSYmr);
4389 case 64:
4390 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
4391 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
4392 if (IsStackAligned)
4393 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
4394 else
4395 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
4396 case 1024:
4397 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
4398 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
4399#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
4400 return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
4401 : GET_EGPR_IF_ENABLED(X86::TILESTORED);
4402#undef GET_EGPR_IF_ENABLED
4403 }
4404}
4405
4406std::optional<ExtAddrMode>
4408 const TargetRegisterInfo *TRI) const {
4409 const MCInstrDesc &Desc = MemI.getDesc();
4410 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4411 if (MemRefBegin < 0)
4412 return std::nullopt;
4413
4414 MemRefBegin += X86II::getOperandBias(Desc);
4415
4416 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
4417 if (!BaseOp.isReg()) // Can be an MO_FrameIndex
4418 return std::nullopt;
4419
4420 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
4421 // Displacement can be symbolic
4422 if (!DispMO.isImm())
4423 return std::nullopt;
4424
4425 ExtAddrMode AM;
4426 AM.BaseReg = BaseOp.getReg();
4427 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
4428 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
4429 AM.Displacement = DispMO.getImm();
4430 return AM;
4431}
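// A usage sketch with assumed register names; the MIR is simplified. For a
// load such as
//   %val:gr32 = MOV32rm %base:gr64, 4, %index:gr64, 20, $noreg
// the returned ExtAddrMode is expected to hold BaseReg = %base,
// ScaledReg = %index, Scale = 4 and Displacement = 20. A frame-index base or
// a symbolic displacement makes the function return std::nullopt instead.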
4432
4434 StringRef &ErrInfo) const {
4435 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
4436 if (!AMOrNone)
4437 return true;
4438
4439 ExtAddrMode AM = *AMOrNone;
4441 if (AM.ScaledReg != X86::NoRegister) {
4442 switch (AM.Scale) {
4443 case 1:
4444 case 2:
4445 case 4:
4446 case 8:
4447 break;
4448 default:
4449 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
4450 return false;
4451 }
4452 }
4453 if (!isInt<32>(AM.Displacement)) {
4454 ErrInfo = "Displacement in address must fit into 32-bit signed "
4455 "integer";
4456 return false;
4457 }
4458
4459 return true;
4460}
4461
4463 const Register Reg,
4464 int64_t &ImmVal) const {
4465 Register MovReg = Reg;
4466 const MachineInstr *MovMI = &MI;
4467
4468 // Follow the use-def chain for SUBREG_TO_REG to find the real move-immediate
4469 // instruction. This pattern is quite common on x86-64.
4470 if (MI.isSubregToReg()) {
4471 // We use the following pattern to set up a 64-bit immediate.
4472 // %8:gr32 = MOV32r0 implicit-def dead $eflags
4473 // %6:gr64 = SUBREG_TO_REG 0, killed %8:gr32, %subreg.sub_32bit
4474 if (!MI.getOperand(1).isImm())
4475 return false;
4476 unsigned FillBits = MI.getOperand(1).getImm();
4477 unsigned SubIdx = MI.getOperand(3).getImm();
4478 MovReg = MI.getOperand(2).getReg();
4479 if (SubIdx != X86::sub_32bit || FillBits != 0)
4480 return false;
4481 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4482 MovMI = MRI.getUniqueVRegDef(MovReg);
4483 if (!MovMI)
4484 return false;
4485 }
4486
4487 if (MovMI->getOpcode() == X86::MOV32r0 &&
4488 MovMI->getOperand(0).getReg() == MovReg) {
4489 ImmVal = 0;
4490 return true;
4491 }
4492
4493 if (MovMI->getOpcode() != X86::MOV32ri &&
4494 MovMI->getOpcode() != X86::MOV64ri &&
4495 MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri)
4496 return false;
4497 // The MOV source can be a global address rather than an immediate.
4498 if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
4499 return false;
4500 ImmVal = MovMI->getOperand(1).getImm();
4501 return true;
4502}
4503
4505 const MachineInstr *MI, const Register NullValueReg,
4506 const TargetRegisterInfo *TRI) const {
4507 if (!MI->modifiesRegister(NullValueReg, TRI))
4508 return true;
4509 switch (MI->getOpcode()) {
4510 // A right or left shift of a null register into itself is still null, i.e.
4511 // rax = shl rax, X.
4512 case X86::SHR64ri:
4513 case X86::SHR32ri:
4514 case X86::SHL64ri:
4515 case X86::SHL32ri:
4516 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
4517 "expected for shift opcode!");
4518 return MI->getOperand(0).getReg() == NullValueReg &&
4519 MI->getOperand(1).getReg() == NullValueReg;
4520 // Zero extend of a sub-reg of NullValueReg into itself does not change the
4521 // null value.
4522 case X86::MOV32rr:
4523 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
4524 return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
4525 });
4526 default:
4527 return false;
4528 }
4529 llvm_unreachable("Should be handled above!");
4530}
4531
4534 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
4535 const TargetRegisterInfo *TRI) const {
4536 const MCInstrDesc &Desc = MemOp.getDesc();
4537 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4538 if (MemRefBegin < 0)
4539 return false;
4540
4541 MemRefBegin += X86II::getOperandBias(Desc);
4542
4543 const MachineOperand *BaseOp =
4544 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
4545 if (!BaseOp->isReg()) // Can be an MO_FrameIndex
4546 return false;
4547
4548 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
4549 return false;
4550
4551 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
4552 X86::NoRegister)
4553 return false;
4554
4555 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
4556
4557 // Displacement can be symbolic
4558 if (!DispMO.isImm())
4559 return false;
4560
4561 Offset = DispMO.getImm();
4562
4563 if (!BaseOp->isReg())
4564 return false;
4565
4566 OffsetIsScalable = false;
4567 // FIXME: Relying on memoperands() may not be the right thing to do here. Check
4568 // with the X86 maintainers, and fix it accordingly. For now, it is OK, since
4569 // the X86 back-end makes no use of `Width` at the moment.
4570 Width =
4571 !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
4572 BaseOps.push_back(BaseOp);
4573 return true;
4574}
4575
4576static unsigned getStoreRegOpcode(Register SrcReg,
4577 const TargetRegisterClass *RC,
4578 bool IsStackAligned,
4579 const X86Subtarget &STI) {
4580 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
4581}
4582
4583static unsigned getLoadRegOpcode(Register DestReg,
4584 const TargetRegisterClass *RC,
4585 bool IsStackAligned, const X86Subtarget &STI) {
4586 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
4587}
4588
4589static bool isAMXOpcode(unsigned Opc) {
4590 switch (Opc) {
4591 default:
4592 return false;
4593 case X86::TILELOADD:
4594 case X86::TILESTORED:
4595 case X86::TILELOADD_EVEX:
4596 case X86::TILESTORED_EVEX:
4597 return true;
4598 }
4599}
4600
4603 unsigned Opc, Register Reg, int FrameIdx,
4604 bool isKill) const {
4605 switch (Opc) {
4606 default:
4607 llvm_unreachable("Unexpected special opcode!");
4608 case X86::TILESTORED:
4609 case X86::TILESTORED_EVEX: {
4610 // tilestored %tmm, (%sp, %idx)
4612 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4613 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4614 MachineInstr *NewMI =
4615 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4616 .addReg(Reg, getKillRegState(isKill));
4618 MO.setReg(VirtReg);
4619 MO.setIsKill(true);
4620 break;
4621 }
4622 case X86::TILELOADD:
4623 case X86::TILELOADD_EVEX: {
4624 // tileloadd (%sp, %idx), %tmm
4626 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4627 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4629 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
4631 MO.setReg(VirtReg);
4632 MO.setIsKill(true);
4633 break;
4634 }
4635 }
4636}
4637
4640 bool isKill, int FrameIdx, const TargetRegisterClass *RC,
4641 const TargetRegisterInfo *TRI, Register VReg) const {
4642 const MachineFunction &MF = *MBB.getParent();
4643 const MachineFrameInfo &MFI = MF.getFrameInfo();
4644 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4645 "Stack slot too small for store");
4646
4647 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
4648 bool isAligned =
4649 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4650 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4651
4652 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
4653 if (isAMXOpcode(Opc))
4654 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
4655 else
4656 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4657 .addReg(SrcReg, getKillRegState(isKill));
4658}
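// A rough sketch of the emitted spill, assuming a GR64 source and frame index
// %stack.0; operand flags are simplified, not verbatim output.
// addFrameReference() supplies the usual five memory operands (base frame
// index, scale 1, no index, displacement 0, no segment), so the store looks
// roughly like
//   MOV64mr %stack.0, 1, $noreg, 0, $noreg, %reg:gr64
// with aligned or unaligned vector variants chosen by getStoreRegOpcode() for
// vector register classes, and the AMX tile path handled separately above.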
4659
4662 Register DestReg, int FrameIdx,
4663 const TargetRegisterClass *RC,
4664 const TargetRegisterInfo *TRI,
4665 Register VReg) const {
4666 const MachineFunction &MF = *MBB.getParent();
4667 const MachineFrameInfo &MFI = MF.getFrameInfo();
4668 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4669 "Load size exceeds stack slot");
4670 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
4671 bool isAligned =
4672 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4673 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4674
4675 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
4676 if (isAMXOpcode(Opc))
4677 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
4678 else
4679 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
4680 FrameIdx);
4681}
4682
4684 Register &SrcReg2, int64_t &CmpMask,
4685 int64_t &CmpValue) const {
4686 switch (MI.getOpcode()) {
4687 default:
4688 break;
4689 case X86::CMP64ri32:
4690 case X86::CMP32ri:
4691 case X86::CMP16ri:
4692 case X86::CMP8ri:
4693 SrcReg = MI.getOperand(0).getReg();
4694 SrcReg2 = 0;
4695 if (MI.getOperand(1).isImm()) {
4696 CmpMask = ~0;
4697 CmpValue = MI.getOperand(1).getImm();
4698 } else {
4699 CmpMask = CmpValue = 0;
4700 }
4701 return true;
4702 // A SUB can be used to perform a comparison.
4703 CASE_ND(SUB64rm)
4704 CASE_ND(SUB32rm)
4705 CASE_ND(SUB16rm)
4706 CASE_ND(SUB8rm)
4707 SrcReg = MI.getOperand(1).getReg();
4708 SrcReg2 = 0;
4709 CmpMask = 0;
4710 CmpValue = 0;
4711 return true;
4712 CASE_ND(SUB64rr)
4713 CASE_ND(SUB32rr)
4714 CASE_ND(SUB16rr)
4715 CASE_ND(SUB8rr)
4716 SrcReg = MI.getOperand(1).getReg();
4717 SrcReg2 = MI.getOperand(2).getReg();
4718 CmpMask = 0;
4719 CmpValue = 0;
4720 return true;
4721 CASE_ND(SUB64ri32)
4722 CASE_ND(SUB32ri)
4723 CASE_ND(SUB16ri)
4724 CASE_ND(SUB8ri)
4725 SrcReg = MI.getOperand(1).getReg();
4726 SrcReg2 = 0;
4727 if (MI.getOperand(2).isImm()) {
4728 CmpMask = ~0;
4729 CmpValue = MI.getOperand(2).getImm();
4730 } else {
4731 CmpMask = CmpValue = 0;
4732 }
4733 return true;
4734 case X86::CMP64rr:
4735 case X86::CMP32rr:
4736 case X86::CMP16rr:
4737 case X86::CMP8rr:
4738 SrcReg = MI.getOperand(0).getReg();
4739 SrcReg2 = MI.getOperand(1).getReg();
4740 CmpMask = 0;
4741 CmpValue = 0;
4742 return true;
4743 case X86::TEST8rr:
4744 case X86::TEST16rr:
4745 case X86::TEST32rr:
4746 case X86::TEST64rr:
4747 SrcReg = MI.getOperand(0).getReg();
4748 if (MI.getOperand(1).getReg() != SrcReg)
4749 return false;
4750 // Compare against zero.
4751 SrcReg2 = 0;
4752 CmpMask = ~0;
4753 CmpValue = 0;
4754 return true;
4755 }
4756 return false;
4757}
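// A sketch of the values analyzeCompare reports, with assumed register names;
// the MIR is simplified:
//   CMP32ri %eax, 42          -> SrcReg = %eax, SrcReg2 = 0,
//                                CmpMask = ~0, CmpValue = 42
//   %d = SUB32rr %a, %b       -> SrcReg = %a, SrcReg2 = %b,
//                                CmpMask = 0, CmpValue = 0
//   TEST32rr %eax, %eax       -> SrcReg = %eax, SrcReg2 = 0,
//                                CmpMask = ~0, CmpValue = 0 (compare vs. zero)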
4758
4759bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4760 Register SrcReg, Register SrcReg2,
4761 int64_t ImmMask, int64_t ImmValue,
4762 const MachineInstr &OI, bool *IsSwapped,
4763 int64_t *ImmDelta) const {
4764 switch (OI.getOpcode()) {
4765 case X86::CMP64rr:
4766 case X86::CMP32rr:
4767 case X86::CMP16rr:
4768 case X86::CMP8rr:
4769 CASE_ND(SUB64rr)
4770 CASE_ND(SUB32rr)
4771 CASE_ND(SUB16rr)
4772 CASE_ND(SUB8rr) {
4773 Register OISrcReg;
4774 Register OISrcReg2;
4775 int64_t OIMask;
4776 int64_t OIValue;
4777 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4778 OIMask != ImmMask || OIValue != ImmValue)
4779 return false;
4780 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4781 *IsSwapped = false;
4782 return true;
4783 }
4784 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4785 *IsSwapped = true;
4786 return true;
4787 }
4788 return false;
4789 }
4790 case X86::CMP64ri32:
4791 case X86::CMP32ri:
4792 case X86::CMP16ri:
4793 case X86::CMP8ri:
4794 CASE_ND(SUB64ri32)
4795 CASE_ND(SUB32ri)
4796 CASE_ND(SUB16ri)
4797 CASE_ND(SUB8ri)
4798 case X86::TEST64rr:
4799 case X86::TEST32rr:
4800 case X86::TEST16rr:
4801 case X86::TEST8rr: {
4802 if (ImmMask != 0) {
4803 Register OISrcReg;
4804 Register OISrcReg2;
4805 int64_t OIMask;
4806 int64_t OIValue;
4807 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4808 SrcReg == OISrcReg && ImmMask == OIMask) {
4809 if (OIValue == ImmValue) {
4810 *ImmDelta = 0;
4811 return true;
4812 } else if (static_cast<uint64_t>(ImmValue) ==
4813 static_cast<uint64_t>(OIValue) - 1) {
4814 *ImmDelta = -1;
4815 return true;
4816 } else if (static_cast<uint64_t>(ImmValue) ==
4817 static_cast<uint64_t>(OIValue) + 1) {
4818 *ImmDelta = 1;
4819 return true;
4820 } else {
4821 return false;
4822 }
4823 }
4824 }
4825 return FlagI.isIdenticalTo(OI);
4826 }
4827 default:
4828 return false;
4829 }
4830}
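// An illustrative example with assumed register names. Given an earlier
//   %d = SUB32rr %r1, %r2, implicit-def $eflags
// a later CMP32rr %r1, %r2 matches with *IsSwapped = false, while
// CMP32rr %r2, %r1 matches with *IsSwapped = true; in the swapped case the
// users' condition codes must also be swapped, which optimizeCompareInstr
// below takes care of.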
4831
4832/// Check whether the definition can be converted
4833/// to remove a comparison against zero.
4834inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4835 bool &ClearsOverflowFlag) {
4836 NoSignFlag = false;
4837 ClearsOverflowFlag = false;
4838
4839 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and
4840 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
4841 // Initial Exec to Local Exec relaxation. In these cases, we must not depend
4842 // on the EFLAGS modification of ADD actually happening in the final binary.
4843 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
4844 unsigned Flags = MI.getOperand(5).getTargetFlags();
4845 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
4846 Flags == X86II::MO_GOTNTPOFF)
4847 return false;
4848 }
4849
4850 switch (MI.getOpcode()) {
4851 default:
4852 return false;
4853
4854 // The shift instructions only modify ZF if their shift count is non-zero.
4855 // N.B.: The processor truncates the shift count depending on the encoding.
4856 CASE_ND(SAR8ri)
4857 CASE_ND(SAR16ri)
4858 CASE_ND(SAR32ri)
4859 CASE_ND(SAR64ri)
4860 CASE_ND(SHR8ri)
4861 CASE_ND(SHR16ri)
4862 CASE_ND(SHR32ri)
4863 CASE_ND(SHR64ri)
4864 return getTruncatedShiftCount(MI, 2) != 0;
4865
4866 // Some left shift instructions can be turned into LEA instructions but only
4867 // if their flags aren't used. Avoid transforming such instructions.
4868 CASE_ND(SHL8ri)
4869 CASE_ND(SHL16ri)
4870 CASE_ND(SHL32ri)
4871 CASE_ND(SHL64ri) {
4872 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
4873 if (isTruncatedShiftCountForLEA(ShAmt))
4874 return false;
4875 return ShAmt != 0;
4876 }
4877
4878 CASE_ND(SHRD16rri8)
4879 CASE_ND(SHRD32rri8)
4880 CASE_ND(SHRD64rri8)
4881 CASE_ND(SHLD16rri8)
4882 CASE_ND(SHLD32rri8)
4883 CASE_ND(SHLD64rri8)
4884 return getTruncatedShiftCount(MI, 3) != 0;
4885
4886 CASE_ND(SUB64ri32)
4887 CASE_ND(SUB32ri)
4888 CASE_ND(SUB16ri)
4889 CASE_ND(SUB8ri)
4890 CASE_ND(SUB64rr)
4891 CASE_ND(SUB32rr)
4892 CASE_ND(SUB16rr)
4893 CASE_ND(SUB8rr)
4894 CASE_ND(SUB64rm)
4895 CASE_ND(SUB32rm)
4896 CASE_ND(SUB16rm)
4897 CASE_ND(SUB8rm)
4898 CASE_ND(DEC64r)
4899 CASE_ND(DEC32r)
4900 CASE_ND(DEC16r)
4901 CASE_ND(DEC8r)
4902 CASE_ND(ADD64ri32)
4903 CASE_ND(ADD32ri)
4904 CASE_ND(ADD16ri)
4905 CASE_ND(ADD8ri)
4906 CASE_ND(ADD64rr)
4907 CASE_ND(ADD32rr)
4908 CASE_ND(ADD16rr)
4909 CASE_ND(ADD8rr)
4910 CASE_ND(ADD64rm)
4911 CASE_ND(ADD32rm)
4912 CASE_ND(ADD16rm)
4913 CASE_ND(ADD8rm)
4914 CASE_ND(INC64r)
4915 CASE_ND(INC32r)
4916 CASE_ND(INC16r)
4917 CASE_ND(INC8r)
4918 CASE_ND(ADC64ri32)
4919 CASE_ND(ADC32ri)
4920 CASE_ND(ADC16ri)
4921 CASE_ND(ADC8ri)
4922 CASE_ND(ADC64rr)
4923 CASE_ND(ADC32rr)
4924 CASE_ND(ADC16rr)
4925 CASE_ND(ADC8rr)
4926 CASE_ND(ADC64rm)
4927 CASE_ND(ADC32rm)
4928 CASE_ND(ADC16rm)
4929 CASE_ND(ADC8rm)
4930 CASE_ND(SBB64ri32)
4931 CASE_ND(SBB32ri)
4932 CASE_ND(SBB16ri)
4933 CASE_ND(SBB8ri)
4934 CASE_ND(SBB64rr)
4935 CASE_ND(SBB32rr)
4936 CASE_ND(SBB16rr)
4937 CASE_ND(SBB8rr)
4938 CASE_ND(SBB64rm)
4939 CASE_ND(SBB32rm)
4940 CASE_ND(SBB16rm)
4941 CASE_ND(SBB8rm)
4942 CASE_ND(NEG8r)
4943 CASE_ND(NEG16r)
4944 CASE_ND(NEG32r)
4945 CASE_ND(NEG64r)
4946 case X86::LZCNT16rr:
4947 case X86::LZCNT16rm:
4948 case X86::LZCNT32rr:
4949 case X86::LZCNT32rm:
4950 case X86::LZCNT64rr:
4951 case X86::LZCNT64rm:
4952 case X86::POPCNT16rr:
4953 case X86::POPCNT16rm:
4954 case X86::POPCNT32rr:
4955 case X86::POPCNT32rm:
4956 case X86::POPCNT64rr:
4957 case X86::POPCNT64rm:
4958 case X86::TZCNT16rr:
4959 case X86::TZCNT16rm:
4960 case X86::TZCNT32rr:
4961 case X86::TZCNT32rm:
4962 case X86::TZCNT64rr:
4963 case X86::TZCNT64rm:
4964 return true;
4965 CASE_ND(AND64ri32)
4966 CASE_ND(AND32ri)
4967 CASE_ND(AND16ri)
4968 CASE_ND(AND8ri)
4969 CASE_ND(AND64rr)
4970 CASE_ND(AND32rr)
4971 CASE_ND(AND16rr)
4972 CASE_ND(AND8rr)
4973 CASE_ND(AND64rm)
4974 CASE_ND(AND32rm)
4975 CASE_ND(AND16rm)
4976 CASE_ND(AND8rm)
4977 CASE_ND(XOR64ri32)
4978 CASE_ND(XOR32ri)
4979 CASE_ND(XOR16ri)
4980 CASE_ND(XOR8ri)
4981 CASE_ND(XOR64rr)
4982 CASE_ND(XOR32rr)
4983 CASE_ND(XOR16rr)
4984 CASE_ND(XOR8rr)
4985 CASE_ND(XOR64rm)
4986 CASE_ND(XOR32rm)
4987 CASE_ND(XOR16rm)
4988 CASE_ND(XOR8rm)
4989 CASE_ND(OR64ri32)
4990 CASE_ND(OR32ri)
4991 CASE_ND(OR16ri)
4992 CASE_ND(OR8ri)
4993 CASE_ND(OR64rr)
4994 CASE_ND(OR32rr)
4995 CASE_ND(OR16rr)
4996 CASE_ND(OR8rr)
4997 CASE_ND(OR64rm)
4998 CASE_ND(OR32rm)
4999 CASE_ND(OR16rm)
5000 CASE_ND(OR8rm)
5001 case X86::ANDN32rr:
5002 case X86::ANDN32rm:
5003 case X86::ANDN64rr:
5004 case X86::ANDN64rm:
5005 case X86::BLSI32rr:
5006 case X86::BLSI32rm:
5007 case X86::BLSI64rr:
5008 case X86::BLSI64rm:
5009 case X86::BLSMSK32rr:
5010 case X86::BLSMSK32rm:
5011 case X86::BLSMSK64rr:
5012 case X86::BLSMSK64rm:
5013 case X86::BLSR32rr:
5014 case X86::BLSR32rm:
5015 case X86::BLSR64rr:
5016 case X86::BLSR64rm:
5017 case X86::BLCFILL32rr:
5018 case X86::BLCFILL32rm:
5019 case X86::BLCFILL64rr:
5020 case X86::BLCFILL64rm:
5021 case X86::BLCI32rr:
5022 case X86::BLCI32rm:
5023 case X86::BLCI64rr:
5024 case X86::BLCI64rm:
5025 case X86::BLCIC32rr:
5026 case X86::BLCIC32rm:
5027 case X86::BLCIC64rr:
5028 case X86::BLCIC64rm:
5029 case X86::BLCMSK32rr:
5030 case X86::BLCMSK32rm:
5031 case X86::BLCMSK64rr:
5032 case X86::BLCMSK64rm:
5033 case X86::BLCS32rr:
5034 case X86::BLCS32rm:
5035 case X86::BLCS64rr:
5036 case X86::BLCS64rm:
5037 case X86::BLSFILL32rr:
5038 case X86::BLSFILL32rm:
5039 case X86::BLSFILL64rr:
5040 case X86::BLSFILL64rm:
5041 case X86::BLSIC32rr:
5042 case X86::BLSIC32rm:
5043 case X86::BLSIC64rr:
5044 case X86::BLSIC64rm:
5045 case X86::BZHI32rr:
5046 case X86::BZHI32rm:
5047 case X86::BZHI64rr:
5048 case X86::BZHI64rm:
5049 case X86::T1MSKC32rr:
5050 case X86::T1MSKC32rm:
5051 case X86::T1MSKC64rr:
5052 case X86::T1MSKC64rm:
5053 case X86::TZMSK32rr:
5054 case X86::TZMSK32rm:
5055 case X86::TZMSK64rr:
5056 case X86::TZMSK64rm:
5057 // These instructions clear the overflow flag just like TEST.
5058 // FIXME: These are not the only instructions in this switch that clear the
5059 // overflow flag.
5060 ClearsOverflowFlag = true;
5061 return true;
5062 case X86::BEXTR32rr:
5063 case X86::BEXTR64rr:
5064 case X86::BEXTR32rm:
5065 case X86::BEXTR64rm:
5066 case X86::BEXTRI32ri:
5067 case X86::BEXTRI32mi:
5068 case X86::BEXTRI64ri:
5069 case X86::BEXTRI64mi:
5070 // BEXTR doesn't update the sign flag so we can't use it. It does clear
5071 // the overflow flag, but that's not useful without the sign flag.
5072 NoSignFlag = true;
5073 return true;
5074 }
5075}
5076
5077/// Check whether the use can be converted to remove a comparison against zero.
5079 switch (MI.getOpcode()) {
5080 default:
5081 return X86::COND_INVALID;
5082 CASE_ND(NEG8r)
5083 CASE_ND(NEG16r)
5084 CASE_ND(NEG32r)
5085 CASE_ND(NEG64r)
5086 return X86::COND_AE;
5087 case X86::LZCNT16rr:
5088 case X86::LZCNT32rr:
5089 case X86::LZCNT64rr:
5090 return X86::COND_B;
5091 case X86::POPCNT16rr:
5092 case X86::POPCNT32rr:
5093 case X86::POPCNT64rr:
5094 return X86::COND_E;
5095 case X86::TZCNT16rr:
5096 case X86::TZCNT32rr:
5097 case X86::TZCNT64rr:
5098 return X86::COND_B;
5099 case X86::BSF16rr:
5100 case X86::BSF32rr:
5101 case X86::BSF64rr:
5102 case X86::BSR16rr:
5103 case X86::BSR32rr:
5104 case X86::BSR64rr:
5105 return X86::COND_E;
5106 case X86::BLSI32rr:
5107 case X86::BLSI64rr:
5108 return X86::COND_AE;
5109 case X86::BLSR32rr:
5110 case X86::BLSR64rr:
5111 case X86::BLSMSK32rr:
5112 case X86::BLSMSK64rr:
5113 return X86::COND_B;
5114 // TODO: TBM instructions.
5115 }
5116}
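// An illustrative example with assumed register and block names. POPCNT sets
// ZF exactly when its source is zero, so in
//   %cnt:gr32 = POPCNT32rr %val:gr32, implicit-def $eflags
//   ...                              ; EFLAGS not touched in between
//   TEST32rr %val, %val, implicit-def $eflags
//   JCC_1 %bb.zero, 4                ; 4 == X86::COND_E
// the TEST can be removed: isUseDefConvertible() reports COND_E for POPCNT and
// optimizeCompareInstr below rewrites the users accordingly.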
5117
5118/// Check if there exists an earlier instruction that
5119/// operates on the same source operands and sets flags in the same way as
5120/// Compare; remove Compare if possible.
5122 Register SrcReg2, int64_t CmpMask,
5123 int64_t CmpValue,
5124 const MachineRegisterInfo *MRI) const {
5125 // Check whether we can replace SUB with CMP.
5126 switch (CmpInstr.getOpcode()) {
5127 default:
5128 break;
5129 CASE_ND(SUB64ri32)
5130 CASE_ND(SUB32ri)
5131 CASE_ND(SUB16ri)
5132 CASE_ND(SUB8ri)
5133 CASE_ND(SUB64rm)
5134 CASE_ND(SUB32rm)
5135 CASE_ND(SUB16rm)
5136 CASE_ND(SUB8rm)
5137 CASE_ND(SUB64rr)
5138 CASE_ND(SUB32rr)
5139 CASE_ND(SUB16rr)
5140 CASE_ND(SUB8rr) {
5141 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
5142 return false;
5143 // There is no use of the destination register; we can replace the SUB with a CMP.
5144 unsigned NewOpcode = 0;
5145#define FROM_TO(A, B) \
5146 CASE_ND(A) NewOpcode = X86::B; \
5147 break;
5148 switch (CmpInstr.getOpcode()) {
5149 default:
5150 llvm_unreachable("Unreachable!");
5151 FROM_TO(SUB64rm, CMP64rm)
5152 FROM_TO(SUB32rm, CMP32rm)
5153 FROM_TO(SUB16rm, CMP16rm)
5154 FROM_TO(SUB8rm, CMP8rm)
5155 FROM_TO(SUB64rr, CMP64rr)
5156 FROM_TO(SUB32rr, CMP32rr)
5157 FROM_TO(SUB16rr, CMP16rr)
5158 FROM_TO(SUB8rr, CMP8rr)
5159 FROM_TO(SUB64ri32, CMP64ri32)
5160 FROM_TO(SUB32ri, CMP32ri)
5161 FROM_TO(SUB16ri, CMP16ri)
5162 FROM_TO(SUB8ri, CMP8ri)
5163 }
5164#undef FROM_TO
5165 CmpInstr.setDesc(get(NewOpcode));
5166 CmpInstr.removeOperand(0);
5167 // Mutating this instruction invalidates any debug data associated with it.
5168 CmpInstr.dropDebugNumber();
5169 // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
5170 if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
5171 NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
5172 return false;
5173 }
5174 }
5175
5176 // The following code tries to remove the comparison by re-using EFLAGS
5177 // from earlier instructions.
5178
5179 bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
5180
5181 // Transformation currently requires SSA values.
5182 if (SrcReg2.isPhysical())
5183 return false;
5184 MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
5185 assert(SrcRegDef && "Must have a definition (SSA)");
5186
5187 MachineInstr *MI = nullptr;
5188 MachineInstr *Sub = nullptr;
5189 MachineInstr *Movr0Inst = nullptr;
5190 bool NoSignFlag = false;
5191 bool ClearsOverflowFlag = false;
5192 bool ShouldUpdateCC = false;
5193 bool IsSwapped = false;
5195 int64_t ImmDelta = 0;
5196
5197 // Search backward from CmpInstr for the next instruction defining EFLAGS.
5199 MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
5201 std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
5202 for (MachineBasicBlock *MBB = &CmpMBB;;) {
5203 for (MachineInstr &Inst : make_range(From, MBB->rend())) {
5204 // Try to use EFLAGS from the instruction defining %SrcReg. Example:
5205 // %eax = addl ...
5206 // ... // EFLAGS not changed
5207 // testl %eax, %eax // <-- can be removed
5208 if (&Inst == SrcRegDef) {
5209 if (IsCmpZero &&
5210 isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
5211 MI = &Inst;
5212 break;
5213 }
5214
5215 // Look back for the following pattern, in which case the
5216 // test16rr/test64rr instruction could be erased.
5217 //
5218 // Example for test16rr:
5219 // %reg = and32ri %in_reg, 5
5220 // ... // EFLAGS not changed.
5221 // %src_reg = copy %reg.sub_16bit:gr32
5222 // test16rr %src_reg, %src_reg, implicit-def $eflags
5223 // Example for test64rr:
5224 // %reg = and32ri %in_reg, 5
5225 // ... // EFLAGS not changed.
5226 // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index
5227 // test64rr %src_reg, %src_reg, implicit-def $eflags
5228 MachineInstr *AndInstr = nullptr;
5229 if (IsCmpZero &&
5230 findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
5231 NoSignFlag, ClearsOverflowFlag)) {
5232 assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
5233 MI = AndInstr;
5234 break;
5235 }
5236 // Cannot find other candidates before definition of SrcReg.
5237 return false;
5238 }
5239
5240 if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
5241 // Try to use EFLAGS produced by an instruction reading %SrcReg.
5242 // Example:
5243 // %eax = ...
5244 // ...
5245 // popcntl %eax
5246 // ... // EFLAGS not changed
5247 // testl %eax, %eax // <-- can be removed
5248 if (IsCmpZero) {
5249 NewCC = isUseDefConvertible(Inst);
5250 if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() &&
5251 Inst.getOperand(1).getReg() == SrcReg) {
5252 ShouldUpdateCC = true;
5253 MI = &Inst;
5254 break;
5255 }
5256 }
5257
5258 // Try to use EFLAGS from an instruction with similar flag results.
5259 // Example:
5260 // sub x, y or cmp x, y
5261 // ... // EFLAGS not changed
5262 // cmp x, y // <-- can be removed
5263 if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
5264 Inst, &IsSwapped, &ImmDelta)) {
5265 Sub = &Inst;
5266 break;
5267 }
5268
5269 // MOV32r0 is implemented with an xor, which clobbers the condition codes. It
5270 // is safe to move it up if the definition of EFLAGS is dead and earlier
5271 // instructions do not read or write EFLAGS.
5272 if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
5273 Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
5274 Movr0Inst = &Inst;
5275 continue;
5276 }
5277
5278 // Cannot do anything for any other EFLAG changes.
5279 return false;
5280 }
5281 }
5282
5283 if (MI || Sub)
5284 break;
5285
5286 // Reached the beginning of the basic block. Continue in the predecessor if
5287 // there is exactly one.
5288 if (MBB->pred_size() != 1)
5289 return false;
5290 MBB = *MBB->pred_begin();
5291 From = MBB->rbegin();
5292 }
5293
5294 // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
5295 // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
5296 // If we are done with the basic block, we need to check whether EFLAGS is
5297 // live-out.
5298 bool FlagsMayLiveOut = true;
5300 MachineBasicBlock::iterator AfterCmpInstr =
5301 std::next(MachineBasicBlock::iterator(CmpInstr));
5302 for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
5303 bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
5304 bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
5305 // We should check the usage if this instruction uses and updates EFLAGS.
5306 if (!UseEFLAGS && ModifyEFLAGS) {
5307 // It is safe to remove CmpInstr if EFLAGS is updated again.
5308 FlagsMayLiveOut = false;
5309 break;
5310 }
5311 if (!UseEFLAGS && !ModifyEFLAGS)
5312 continue;
5313
5314 // EFLAGS is used by this instruction.
5315 X86::CondCode OldCC = X86::getCondFromMI(Instr);
5316 if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
5317 return false;
5318
5319 X86::CondCode ReplacementCC = X86::COND_INVALID;
5320 if (MI) {
5321 switch (OldCC) {
5322 default:
5323 break;
5324 case X86::COND_A:
5325 case X86::COND_AE:
5326 case X86::COND_B:
5327 case X86::COND_BE:
5328 // CF is used, we can't perform this optimization.
5329 return false;
5330 case X86::COND_G:
5331 case X86::COND_GE:
5332 case X86::COND_L:
5333 case X86::COND_LE:
5334 // If SF is used, but the instruction doesn't update the SF, then we
5335 // can't do the optimization.
5336 if (NoSignFlag)
5337 return false;
5338 [[fallthrough]];
5339 case X86::COND_O:
5340 case X86::COND_NO:
5341 // If OF is used, the instruction needs to clear it like CmpZero does.
5342 if (!ClearsOverflowFlag)
5343 return false;
5344 break;
5345 case X86::COND_S:
5346 case X86::COND_NS:
5347 // If SF is used, but the instruction doesn't update the SF, then we
5348 // can't do the optimization.
5349 if (NoSignFlag)
5350 return false;
5351 break;
5352 }
5353
5354 // If we're updating the condition code check if we have to reverse the
5355 // condition.
5356 if (ShouldUpdateCC)
5357 switch (OldCC) {
5358 default:
5359 return false;
5360 case X86::COND_E:
5361 ReplacementCC = NewCC;
5362 break;
5363 case X86::COND_NE:
5364 ReplacementCC = GetOppositeBranchCondition(NewCC);
5365 break;
5366 }
5367 } else if (IsSwapped) {
5368 // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
5369 // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
5370 // We swap the condition code and synthesize the new opcode.
5371 ReplacementCC = getSwappedCondition(OldCC);
5372 if (ReplacementCC == X86::COND_INVALID)
5373 return false;
5374 ShouldUpdateCC = true;
5375 } else if (ImmDelta != 0) {
5376 unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
5377 // Shift amount for min/max constants to adjust for 8/16/32 instruction
5378 // sizes.
5379 switch (OldCC) {
5380 case X86::COND_L: // x <s (C + 1) --> x <=s C
5381 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5382 return false;
5383 ReplacementCC = X86::COND_LE;
5384 break;
5385 case X86::COND_B: // x <u (C + 1) --> x <=u C
5386 if (ImmDelta != 1 || CmpValue == 0)
5387 return false;
5388 ReplacementCC = X86::COND_BE;
5389 break;
5390 case X86::COND_GE: // x >=s (C + 1) --> x >s C
5391 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5392 return false;
5393 ReplacementCC = X86::COND_G;
5394 break;
5395 case X86::COND_AE: // x >=u (C + 1) --> x >u C
5396 if (ImmDelta != 1 || CmpValue == 0)
5397 return false;
5398 ReplacementCC = X86::COND_A;
5399 break;
5400 case X86::COND_G: // x >s (C - 1) --> x >=s C
5401 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5402 return false;
5403 ReplacementCC = X86::COND_GE;
5404 break;
5405 case X86::COND_A: // x >u (C - 1) --> x >=u C
5406 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5407 return false;
5408 ReplacementCC = X86::COND_AE;
5409 break;
5410 case X86::COND_LE: // x <=s (C - 1) --> x <s C
5411 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5412 return false;
5413 ReplacementCC = X86::COND_L;
5414 break;
5415 case X86::COND_BE: // x <=u (C - 1) --> x <u C
5416 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5417 return false;
5418 ReplacementCC = X86::COND_B;
5419 break;
5420 default:
5421 return false;
5422 }
5423 ShouldUpdateCC = true;
5424 }
5425
5426 if (ShouldUpdateCC && ReplacementCC != OldCC) {
5427 // Push the MachineInstr to OpsToUpdate.
5428 // If it is safe to remove CmpInstr, the condition code of these
5429 // instructions will be modified.
5430 OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
5431 }
5432 if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
5433 // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
5434 FlagsMayLiveOut = false;
5435 break;
5436 }
5437 }
5438
5439 // If we have to update users but EFLAGS is live-out, abort, since we cannot
5440 // easily find all of the users.
5441 if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
5442 for (MachineBasicBlock *Successor : CmpMBB.successors())
5443 if (Successor->isLiveIn(X86::EFLAGS))
5444 return false;
5445 }
5446
5447 // The instruction to be updated is either Sub or MI.
5448 assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
5449 Sub = MI != nullptr ? MI : Sub;
5450 MachineBasicBlock *SubBB = Sub->getParent();
5451 // Move Movr0Inst to the appropriate place before Sub.
5452 if (Movr0Inst) {
5453 // Only move within the same block so we don't accidentally move to a
5454 // block with higher execution frequency.
5455 if (&CmpMBB != SubBB)
5456 return false;
5457 // Look backwards until we find a def that doesn't use the current EFLAGS.
5459 InsertE = Sub->getParent()->rend();
5460 for (; InsertI != InsertE; ++InsertI) {
5461 MachineInstr *Instr = &*InsertI;
5462 if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
5463 Instr->modifiesRegister(X86::EFLAGS, TRI)) {
5464 Movr0Inst->getParent()->remove(Movr0Inst);
5465 Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
5466 Movr0Inst);
5467 break;
5468 }
5469 }
5470 if (InsertI == InsertE)
5471 return false;
5472 }
5473
5474 // Make sure the Sub instruction defines EFLAGS and mark the def as live.
5475 MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
5476 assert(FlagDef && "Unable to locate a def EFLAGS operand");
5477 FlagDef->setIsDead(false);
5478
5479 CmpInstr.eraseFromParent();
5480
5481 // Modify the condition code of instructions in OpsToUpdate.
5482 for (auto &Op : OpsToUpdate) {
5483 Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
5484 .setImm(Op.second);
5485 }
5486 // Add EFLAGS to the live-ins of the blocks between CmpMBB and the flag producer's block.
5487 for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
5488 MBB = *MBB->pred_begin()) {
5489 assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
5490 if (!MBB->isLiveIn(X86::EFLAGS))
5491 MBB->addLiveIn(X86::EFLAGS);
5492 }
5493 return true;
5494}
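// Illustrative sketch of the immediate-delta rewrites handled above, for
// exposition only (register and block names are hypothetical). Assuming an
// existing flag-producing SUB against C and a later compare against C + 1
// consumed by a signed >=, the compare can be removed once the condition
// code is tightened from GE to G:
//
//   %f:gr32 = SUB32ri %x, 41, implicit-def $eflags   ; flag producer, C = 41
//   CMP32ri %x, 42, implicit-def $eflags             ; compare against C + 1
//   JCC_1 %bb.taken, COND_GE, implicit $eflags
// ==>
//   %f:gr32 = SUB32ri %x, 41, implicit-def $eflags
//   JCC_1 %bb.taken, COND_G, implicit $eflags
//
// (JCC_1 really encodes the condition code as an immediate; symbolic names
// are used here for readability.)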
5495
5496/// Try to remove the load by folding it into a register
5497/// operand at the use. We fold the load if it defines a virtual
5498/// register, the virtual register is used once in the same BB, and the
5499/// instructions in between do not load or store and have no side effects.
5500MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
5501 const MachineRegisterInfo *MRI,
5502 Register &FoldAsLoadDefReg,
5503 MachineInstr *&DefMI) const {
5504 // Check whether we can move DefMI here.
5505 DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
5506 assert(DefMI);
5507 bool SawStore = false;
5508 if (!DefMI->isSafeToMove(nullptr, SawStore))
5509 return nullptr;
5510
5511 // Collect information about virtual register operands of MI.
5512 SmallVector<unsigned, 1> SrcOperandIds;
5513 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5514 MachineOperand &MO = MI.getOperand(i);
5515 if (!MO.isReg())
5516 continue;
5517 Register Reg = MO.getReg();
5518 if (Reg != FoldAsLoadDefReg)
5519 continue;
5520 // Do not fold if we have a subreg use or a def.
5521 if (MO.getSubReg() || MO.isDef())
5522 return nullptr;
5523 SrcOperandIds.push_back(i);
5524 }
5525 if (SrcOperandIds.empty())
5526 return nullptr;
5527
5528 // Check whether we can fold the def into SrcOperandId.
5529 if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
5530 FoldAsLoadDefReg = 0;
5531 return FoldMI;
5532 }
5533
5534 return nullptr;
5535}
5536
5537/// \returns true if the instruction can be changed to COPY when imm is 0.
5538static bool canConvert2Copy(unsigned Opc) {
5539 switch (Opc) {
5540 default:
5541 return false;
5542 CASE_ND(ADD64ri32)
5543 CASE_ND(SUB64ri32)
5544 CASE_ND(OR64ri32)
5545 CASE_ND(XOR64ri32)
5546 CASE_ND(ADD32ri)
5547 CASE_ND(SUB32ri)
5548 CASE_ND(OR32ri)
5549 CASE_ND(XOR32ri)
5550 return true;
5551 }
5552}
5553
5554/// Convert an ALUrr opcode to the corresponding ALUri opcode, e.g.
5555/// ADD32rr ==> ADD32ri
5556static unsigned convertALUrr2ALUri(unsigned Opc) {
5557 switch (Opc) {
5558 default:
5559 return 0;
5560#define FROM_TO(FROM, TO) \
5561 case X86::FROM: \
5562 return X86::TO; \
5563 case X86::FROM##_ND: \
5564 return X86::TO##_ND;
5565 FROM_TO(ADD64rr, ADD64ri32)
5566 FROM_TO(ADC64rr, ADC64ri32)
5567 FROM_TO(SUB64rr, SUB64ri32)
5568 FROM_TO(SBB64rr, SBB64ri32)
5569 FROM_TO(AND64rr, AND64ri32)
5570 FROM_TO(OR64rr, OR64ri32)
5571 FROM_TO(XOR64rr, XOR64ri32)
5572 FROM_TO(SHR64rCL, SHR64ri)
5573 FROM_TO(SHL64rCL, SHL64ri)
5574 FROM_TO(SAR64rCL, SAR64ri)
5575 FROM_TO(ROL64rCL, ROL64ri)
5576 FROM_TO(ROR64rCL, ROR64ri)
5577 FROM_TO(RCL64rCL, RCL64ri)
5578 FROM_TO(RCR64rCL, RCR64ri)
5579 FROM_TO(ADD32rr, ADD32ri)
5580 FROM_TO(ADC32rr, ADC32ri)
5581 FROM_TO(SUB32rr, SUB32ri)
5582 FROM_TO(SBB32rr, SBB32ri)
5583 FROM_TO(AND32rr, AND32ri)
5584 FROM_TO(OR32rr, OR32ri)
5585 FROM_TO(XOR32rr, XOR32ri)
5586 FROM_TO(SHR32rCL, SHR32ri)
5587 FROM_TO(SHL32rCL, SHL32ri)
5588 FROM_TO(SAR32rCL, SAR32ri)
5589 FROM_TO(ROL32rCL, ROL32ri)
5590 FROM_TO(ROR32rCL, ROR32ri)
5591 FROM_TO(RCL32rCL, RCL32ri)
5592 FROM_TO(RCR32rCL, RCR32ri)
5593#undef FROM_TO
5594#define FROM_TO(FROM, TO) \
5595 case X86::FROM: \
5596 return X86::TO;
5597 FROM_TO(TEST64rr, TEST64ri32)
5598 FROM_TO(CTEST64rr, CTEST64ri32)
5599 FROM_TO(CMP64rr, CMP64ri32)
5600 FROM_TO(CCMP64rr, CCMP64ri32)
5601 FROM_TO(TEST32rr, TEST32ri)
5602 FROM_TO(CTEST32rr, CTEST32ri)
5603 FROM_TO(CMP32rr, CMP32ri)
5604 FROM_TO(CCMP32rr, CCMP32ri)
5605#undef FROM_TO
5606 }
5607}
5608
5609/// Reg is assigned ImmVal in DefMI, and is used in UseMI.
5610/// If MakeChange is true, this function tries to replace Reg by ImmVal in
5611/// UseMI. If MakeChange is false, just check if folding is possible.
5612//
5613/// \returns true if folding is successful or possible.
5614bool X86InstrInfo::foldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
5615 Register Reg, int64_t ImmVal,
 5616 MachineRegisterInfo *MRI,
 5617 bool MakeChange) const {
5618 bool Modified = false;
5619
 5620 // 64-bit operations accept sign-extended 32-bit immediates.
 5621 // 32-bit operations accept all 32-bit immediates, so we don't need to check
 5622 // them.
5623 const TargetRegisterClass *RC = nullptr;
5624 if (Reg.isVirtual())
5625 RC = MRI->getRegClass(Reg);
5626 if ((Reg.isPhysical() && X86::GR64RegClass.contains(Reg)) ||
5627 (Reg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC))) {
5628 if (!isInt<32>(ImmVal))
5629 return false;
5630 }
5631
5632 if (UseMI.findRegisterUseOperand(Reg)->getSubReg())
5633 return false;
 5634 // An immediate has a larger code size than a register, so avoid folding the
 5635 // immediate if it has more than one use and we are optimizing for size.
5636 if (UseMI.getMF()->getFunction().hasOptSize() && Reg.isVirtual() &&
5637 !MRI->hasOneNonDBGUse(Reg))
5638 return false;
5639
5640 unsigned Opc = UseMI.getOpcode();
5641 unsigned NewOpc;
5642 if (Opc == TargetOpcode::COPY) {
5643 Register ToReg = UseMI.getOperand(0).getReg();
5644 const TargetRegisterClass *RC = nullptr;
5645 if (ToReg.isVirtual())
5646 RC = MRI->getRegClass(ToReg);
5647 bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) ||
5648 (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg));
5649 bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) ||
5650 (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg));
5651 bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) ||
5652 (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg));
5653
5654 if (ImmVal == 0) {
5655 // We have MOV32r0 only.
5656 if (!GR32Reg)
5657 return false;
5658 }
5659
5660 if (GR64Reg) {
5661 if (isUInt<32>(ImmVal))
5662 NewOpc = X86::MOV32ri64;
5663 else
5664 NewOpc = X86::MOV64ri;
5665 } else if (GR32Reg) {
5666 NewOpc = X86::MOV32ri;
5667 if (ImmVal == 0) {
5668 // MOV32r0 clobbers EFLAGS.
 5669 const TargetRegisterInfo *TRI = &getRegisterInfo();
 5670 if (UseMI.getParent()->computeRegisterLiveness(
5671 TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead)
5672 return false;
5673
 5674 // MOV32r0 is different from the other cases because it doesn't encode the
 5675 // immediate in the instruction, so we modify the instruction directly here.
5676 if (!MakeChange)
5677 return true;
5678 UseMI.setDesc(get(X86::MOV32r0));
5679 UseMI.removeOperand(UseMI.findRegisterUseOperandIdx(Reg));
5680 UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
5681 /*isImp=*/true,
5682 /*isKill=*/false,
5683 /*isDead=*/true));
5684 Modified = true;
5685 }
5686 } else if (GR8Reg)
5687 NewOpc = X86::MOV8ri;
5688 else
5689 return false;
5690 } else
5691 NewOpc = convertALUrr2ALUri(Opc);
5692
5693 if (!NewOpc)
5694 return false;
5695
5696 // For SUB instructions the immediate can only be the second source operand.
5697 if ((NewOpc == X86::SUB64ri32 || NewOpc == X86::SUB32ri ||
5698 NewOpc == X86::SBB64ri32 || NewOpc == X86::SBB32ri ||
5699 NewOpc == X86::SUB64ri32_ND || NewOpc == X86::SUB32ri_ND ||
5700 NewOpc == X86::SBB64ri32_ND || NewOpc == X86::SBB32ri_ND) &&
5701 UseMI.findRegisterUseOperandIdx(Reg) != 2)
5702 return false;
5703 // For CMP instructions the immediate can only be at index 1.
5704 if (((NewOpc == X86::CMP64ri32 || NewOpc == X86::CMP32ri) ||
5705 (NewOpc == X86::CCMP64ri32 || NewOpc == X86::CCMP32ri)) &&
5706 UseMI.findRegisterUseOperandIdx(Reg) != 1)
5707 return false;
5708
5709 using namespace X86;
5710 if (isSHL(Opc) || isSHR(Opc) || isSAR(Opc) || isROL(Opc) || isROR(Opc) ||
5711 isRCL(Opc) || isRCR(Opc)) {
5712 unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg);
5713 if (RegIdx < 2)
5714 return false;
5715 if (!isInt<8>(ImmVal))
5716 return false;
5717 assert(Reg == X86::CL);
5718
5719 if (!MakeChange)
5720 return true;
5721 UseMI.setDesc(get(NewOpc));
5722 UseMI.removeOperand(RegIdx);
5723 UseMI.addOperand(MachineOperand::CreateImm(ImmVal));
 5724 // Reg is the physical register $cl, so we can't tell through MRI whether
 5725 // DefMI is dead. Let the caller handle it, or the dead-mi-elimination pass
 5726 // can delete the instruction that defines the dead physical register.
5727 return true;
5728 }
5729
5730 if (!MakeChange)
5731 return true;
5732
5733 if (!Modified) {
5734 // Modify the instruction.
5735 if (ImmVal == 0 && canConvert2Copy(NewOpc) &&
5736 UseMI.registerDefIsDead(X86::EFLAGS)) {
5737 // %100 = add %101, 0
5738 // ==>
5739 // %100 = COPY %101
5740 UseMI.setDesc(get(TargetOpcode::COPY));
5741 UseMI.removeOperand(UseMI.findRegisterUseOperandIdx(Reg));
5742 UseMI.removeOperand(UseMI.findRegisterDefOperandIdx(X86::EFLAGS));
5743 UseMI.untieRegOperand(0);
5746 } else {
5747 unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
5748 unsigned ImmOpNum = 2;
5749 if (!UseMI.getOperand(0).isDef()) {
5750 Op1 = 0; // TEST, CMP, CTEST, CCMP
5751 ImmOpNum = 1;
5752 }
5753 if (Opc == TargetOpcode::COPY)
5754 ImmOpNum = 1;
5755 if (findCommutedOpIndices(UseMI, Op1, Op2) &&
5756 UseMI.getOperand(Op1).getReg() == Reg)
5757 commuteInstruction(UseMI);
5758
5759 assert(UseMI.getOperand(ImmOpNum).getReg() == Reg);
5760 UseMI.setDesc(get(NewOpc));
5761 UseMI.getOperand(ImmOpNum).ChangeToImmediate(ImmVal);
5762 }
5763 }
5764
5765 if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg))
 5766 DefMI->eraseFromParent();
 5767
5768 return true;
5769}
5770
5771/// foldImmediate - 'Reg' is known to be defined by a move immediate
5772/// instruction, try to fold the immediate into the use instruction.
5773bool X86InstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
5774 Register Reg, MachineRegisterInfo *MRI) const {
5775 int64_t ImmVal;
5776 if (!getConstValDefinedInReg(DefMI, Reg, ImmVal))
5777 return false;
5778
5779 return foldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true);
5780}
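// A sketch of the fold performed by foldImmediate/foldImmediateImpl, with
// made-up virtual register numbers. Given a move-immediate definition with a
// single use:
//
//   %1:gr32 = MOV32ri 7
//   %2:gr32 = ADD32rr %0:gr32, killed %1:gr32, implicit-def dead $eflags
// ==>
//   %2:gr32 = ADD32ri %0:gr32, 7, implicit-def dead $eflags
//
// The rr->ri mapping comes from convertALUrr2ALUri, and the now-unused
// MOV32ri is erased once its last use has been folded.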
5781
5782/// Expand a single-def pseudo instruction to a two-addr
5783/// instruction with two undef reads of the register being defined.
5784/// This is used for mapping:
5785/// %xmm4 = V_SET0
5786/// to:
5787/// %xmm4 = PXORrr undef %xmm4, undef %xmm4
5788///
5789static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
5790 const MCInstrDesc &Desc) {
5791 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5792 Register Reg = MIB.getReg(0);
5793 MIB->setDesc(Desc);
5794
5795 // MachineInstr::addOperand() will insert explicit operands before any
5796 // implicit operands.
 5797 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
 5798 // But we don't trust that.
5799 assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand");
5800 return true;
5801}
5802
5803/// Expand a single-def pseudo instruction to a two-addr
5804/// instruction with two %k0 reads.
5805/// This is used for mapping:
5806/// %k4 = K_SET1
5807/// to:
5808/// %k4 = KXNORrr %k0, %k0
5809static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
5810 Register Reg) {
5811 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5812 MIB->setDesc(Desc);
 5813 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
 5814 return true;
5815}
5816
5817static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
5818 bool MinusOne) {
5819 MachineBasicBlock &MBB = *MIB->getParent();
5820 const DebugLoc &DL = MIB->getDebugLoc();
5821 Register Reg = MIB.getReg(0);
5822
5823 // Insert the XOR.
5824 BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
5825 .addReg(Reg, RegState::Undef)
5826 .addReg(Reg, RegState::Undef);
5827
5828 // Turn the pseudo into an INC or DEC.
5829 MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
5830 MIB.addReg(Reg);
5831
5832 return true;
5833}
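// For illustration, expanding the pseudo "$eax = MOV32r1" with the helper
// above yields roughly (implicit operands abbreviated):
//
//   $eax = XOR32rr undef $eax, undef $eax, implicit-def $eflags
//   $eax = INC32r $eax, implicit-def $eflags
//
// MOV32r_1 uses DEC32r instead of INC32r to materialize -1.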
5834
5835static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
5836 const TargetInstrInfo &TII,
5837 const X86Subtarget &Subtarget) {
5838 MachineBasicBlock &MBB = *MIB->getParent();
5839 const DebugLoc &DL = MIB->getDebugLoc();
5840 int64_t Imm = MIB->getOperand(1).getImm();
5841 assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
 5842 MachineBasicBlock::iterator I = MIB.getInstr();
 5843
5844 int StackAdjustment;
5845
5846 if (Subtarget.is64Bit()) {
5847 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
5848 MIB->getOpcode() == X86::MOV32ImmSExti8);
5849
5850 // Can't use push/pop lowering if the function might write to the red zone.
5851 X86MachineFunctionInfo *X86FI =
 5852 MBB.getParent()->getInfo<X86MachineFunctionInfo>();
 5853 if (X86FI->getUsesRedZone()) {
5854 MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8
5855 ? X86::MOV32ri
5856 : X86::MOV64ri));
5857 return true;
5858 }
5859
5860 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
5861 // widen the register if necessary.
5862 StackAdjustment = 8;
5863 BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm);
5864 MIB->setDesc(TII.get(X86::POP64r));
5865 MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
5866 } else {
5867 assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
5868 StackAdjustment = 4;
5869 BuildMI(MBB, I, DL, TII.get(X86::PUSH32i)).addImm(Imm);
5870 MIB->setDesc(TII.get(X86::POP32r));
5871 }
5872 MIB->removeOperand(1);
 5873 MIB->addImplicitDefUseOperands(*MBB.getParent());
 5874
5875 // Build CFI if necessary.
5876 MachineFunction &MF = *MBB.getParent();
5877 const X86FrameLowering *TFL = Subtarget.getFrameLowering();
5878 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
5879 bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
5880 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
5881 if (EmitCFI) {
5882 TFL->BuildCFI(
5883 MBB, I, DL,
5884 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
5885 TFL->BuildCFI(
5886 MBB, std::next(I), DL,
5887 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
5888 }
5889
5890 return true;
5891}
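// A sketch of the push/pop lowering above. In 64-bit mode, assuming the
// function does not use the red zone, "$rbx = MOV64ImmSExti8 -5" becomes:
//
//   PUSH64i32 -5
//   $rbx = POP64r
//
// trading a longer mov-immediate encoding for a push/pop pair, with the
// transient stack adjustment described by the CFI directives built above.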
5892
5893// LoadStackGuard has so far only been implemented for 64-bit MachO. A
5894// different code sequence is needed for other targets.
5895static void expandLoadStackGuard(MachineInstrBuilder &MIB,
5896 const TargetInstrInfo &TII) {
5897 MachineBasicBlock &MBB = *MIB->getParent();
5898 const DebugLoc &DL = MIB->getDebugLoc();
5899 Register Reg = MIB.getReg(0);
5900 const GlobalValue *GV =
5901 cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
5902 auto Flags = MachineMemOperand::MOLoad |
 5903 MachineMemOperand::MODereferenceable |
 5904 MachineMemOperand::MOInvariant;
 5905 MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
 5906 MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
 5907 MachineBasicBlock::iterator I = MIB.getInstr();
 5908
5909 BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg)
5910 .addReg(X86::RIP)
5911 .addImm(1)
5912 .addReg(0)
 5913 .addGlobalAddress(GV, 0, X86II::MO_GOTPCREL)
 5914 .addReg(0)
5915 .addMemOperand(MMO);
5916 MIB->setDebugLoc(DL);
5917 MIB->setDesc(TII.get(X86::MOV64rm));
5918 MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
5919}
5920
5921static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
5922 MachineBasicBlock &MBB = *MIB->getParent();
5923 MachineFunction &MF = *MBB.getParent();
5924 const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
5925 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5926 unsigned XorOp =
5927 MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
5928 MIB->setDesc(TII.get(XorOp));
5929 MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
5930 return true;
5931}
5932
5933// This is used to handle spills for 128/256-bit registers when we have AVX512,
5934// but not VLX. If the spill uses an extended register, we need an instruction
5935// that loads the lower 128/256 bits but is available with only AVX512F.
5936static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
5937 const TargetRegisterInfo *TRI,
5938 const MCInstrDesc &LoadDesc,
5939 const MCInstrDesc &BroadcastDesc, unsigned SubIdx) {
5940 Register DestReg = MIB.getReg(0);
5941 // Check if DestReg is XMM16-31 or YMM16-31.
5942 if (TRI->getEncodingValue(DestReg) < 16) {
5943 // We can use a normal VEX encoded load.
5944 MIB->setDesc(LoadDesc);
5945 } else {
5946 // Use a 128/256-bit VBROADCAST instruction.
5947 MIB->setDesc(BroadcastDesc);
5948 // Change the destination to a 512-bit register.
5949 DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
5950 MIB->getOperand(0).setReg(DestReg);
5951 }
5952 return true;
5953}
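// Illustrative only: with AVX512F but no VLX, an aligned 128-bit load into an
// extended register such as $xmm17 has no legal VEX encoding, so the pseudo
// is rewritten to a broadcast into the zmm super-register:
//
//   $xmm17 = VMOVAPSZ128rm_NOVLX <addr>
// ==>
//   $zmm17 = VBROADCASTF32X4rm <addr>
//
// The desired 128 bits land in the low part of the wider register; the
// broadcast copies in the upper parts are simply ignored.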
5954
5955// This is used to handle spills for 128/256-bit registers when we have AVX512,
5956// but not VLX. If the spill uses an extended register, we need an instruction
5957// that stores the lower 128/256 bits but is available with only AVX512F.
5958static bool expandNOVLXStore(MachineInstrBuilder &MIB,
5959 const TargetRegisterInfo *TRI,
5960 const MCInstrDesc &StoreDesc,
5961 const MCInstrDesc &ExtractDesc, unsigned SubIdx) {
5962 Register SrcReg = MIB.getReg(X86::AddrNumOperands);
 5963 // Check if SrcReg is XMM16-31 or YMM16-31.
5964 if (TRI->getEncodingValue(SrcReg) < 16) {
5965 // We can use a normal VEX encoded store.
5966 MIB->setDesc(StoreDesc);
5967 } else {
5968 // Use a VEXTRACTF instruction.
5969 MIB->setDesc(ExtractDesc);
 5970 // Change the source to a 512-bit register.
 5971 SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
 5972 MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
 5973 MIB.addImm(0x0); // Append an immediate to extract from the lower bits.
5974 }
5975
5976 return true;
5977}
5978
5979static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
5980 MIB->setDesc(Desc);
5981 int64_t ShiftAmt = MIB->getOperand(2).getImm();
5982 // Temporarily remove the immediate so we can add another source register.
5983 MIB->removeOperand(2);
5984 // Add the register. Don't copy the kill flag if there is one.
5985 MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef()));
5986 // Add back the immediate.
5987 MIB.addImm(ShiftAmt);
5988 return true;
5989}
5990
5991bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
5992 bool HasAVX = Subtarget.hasAVX();
5993 MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
5994 switch (MI.getOpcode()) {
5995 case X86::MOV32r0:
5996 return Expand2AddrUndef(MIB, get(X86::XOR32rr));
5997 case X86::MOV32r1:
5998 return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
5999 case X86::MOV32r_1:
6000 return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
6001 case X86::MOV32ImmSExti8:
6002 case X86::MOV64ImmSExti8:
6003 return ExpandMOVImmSExti8(MIB, *this, Subtarget);
6004 case X86::SETB_C32r:
6005 return Expand2AddrUndef(MIB, get(X86::SBB32rr));
6006 case X86::SETB_C64r:
6007 return Expand2AddrUndef(MIB, get(X86::SBB64rr));
6008 case X86::MMX_SET0:
6009 return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
6010 case X86::V_SET0:
6011 case X86::FsFLD0SS:
6012 case X86::FsFLD0SD:
6013 case X86::FsFLD0SH:
6014 case X86::FsFLD0F128:
6015 return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
6016 case X86::AVX_SET0: {
6017 assert(HasAVX && "AVX not supported");
 6018 const TargetRegisterInfo *TRI = &getRegisterInfo();
 6019 Register SrcReg = MIB.getReg(0);
6020 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6021 MIB->getOperand(0).setReg(XReg);
6022 Expand2AddrUndef(MIB, get(X86::VXORPSrr));
6023 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6024 return true;
6025 }
6026 case X86::AVX512_128_SET0:
6027 case X86::AVX512_FsFLD0SH:
6028 case X86::AVX512_FsFLD0SS:
6029 case X86::AVX512_FsFLD0SD:
6030 case X86::AVX512_FsFLD0F128: {
6031 bool HasVLX = Subtarget.hasVLX();
6032 Register SrcReg = MIB.getReg(0);
 6033 const TargetRegisterInfo *TRI = &getRegisterInfo();
 6034 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
6035 return Expand2AddrUndef(MIB,
6036 get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6037 // Extended register without VLX. Use a larger XOR.
6038 SrcReg =
6039 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
6040 MIB->getOperand(0).setReg(SrcReg);
6041 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6042 }
6043 case X86::AVX512_256_SET0:
6044 case X86::AVX512_512_SET0: {
6045 bool HasVLX = Subtarget.hasVLX();
6046 Register SrcReg = MIB.getReg(0);
 6047 const TargetRegisterInfo *TRI = &getRegisterInfo();
 6048 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
6049 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6050 MIB->getOperand(0).setReg(XReg);
6051 Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6052 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6053 return true;
6054 }
6055 if (MI.getOpcode() == X86::AVX512_256_SET0) {
6056 // No VLX so we must reference a zmm.
6057 unsigned ZReg =
6058 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
6059 MIB->getOperand(0).setReg(ZReg);
6060 }
6061 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6062 }
6063 case X86::V_SETALLONES:
6064 return Expand2AddrUndef(MIB,
6065 get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
6066 case X86::AVX2_SETALLONES:
6067 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6068 case X86::AVX1_SETALLONES: {
6069 Register Reg = MIB.getReg(0);
6070 // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
6071 MIB->setDesc(get(X86::VCMPPSYrri));
6072 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
6073 return true;
6074 }
6075 case X86::AVX512_512_SETALLONES: {
6076 Register Reg = MIB.getReg(0);
6077 MIB->setDesc(get(X86::VPTERNLOGDZrri));
6078 // VPTERNLOGD needs 3 register inputs and an immediate.
6079 // 0xff will return 1s for any input.
6080 MIB.addReg(Reg, RegState::Undef)
6081 .addReg(Reg, RegState::Undef)
6082 .addReg(Reg, RegState::Undef)
6083 .addImm(0xff);
6084 return true;
6085 }
6086 case X86::AVX512_512_SEXT_MASK_32:
6087 case X86::AVX512_512_SEXT_MASK_64: {
6088 Register Reg = MIB.getReg(0);
6089 Register MaskReg = MIB.getReg(1);
6090 unsigned MaskState = getRegState(MIB->getOperand(1));
6091 unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64)
6092 ? X86::VPTERNLOGQZrrikz
6093 : X86::VPTERNLOGDZrrikz;
6094 MI.removeOperand(1);
6095 MIB->setDesc(get(Opc));
6096 // VPTERNLOG needs 3 register inputs and an immediate.
6097 // 0xff will return 1s for any input.
6098 MIB.addReg(Reg, RegState::Undef)
6099 .addReg(MaskReg, MaskState)
6100 .addReg(Reg, RegState::Undef)
6101 .addReg(Reg, RegState::Undef)
6102 .addImm(0xff);
6103 return true;
6104 }
6105 case X86::VMOVAPSZ128rm_NOVLX:
6106 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
6107 get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
6108 case X86::VMOVUPSZ128rm_NOVLX:
6109 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
6110 get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
6111 case X86::VMOVAPSZ256rm_NOVLX:
6112 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
6113 get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
6114 case X86::VMOVUPSZ256rm_NOVLX:
6115 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
6116 get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
6117 case X86::VMOVAPSZ128mr_NOVLX:
6118 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
6119 get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
6120 case X86::VMOVUPSZ128mr_NOVLX:
6121 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
6122 get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
6123 case X86::VMOVAPSZ256mr_NOVLX:
6124 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
6125 get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
6126 case X86::VMOVUPSZ256mr_NOVLX:
6127 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
6128 get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
6129 case X86::MOV32ri64: {
6130 Register Reg = MIB.getReg(0);
6131 Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
6132 MI.setDesc(get(X86::MOV32ri));
6133 MIB->getOperand(0).setReg(Reg32);
 6134 MIB.addReg(Reg, RegState::ImplicitDefine);
 6135 return true;
6136 }
6137
6138 case X86::RDFLAGS32:
6139 case X86::RDFLAGS64: {
6140 unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64;
6141 MachineBasicBlock &MBB = *MIB->getParent();
6142
6143 MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(),
6144 get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
6145 .getInstr();
6146
6147 // Permit reads of the EFLAGS and DF registers without them being defined.
6148 // This intrinsic exists to read external processor state in flags, such as
6149 // the trap flag, interrupt flag, and direction flag, none of which are
6150 // modeled by the backend.
6151 assert(NewMI->getOperand(2).getReg() == X86::EFLAGS &&
6152 "Unexpected register in operand! Should be EFLAGS.");
6153 NewMI->getOperand(2).setIsUndef();
6154 assert(NewMI->getOperand(3).getReg() == X86::DF &&
6155 "Unexpected register in operand! Should be DF.");
6156 NewMI->getOperand(3).setIsUndef();
6157
6158 MIB->setDesc(get(Is64Bit ? X86::POP64r : X86::POP32r));
6159 return true;
6160 }
6161
6162 case X86::WRFLAGS32:
6163 case X86::WRFLAGS64: {
6164 unsigned Is64Bit = MI.getOpcode() == X86::WRFLAGS64;
6165 MachineBasicBlock &MBB = *MIB->getParent();
6166
6167 BuildMI(MBB, MI, MIB->getDebugLoc(),
6168 get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
6169 .addReg(MI.getOperand(0).getReg());
6170 BuildMI(MBB, MI, MIB->getDebugLoc(),
6171 get(Is64Bit ? X86::POPF64 : X86::POPF32));
6172 MI.eraseFromParent();
6173 return true;
6174 }
6175
6176 // KNL does not recognize dependency-breaking idioms for mask registers,
6177 // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
6178 // Using %k0 as the undef input register is a performance heuristic based
6179 // on the assumption that %k0 is used less frequently than the other mask
6180 // registers, since it is not usable as a write mask.
6181 // FIXME: A more advanced approach would be to choose the best input mask
6182 // register based on context.
6183 case X86::KSET0W:
6184 return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
6185 case X86::KSET0D:
6186 return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
6187 case X86::KSET0Q:
6188 return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
6189 case X86::KSET1W:
6190 return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
6191 case X86::KSET1D:
6192 return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
6193 case X86::KSET1Q:
6194 return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
6195 case TargetOpcode::LOAD_STACK_GUARD:
6196 expandLoadStackGuard(MIB, *this);
6197 return true;
6198 case X86::XOR64_FP:
6199 case X86::XOR32_FP:
6200 return expandXorFP(MIB, *this);
6201 case X86::SHLDROT32ri:
6202 return expandSHXDROT(MIB, get(X86::SHLD32rri8));
6203 case X86::SHLDROT64ri:
6204 return expandSHXDROT(MIB, get(X86::SHLD64rri8));
6205 case X86::SHRDROT32ri:
6206 return expandSHXDROT(MIB, get(X86::SHRD32rri8));
6207 case X86::SHRDROT64ri:
6208 return expandSHXDROT(MIB, get(X86::SHRD64rri8));
6209 case X86::ADD8rr_DB:
6210 MIB->setDesc(get(X86::OR8rr));
6211 break;
6212 case X86::ADD16rr_DB:
6213 MIB->setDesc(get(X86::OR16rr));
6214 break;
6215 case X86::ADD32rr_DB:
6216 MIB->setDesc(get(X86::OR32rr));
6217 break;
6218 case X86::ADD64rr_DB:
6219 MIB->setDesc(get(X86::OR64rr));
6220 break;
6221 case X86::ADD8ri_DB:
6222 MIB->setDesc(get(X86::OR8ri));
6223 break;
6224 case X86::ADD16ri_DB:
6225 MIB->setDesc(get(X86::OR16ri));
6226 break;
6227 case X86::ADD32ri_DB:
6228 MIB->setDesc(get(X86::OR32ri));
6229 break;
6230 case X86::ADD64ri32_DB:
6231 MIB->setDesc(get(X86::OR64ri32));
6232 break;
6233 }
6234 return false;
6235}
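// A sketch of one expansion above: AVX_SET0 zeroes a full ymm register, but
// only an xmm-sized VXORPSrr is emitted; the ymm result is expressed as an
// implicit def on the same instruction:
//
//   $ymm0 = AVX_SET0
// ==>
//   $xmm0 = VXORPSrr undef $xmm0, undef $xmm0, implicit-def $ymm0
//
// The AVX512_256/512_SET0 cases follow the same pattern and fall back to a
// full VPXORDZrr on the zmm super-register when VLX is unavailable and the
// register is one of (x/y)mm16-31.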
6236
6237/// Return true for all instructions that only update
6238/// the first 32 or 64 bits of the destination register and leave the rest
6239/// unmodified. This can be used to avoid folding loads if the instructions
6240/// only update part of the destination register, and the non-updated part is
6241/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
6242/// instructions breaks the partial register dependency and can improve
6243/// performance. e.g.:
6244///
6245/// movss (%rdi), %xmm0
6246/// cvtss2sd %xmm0, %xmm0
6247///
6248/// Instead of
6249/// cvtss2sd (%rdi), %xmm0
6250///
6251/// FIXME: This should be turned into a TSFlags.
6252///
6253static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
6254 bool ForLoadFold = false) {
6255 switch (Opcode) {
6256 case X86::CVTSI2SSrr:
6257 case X86::CVTSI2SSrm:
6258 case X86::CVTSI642SSrr:
6259 case X86::CVTSI642SSrm:
6260 case X86::CVTSI2SDrr:
6261 case X86::CVTSI2SDrm:
6262 case X86::CVTSI642SDrr:
6263 case X86::CVTSI642SDrm:
 6264 // Load folding won't affect the undef register update since the input is
6265 // a GPR.
6266 return !ForLoadFold;
6267 case X86::CVTSD2SSrr:
6268 case X86::CVTSD2SSrm:
6269 case X86::CVTSS2SDrr:
6270 case X86::CVTSS2SDrm:
6271 case X86::MOVHPDrm:
6272 case X86::MOVHPSrm:
6273 case X86::MOVLPDrm:
6274 case X86::MOVLPSrm:
6275 case X86::RCPSSr:
6276 case X86::RCPSSm:
6277 case X86::RCPSSr_Int:
6278 case X86::RCPSSm_Int:
6279 case X86::ROUNDSDri:
6280 case X86::ROUNDSDmi:
6281 case X86::ROUNDSSri:
6282 case X86::ROUNDSSmi:
6283 case X86::RSQRTSSr:
6284 case X86::RSQRTSSm:
6285 case X86::RSQRTSSr_Int:
6286 case X86::RSQRTSSm_Int:
6287 case X86::SQRTSSr:
6288 case X86::SQRTSSm:
6289 case X86::SQRTSSr_Int:
6290 case X86::SQRTSSm_Int:
6291 case X86::SQRTSDr:
6292 case X86::SQRTSDm:
6293 case X86::SQRTSDr_Int:
6294 case X86::SQRTSDm_Int:
6295 return true;
6296 case X86::VFCMULCPHZ128rm:
6297 case X86::VFCMULCPHZ128rmb:
6298 case X86::VFCMULCPHZ128rmbkz:
6299 case X86::VFCMULCPHZ128rmkz:
6300 case X86::VFCMULCPHZ128rr:
6301 case X86::VFCMULCPHZ128rrkz:
6302 case X86::VFCMULCPHZ256rm:
6303 case X86::VFCMULCPHZ256rmb:
6304 case X86::VFCMULCPHZ256rmbkz:
6305 case X86::VFCMULCPHZ256rmkz:
6306 case X86::VFCMULCPHZ256rr:
6307 case X86::VFCMULCPHZ256rrkz:
6308 case X86::VFCMULCPHZrm:
6309 case X86::VFCMULCPHZrmb:
6310 case X86::VFCMULCPHZrmbkz:
6311 case X86::VFCMULCPHZrmkz:
6312 case X86::VFCMULCPHZrr:
6313 case X86::VFCMULCPHZrrb:
6314 case X86::VFCMULCPHZrrbkz:
6315 case X86::VFCMULCPHZrrkz:
6316 case X86::VFMULCPHZ128rm:
6317 case X86::VFMULCPHZ128rmb:
6318 case X86::VFMULCPHZ128rmbkz:
6319 case X86::VFMULCPHZ128rmkz:
6320 case X86::VFMULCPHZ128rr:
6321 case X86::VFMULCPHZ128rrkz:
6322 case X86::VFMULCPHZ256rm:
6323 case X86::VFMULCPHZ256rmb:
6324 case X86::VFMULCPHZ256rmbkz:
6325 case X86::VFMULCPHZ256rmkz:
6326 case X86::VFMULCPHZ256rr:
6327 case X86::VFMULCPHZ256rrkz:
6328 case X86::VFMULCPHZrm:
6329 case X86::VFMULCPHZrmb:
6330 case X86::VFMULCPHZrmbkz:
6331 case X86::VFMULCPHZrmkz:
6332 case X86::VFMULCPHZrr:
6333 case X86::VFMULCPHZrrb:
6334 case X86::VFMULCPHZrrbkz:
6335 case X86::VFMULCPHZrrkz:
6336 case X86::VFCMULCSHZrm:
6337 case X86::VFCMULCSHZrmkz:
6338 case X86::VFCMULCSHZrr:
6339 case X86::VFCMULCSHZrrb:
6340 case X86::VFCMULCSHZrrbkz:
6341 case X86::VFCMULCSHZrrkz:
6342 case X86::VFMULCSHZrm:
6343 case X86::VFMULCSHZrmkz:
6344 case X86::VFMULCSHZrr:
6345 case X86::VFMULCSHZrrb:
6346 case X86::VFMULCSHZrrbkz:
6347 case X86::VFMULCSHZrrkz:
6348 return Subtarget.hasMULCFalseDeps();
6349 case X86::VPERMDYrm:
6350 case X86::VPERMDYrr:
6351 case X86::VPERMQYmi:
6352 case X86::VPERMQYri:
6353 case X86::VPERMPSYrm:
6354 case X86::VPERMPSYrr:
6355 case X86::VPERMPDYmi:
6356 case X86::VPERMPDYri:
6357 case X86::VPERMDZ256rm:
6358 case X86::VPERMDZ256rmb:
6359 case X86::VPERMDZ256rmbkz:
6360 case X86::VPERMDZ256rmkz:
6361 case X86::VPERMDZ256rr:
6362 case X86::VPERMDZ256rrkz:
6363 case X86::VPERMDZrm:
6364 case X86::VPERMDZrmb:
6365 case X86::VPERMDZrmbkz:
6366 case X86::VPERMDZrmkz:
6367 case X86::VPERMDZrr:
6368 case X86::VPERMDZrrkz:
6369 case X86::VPERMQZ256mbi:
6370 case X86::VPERMQZ256mbikz:
6371 case X86::VPERMQZ256mi:
6372 case X86::VPERMQZ256mikz:
6373 case X86::VPERMQZ256ri:
6374 case X86::VPERMQZ256rikz:
6375 case X86::VPERMQZ256rm:
6376 case X86::VPERMQZ256rmb:
6377 case X86::VPERMQZ256rmbkz:
6378 case X86::VPERMQZ256rmkz:
6379 case X86::VPERMQZ256rr:
6380 case X86::VPERMQZ256rrkz:
6381 case X86::VPERMQZmbi:
6382 case X86::VPERMQZmbikz:
6383 case X86::VPERMQZmi:
6384 case X86::VPERMQZmikz:
6385 case X86::VPERMQZri:
6386 case X86::VPERMQZrikz:
6387 case X86::VPERMQZrm:
6388 case X86::VPERMQZrmb:
6389 case X86::VPERMQZrmbkz:
6390 case X86::VPERMQZrmkz:
6391 case X86::VPERMQZrr:
6392 case X86::VPERMQZrrkz:
6393 case X86::VPERMPSZ256rm:
6394 case X86::VPERMPSZ256rmb:
6395 case X86::VPERMPSZ256rmbkz:
6396 case X86::VPERMPSZ256rmkz:
6397 case X86::VPERMPSZ256rr:
6398 case X86::VPERMPSZ256rrkz:
6399 case X86::VPERMPSZrm:
6400 case X86::VPERMPSZrmb:
6401 case X86::VPERMPSZrmbkz:
6402 case X86::VPERMPSZrmkz:
6403 case X86::VPERMPSZrr:
6404 case X86::VPERMPSZrrkz:
6405 case X86::VPERMPDZ256mbi:
6406 case X86::VPERMPDZ256mbikz:
6407 case X86::VPERMPDZ256mi:
6408 case X86::VPERMPDZ256mikz:
6409 case X86::VPERMPDZ256ri:
6410 case X86::VPERMPDZ256rikz:
6411 case X86::VPERMPDZ256rm:
6412 case X86::VPERMPDZ256rmb:
6413 case X86::VPERMPDZ256rmbkz:
6414 case X86::VPERMPDZ256rmkz:
6415 case X86::VPERMPDZ256rr:
6416 case X86::VPERMPDZ256rrkz:
6417 case X86::VPERMPDZmbi:
6418 case X86::VPERMPDZmbikz:
6419 case X86::VPERMPDZmi:
6420 case X86::VPERMPDZmikz:
6421 case X86::VPERMPDZri:
6422 case X86::VPERMPDZrikz:
6423 case X86::VPERMPDZrm:
6424 case X86::VPERMPDZrmb:
6425 case X86::VPERMPDZrmbkz:
6426 case X86::VPERMPDZrmkz:
6427 case X86::VPERMPDZrr:
6428 case X86::VPERMPDZrrkz:
6429 return Subtarget.hasPERMFalseDeps();
6430 case X86::VRANGEPDZ128rmbi:
6431 case X86::VRANGEPDZ128rmbikz:
6432 case X86::VRANGEPDZ128rmi:
6433 case X86::VRANGEPDZ128rmikz:
6434 case X86::VRANGEPDZ128rri:
6435 case X86::VRANGEPDZ128rrikz:
6436 case X86::VRANGEPDZ256rmbi:
6437 case X86::VRANGEPDZ256rmbikz:
6438 case X86::VRANGEPDZ256rmi:
6439 case X86::VRANGEPDZ256rmikz:
6440 case X86::VRANGEPDZ256rri:
6441 case X86::VRANGEPDZ256rrikz:
6442 case X86::VRANGEPDZrmbi:
6443 case X86::VRANGEPDZrmbikz:
6444 case X86::VRANGEPDZrmi:
6445 case X86::VRANGEPDZrmikz:
6446 case X86::VRANGEPDZrri:
6447 case X86::VRANGEPDZrrib:
6448 case X86::VRANGEPDZrribkz:
6449 case X86::VRANGEPDZrrikz:
6450 case X86::VRANGEPSZ128rmbi:
6451 case X86::VRANGEPSZ128rmbikz:
6452 case X86::VRANGEPSZ128rmi:
6453 case X86::VRANGEPSZ128rmikz:
6454 case X86::VRANGEPSZ128rri:
6455 case X86::VRANGEPSZ128rrikz:
6456 case X86::VRANGEPSZ256rmbi:
6457 case X86::VRANGEPSZ256rmbikz:
6458 case X86::VRANGEPSZ256rmi:
6459 case X86::VRANGEPSZ256rmikz:
6460 case X86::VRANGEPSZ256rri:
6461 case X86::VRANGEPSZ256rrikz:
6462 case X86::VRANGEPSZrmbi:
6463 case X86::VRANGEPSZrmbikz:
6464 case X86::VRANGEPSZrmi:
6465 case X86::VRANGEPSZrmikz:
6466 case X86::VRANGEPSZrri:
6467 case X86::VRANGEPSZrrib:
6468 case X86::VRANGEPSZrribkz:
6469 case X86::VRANGEPSZrrikz:
6470 case X86::VRANGESDZrmi:
6471 case X86::VRANGESDZrmikz:
6472 case X86::VRANGESDZrri:
6473 case X86::VRANGESDZrrib:
6474 case X86::VRANGESDZrribkz:
6475 case X86::VRANGESDZrrikz:
6476 case X86::VRANGESSZrmi:
6477 case X86::VRANGESSZrmikz:
6478 case X86::VRANGESSZrri:
6479 case X86::VRANGESSZrrib:
6480 case X86::VRANGESSZrribkz:
6481 case X86::VRANGESSZrrikz:
6482 return Subtarget.hasRANGEFalseDeps();
6483 case X86::VGETMANTSSZrmi:
6484 case X86::VGETMANTSSZrmikz:
6485 case X86::VGETMANTSSZrri:
6486 case X86::VGETMANTSSZrrib:
6487 case X86::VGETMANTSSZrribkz:
6488 case X86::VGETMANTSSZrrikz:
6489 case X86::VGETMANTSDZrmi:
6490 case X86::VGETMANTSDZrmikz:
6491 case X86::VGETMANTSDZrri:
6492 case X86::VGETMANTSDZrrib:
6493 case X86::VGETMANTSDZrribkz:
6494 case X86::VGETMANTSDZrrikz:
6495 case X86::VGETMANTSHZrmi:
6496 case X86::VGETMANTSHZrmikz:
6497 case X86::VGETMANTSHZrri:
6498 case X86::VGETMANTSHZrrib:
6499 case X86::VGETMANTSHZrribkz:
6500 case X86::VGETMANTSHZrrikz:
6501 case X86::VGETMANTPSZ128rmbi:
6502 case X86::VGETMANTPSZ128rmbikz:
6503 case X86::VGETMANTPSZ128rmi:
6504 case X86::VGETMANTPSZ128rmikz:
6505 case X86::VGETMANTPSZ256rmbi:
6506 case X86::VGETMANTPSZ256rmbikz:
6507 case X86::VGETMANTPSZ256rmi:
6508 case X86::VGETMANTPSZ256rmikz:
6509 case X86::VGETMANTPSZrmbi:
6510 case X86::VGETMANTPSZrmbikz:
6511 case X86::VGETMANTPSZrmi:
6512 case X86::VGETMANTPSZrmikz:
6513 case X86::VGETMANTPDZ128rmbi:
6514 case X86::VGETMANTPDZ128rmbikz:
6515 case X86::VGETMANTPDZ128rmi:
6516 case X86::VGETMANTPDZ128rmikz:
6517 case X86::VGETMANTPDZ256rmbi:
6518 case X86::VGETMANTPDZ256rmbikz:
6519 case X86::VGETMANTPDZ256rmi:
6520 case X86::VGETMANTPDZ256rmikz:
6521 case X86::VGETMANTPDZrmbi:
6522 case X86::VGETMANTPDZrmbikz:
6523 case X86::VGETMANTPDZrmi:
6524 case X86::VGETMANTPDZrmikz:
6525 return Subtarget.hasGETMANTFalseDeps();
6526 case X86::VPMULLQZ128rm:
6527 case X86::VPMULLQZ128rmb:
6528 case X86::VPMULLQZ128rmbkz:
6529 case X86::VPMULLQZ128rmkz:
6530 case X86::VPMULLQZ128rr:
6531 case X86::VPMULLQZ128rrkz:
6532 case X86::VPMULLQZ256rm:
6533 case X86::VPMULLQZ256rmb:
6534 case X86::VPMULLQZ256rmbkz:
6535 case X86::VPMULLQZ256rmkz:
6536 case X86::VPMULLQZ256rr:
6537 case X86::VPMULLQZ256rrkz:
6538 case X86::VPMULLQZrm:
6539 case X86::VPMULLQZrmb:
6540 case X86::VPMULLQZrmbkz:
6541 case X86::VPMULLQZrmkz:
6542 case X86::VPMULLQZrr:
6543 case X86::VPMULLQZrrkz:
6544 return Subtarget.hasMULLQFalseDeps();
6545 // GPR
6546 case X86::POPCNT32rm:
6547 case X86::POPCNT32rr:
6548 case X86::POPCNT64rm:
6549 case X86::POPCNT64rr:
6550 return Subtarget.hasPOPCNTFalseDeps();
6551 case X86::LZCNT32rm:
6552 case X86::LZCNT32rr:
6553 case X86::LZCNT64rm:
6554 case X86::LZCNT64rr:
6555 case X86::TZCNT32rm:
6556 case X86::TZCNT32rr:
6557 case X86::TZCNT64rm:
6558 case X86::TZCNT64rr:
6559 return Subtarget.hasLZCNTFalseDeps();
6560 }
6561
6562 return false;
6563}
6564
6565/// Inform the BreakFalseDeps pass how many idle
6566/// instructions we would like before a partial register update.
6567unsigned X86InstrInfo::getPartialRegUpdateClearance(
6568 const MachineInstr &MI, unsigned OpNum,
6569 const TargetRegisterInfo *TRI) const {
6570 if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
6571 return 0;
6572
6573 // If MI is marked as reading Reg, the partial register update is wanted.
6574 const MachineOperand &MO = MI.getOperand(0);
6575 Register Reg = MO.getReg();
6576 if (Reg.isVirtual()) {
6577 if (MO.readsReg() || MI.readsVirtualRegister(Reg))
6578 return 0;
6579 } else {
6580 if (MI.readsRegister(Reg, TRI))
6581 return 0;
6582 }
6583
6584 // If any instructions in the clearance range are reading Reg, insert a
6585 // dependency breaking instruction, which is inexpensive and is likely to
 6586 // be hidden in other instructions' cycles.
 6587 return PartialRegUpdateClearance;
 6588}
6589
6590// Return true for any instruction that copies the high bits of the first source
6591// operand into the unused high bits of the destination operand.
6592// Also returns true for instructions that have two inputs where one may
6593// be undef and we want it to use the same register as the other input.
6594static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
6595 bool ForLoadFold = false) {
6596 // Set the OpNum parameter to the first source operand.
6597 switch (Opcode) {
6598 case X86::MMX_PUNPCKHBWrr:
6599 case X86::MMX_PUNPCKHWDrr:
6600 case X86::MMX_PUNPCKHDQrr:
6601 case X86::MMX_PUNPCKLBWrr:
6602 case X86::MMX_PUNPCKLWDrr:
6603 case X86::MMX_PUNPCKLDQrr:
6604 case X86::MOVHLPSrr:
6605 case X86::PACKSSWBrr:
6606 case X86::PACKUSWBrr:
6607 case X86::PACKSSDWrr:
6608 case X86::PACKUSDWrr:
6609 case X86::PUNPCKHBWrr:
6610 case X86::PUNPCKLBWrr:
6611 case X86::PUNPCKHWDrr:
6612 case X86::PUNPCKLWDrr:
6613 case X86::PUNPCKHDQrr:
6614 case X86::PUNPCKLDQrr:
6615 case X86::PUNPCKHQDQrr:
6616 case X86::PUNPCKLQDQrr:
6617 case X86::SHUFPDrri:
6618 case X86::SHUFPSrri:
6619 // These instructions are sometimes used with an undef first or second
6620 // source. Return true here so BreakFalseDeps will assign this source to the
6621 // same register as the first source to avoid a false dependency.
6622 // Operand 1 of these instructions is tied so they're separate from their
6623 // VEX counterparts.
6624 return OpNum == 2 && !ForLoadFold;
6625
6626 case X86::VMOVLHPSrr:
6627 case X86::VMOVLHPSZrr:
6628 case X86::VPACKSSWBrr:
6629 case X86::VPACKUSWBrr:
6630 case X86::VPACKSSDWrr:
6631 case X86::VPACKUSDWrr:
6632 case X86::VPACKSSWBZ128rr:
6633 case X86::VPACKUSWBZ128rr:
6634 case X86::VPACKSSDWZ128rr:
6635 case X86::VPACKUSDWZ128rr:
6636 case X86::VPERM2F128rr:
6637 case X86::VPERM2I128rr:
6638 case X86::VSHUFF32X4Z256rri:
6639 case X86::VSHUFF32X4Zrri:
6640 case X86::VSHUFF64X2Z256rri:
6641 case X86::VSHUFF64X2Zrri:
6642 case X86::VSHUFI32X4Z256rri:
6643 case X86::VSHUFI32X4Zrri:
6644 case X86::VSHUFI64X2Z256rri:
6645 case X86::VSHUFI64X2Zrri:
6646 case X86::VPUNPCKHBWrr:
6647 case X86::VPUNPCKLBWrr:
6648 case X86::VPUNPCKHBWYrr:
6649 case X86::VPUNPCKLBWYrr:
6650 case X86::VPUNPCKHBWZ128rr:
6651 case X86::VPUNPCKLBWZ128rr:
6652 case X86::VPUNPCKHBWZ256rr:
6653 case X86::VPUNPCKLBWZ256rr:
6654 case X86::VPUNPCKHBWZrr:
6655 case X86::VPUNPCKLBWZrr:
6656 case X86::VPUNPCKHWDrr:
6657 case X86::VPUNPCKLWDrr:
6658 case X86::VPUNPCKHWDYrr:
6659 case X86::VPUNPCKLWDYrr:
6660 case X86::VPUNPCKHWDZ128rr:
6661 case X86::VPUNPCKLWDZ128rr:
6662 case X86::VPUNPCKHWDZ256rr:
6663 case X86::VPUNPCKLWDZ256rr:
6664 case X86::VPUNPCKHWDZrr:
6665 case X86::VPUNPCKLWDZrr:
6666 case X86::VPUNPCKHDQrr:
6667 case X86::VPUNPCKLDQrr:
6668 case X86::VPUNPCKHDQYrr:
6669 case X86::VPUNPCKLDQYrr:
6670 case X86::VPUNPCKHDQZ128rr:
6671 case X86::VPUNPCKLDQZ128rr:
6672 case X86::VPUNPCKHDQZ256rr:
6673 case X86::VPUNPCKLDQZ256rr:
6674 case X86::VPUNPCKHDQZrr:
6675 case X86::VPUNPCKLDQZrr:
6676 case X86::VPUNPCKHQDQrr:
6677 case X86::VPUNPCKLQDQrr:
6678 case X86::VPUNPCKHQDQYrr:
6679 case X86::VPUNPCKLQDQYrr:
6680 case X86::VPUNPCKHQDQZ128rr:
6681 case X86::VPUNPCKLQDQZ128rr:
6682 case X86::VPUNPCKHQDQZ256rr:
6683 case X86::VPUNPCKLQDQZ256rr:
6684 case X86::VPUNPCKHQDQZrr:
6685 case X86::VPUNPCKLQDQZrr:
6686 // These instructions are sometimes used with an undef first or second
6687 // source. Return true here so BreakFalseDeps will assign this source to the
6688 // same register as the first source to avoid a false dependency.
6689 return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
6690
6691 case X86::VCVTSI2SSrr:
6692 case X86::VCVTSI2SSrm:
6693 case X86::VCVTSI2SSrr_Int:
6694 case X86::VCVTSI2SSrm_Int:
6695 case X86::VCVTSI642SSrr:
6696 case X86::VCVTSI642SSrm:
6697 case X86::VCVTSI642SSrr_Int:
6698 case X86::VCVTSI642SSrm_Int:
6699 case X86::VCVTSI2SDrr:
6700 case X86::VCVTSI2SDrm:
6701 case X86::VCVTSI2SDrr_Int:
6702 case X86::VCVTSI2SDrm_Int:
6703 case X86::VCVTSI642SDrr:
6704 case X86::VCVTSI642SDrm:
6705 case X86::VCVTSI642SDrr_Int:
6706 case X86::VCVTSI642SDrm_Int:
6707 // AVX-512
6708 case X86::VCVTSI2SSZrr:
6709 case X86::VCVTSI2SSZrm:
6710 case X86::VCVTSI2SSZrr_Int:
6711 case X86::VCVTSI2SSZrrb_Int:
6712 case X86::VCVTSI2SSZrm_Int:
6713 case X86::VCVTSI642SSZrr:
6714 case X86::VCVTSI642SSZrm:
6715 case X86::VCVTSI642SSZrr_Int:
6716 case X86::VCVTSI642SSZrrb_Int:
6717 case X86::VCVTSI642SSZrm_Int:
6718 case X86::VCVTSI2SDZrr:
6719 case X86::VCVTSI2SDZrm:
6720 case X86::VCVTSI2SDZrr_Int:
6721 case X86::VCVTSI2SDZrm_Int:
6722 case X86::VCVTSI642SDZrr:
6723 case X86::VCVTSI642SDZrm:
6724 case X86::VCVTSI642SDZrr_Int:
6725 case X86::VCVTSI642SDZrrb_Int:
6726 case X86::VCVTSI642SDZrm_Int:
6727 case X86::VCVTUSI2SSZrr:
6728 case X86::VCVTUSI2SSZrm:
6729 case X86::VCVTUSI2SSZrr_Int:
6730 case X86::VCVTUSI2SSZrrb_Int:
6731 case X86::VCVTUSI2SSZrm_Int:
6732 case X86::VCVTUSI642SSZrr:
6733 case X86::VCVTUSI642SSZrm:
6734 case X86::VCVTUSI642SSZrr_Int:
6735 case X86::VCVTUSI642SSZrrb_Int:
6736 case X86::VCVTUSI642SSZrm_Int:
6737 case X86::VCVTUSI2SDZrr:
6738 case X86::VCVTUSI2SDZrm:
6739 case X86::VCVTUSI2SDZrr_Int:
6740 case X86::VCVTUSI2SDZrm_Int:
6741 case X86::VCVTUSI642SDZrr:
6742 case X86::VCVTUSI642SDZrm:
6743 case X86::VCVTUSI642SDZrr_Int:
6744 case X86::VCVTUSI642SDZrrb_Int:
6745 case X86::VCVTUSI642SDZrm_Int:
6746 case X86::VCVTSI2SHZrr:
6747 case X86::VCVTSI2SHZrm:
6748 case X86::VCVTSI2SHZrr_Int:
6749 case X86::VCVTSI2SHZrrb_Int:
6750 case X86::VCVTSI2SHZrm_Int:
6751 case X86::VCVTSI642SHZrr:
6752 case X86::VCVTSI642SHZrm:
6753 case X86::VCVTSI642SHZrr_Int:
6754 case X86::VCVTSI642SHZrrb_Int:
6755 case X86::VCVTSI642SHZrm_Int:
6756 case X86::VCVTUSI2SHZrr:
6757 case X86::VCVTUSI2SHZrm:
6758 case X86::VCVTUSI2SHZrr_Int:
6759 case X86::VCVTUSI2SHZrrb_Int:
6760 case X86::VCVTUSI2SHZrm_Int:
6761 case X86::VCVTUSI642SHZrr:
6762 case X86::VCVTUSI642SHZrm:
6763 case X86::VCVTUSI642SHZrr_Int:
6764 case X86::VCVTUSI642SHZrrb_Int:
6765 case X86::VCVTUSI642SHZrm_Int:
 6766 // Load folding won't affect the undef register update since the input is
6767 // a GPR.
6768 return OpNum == 1 && !ForLoadFold;
6769 case X86::VCVTSD2SSrr:
6770 case X86::VCVTSD2SSrm:
6771 case X86::VCVTSD2SSrr_Int:
6772 case X86::VCVTSD2SSrm_Int:
6773 case X86::VCVTSS2SDrr:
6774 case X86::VCVTSS2SDrm:
6775 case X86::VCVTSS2SDrr_Int:
6776 case X86::VCVTSS2SDrm_Int:
6777 case X86::VRCPSSr:
6778 case X86::VRCPSSr_Int:
6779 case X86::VRCPSSm:
6780 case X86::VRCPSSm_Int:
6781 case X86::VROUNDSDri:
6782 case X86::VROUNDSDmi:
6783 case X86::VROUNDSDri_Int:
6784 case X86::VROUNDSDmi_Int:
6785 case X86::VROUNDSSri:
6786 case X86::VROUNDSSmi:
6787 case X86::VROUNDSSri_Int:
6788 case X86::VROUNDSSmi_Int:
6789 case X86::VRSQRTSSr:
6790 case X86::VRSQRTSSr_Int:
6791 case X86::VRSQRTSSm:
6792 case X86::VRSQRTSSm_Int:
6793 case X86::VSQRTSSr:
6794 case X86::VSQRTSSr_Int:
6795 case X86::VSQRTSSm:
6796 case X86::VSQRTSSm_Int:
6797 case X86::VSQRTSDr:
6798 case X86::VSQRTSDr_Int:
6799 case X86::VSQRTSDm:
6800 case X86::VSQRTSDm_Int:
6801 // AVX-512
6802 case X86::VCVTSD2SSZrr:
6803 case X86::VCVTSD2SSZrr_Int:
6804 case X86::VCVTSD2SSZrrb_Int:
6805 case X86::VCVTSD2SSZrm:
6806 case X86::VCVTSD2SSZrm_Int:
6807 case X86::VCVTSS2SDZrr:
6808 case X86::VCVTSS2SDZrr_Int:
6809 case X86::VCVTSS2SDZrrb_Int:
6810 case X86::VCVTSS2SDZrm:
6811 case X86::VCVTSS2SDZrm_Int:
6812 case X86::VGETEXPSDZr:
6813 case X86::VGETEXPSDZrb:
6814 case X86::VGETEXPSDZm:
6815 case X86::VGETEXPSSZr:
6816 case X86::VGETEXPSSZrb:
6817 case X86::VGETEXPSSZm:
6818 case X86::VGETMANTSDZrri:
6819 case X86::VGETMANTSDZrrib:
6820 case X86::VGETMANTSDZrmi:
6821 case X86::VGETMANTSSZrri:
6822 case X86::VGETMANTSSZrrib:
6823 case X86::VGETMANTSSZrmi:
6824 case X86::VRNDSCALESDZr:
6825 case X86::VRNDSCALESDZr_Int:
6826 case X86::VRNDSCALESDZrb_Int:
6827 case X86::VRNDSCALESDZm:
6828 case X86::VRNDSCALESDZm_Int:
6829 case X86::VRNDSCALESSZr:
6830 case X86::VRNDSCALESSZr_Int:
6831 case X86::VRNDSCALESSZrb_Int:
6832 case X86::VRNDSCALESSZm:
6833 case X86::VRNDSCALESSZm_Int:
6834 case X86::VRCP14SDZrr:
6835 case X86::VRCP14SDZrm:
6836 case X86::VRCP14SSZrr:
6837 case X86::VRCP14SSZrm:
6838 case X86::VRCPSHZrr:
6839 case X86::VRCPSHZrm:
6840 case X86::VRSQRTSHZrr:
6841 case X86::VRSQRTSHZrm:
6842 case X86::VREDUCESHZrmi:
6843 case X86::VREDUCESHZrri:
6844 case X86::VREDUCESHZrrib:
6845 case X86::VGETEXPSHZr:
6846 case X86::VGETEXPSHZrb:
6847 case X86::VGETEXPSHZm:
6848 case X86::VGETMANTSHZrri:
6849 case X86::VGETMANTSHZrrib:
6850 case X86::VGETMANTSHZrmi:
6851 case X86::VRNDSCALESHZr:
6852 case X86::VRNDSCALESHZr_Int:
6853 case X86::VRNDSCALESHZrb_Int:
6854 case X86::VRNDSCALESHZm:
6855 case X86::VRNDSCALESHZm_Int:
6856 case X86::VSQRTSHZr:
6857 case X86::VSQRTSHZr_Int:
6858 case X86::VSQRTSHZrb_Int:
6859 case X86::VSQRTSHZm:
6860 case X86::VSQRTSHZm_Int:
6861 case X86::VRCP28SDZr:
6862 case X86::VRCP28SDZrb:
6863 case X86::VRCP28SDZm:
6864 case X86::VRCP28SSZr:
6865 case X86::VRCP28SSZrb:
6866 case X86::VRCP28SSZm:
6867 case X86::VREDUCESSZrmi:
6868 case X86::VREDUCESSZrri:
6869 case X86::VREDUCESSZrrib:
6870 case X86::VRSQRT14SDZrr:
6871 case X86::VRSQRT14SDZrm:
6872 case X86::VRSQRT14SSZrr:
6873 case X86::VRSQRT14SSZrm:
6874 case X86::VRSQRT28SDZr:
6875 case X86::VRSQRT28SDZrb:
6876 case X86::VRSQRT28SDZm:
6877 case X86::VRSQRT28SSZr:
6878 case X86::VRSQRT28SSZrb:
6879 case X86::VRSQRT28SSZm:
6880 case X86::VSQRTSSZr:
6881 case X86::VSQRTSSZr_Int:
6882 case X86::VSQRTSSZrb_Int:
6883 case X86::VSQRTSSZm:
6884 case X86::VSQRTSSZm_Int:
6885 case X86::VSQRTSDZr:
6886 case X86::VSQRTSDZr_Int:
6887 case X86::VSQRTSDZrb_Int:
6888 case X86::VSQRTSDZm:
6889 case X86::VSQRTSDZm_Int:
6890 case X86::VCVTSD2SHZrr:
6891 case X86::VCVTSD2SHZrr_Int:
6892 case X86::VCVTSD2SHZrrb_Int:
6893 case X86::VCVTSD2SHZrm:
6894 case X86::VCVTSD2SHZrm_Int:
6895 case X86::VCVTSS2SHZrr:
6896 case X86::VCVTSS2SHZrr_Int:
6897 case X86::VCVTSS2SHZrrb_Int:
6898 case X86::VCVTSS2SHZrm:
6899 case X86::VCVTSS2SHZrm_Int:
6900 case X86::VCVTSH2SDZrr:
6901 case X86::VCVTSH2SDZrr_Int:
6902 case X86::VCVTSH2SDZrrb_Int:
6903 case X86::VCVTSH2SDZrm:
6904 case X86::VCVTSH2SDZrm_Int:
6905 case X86::VCVTSH2SSZrr:
6906 case X86::VCVTSH2SSZrr_Int:
6907 case X86::VCVTSH2SSZrrb_Int:
6908 case X86::VCVTSH2SSZrm:
6909 case X86::VCVTSH2SSZrm_Int:
6910 return OpNum == 1;
6911 case X86::VMOVSSZrrk:
6912 case X86::VMOVSDZrrk:
6913 return OpNum == 3 && !ForLoadFold;
6914 case X86::VMOVSSZrrkz:
6915 case X86::VMOVSDZrrkz:
6916 return OpNum == 2 && !ForLoadFold;
6917 }
6918
6919 return false;
6920}
6921
6922/// Inform the BreakFalseDeps pass how many idle instructions we would like
6923/// before certain undef register reads.
6924///
6925/// This catches the VCVTSI2SD family of instructions:
6926///
6927/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
6928///
6929/// We should be careful *not* to catch VXOR idioms which are presumably
6930/// handled specially in the pipeline:
6931///
6932/// vxorps undef %xmm1, undef %xmm1, %xmm1
6933///
6934/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
6935/// high bits that are passed-through are not live.
6936unsigned
6937X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
6938 const TargetRegisterInfo *TRI) const {
6939 const MachineOperand &MO = MI.getOperand(OpNum);
6940 if (MO.getReg().isPhysical() && hasUndefRegUpdate(MI.getOpcode(), OpNum))
6941 return UndefRegClearance;
6942
6943 return 0;
6944}
6945
6946void X86InstrInfo::breakPartialRegDependency(
6947 MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
6948 Register Reg = MI.getOperand(OpNum).getReg();
6949 // If MI kills this register, the false dependence is already broken.
6950 if (MI.killsRegister(Reg, TRI))
6951 return;
6952
6953 if (X86::VR128RegClass.contains(Reg)) {
6954 // These instructions are all floating point domain, so xorps is the best
6955 // choice.
6956 unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
6957 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
6958 .addReg(Reg, RegState::Undef)
6959 .addReg(Reg, RegState::Undef);
6960 MI.addRegisterKilled(Reg, TRI, true);
6961 } else if (X86::VR256RegClass.contains(Reg)) {
6962 // Use vxorps to clear the full ymm register.
6963 // It wants to read and write the xmm sub-register.
6964 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
6965 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
6966 .addReg(XReg, RegState::Undef)
6967 .addReg(XReg, RegState::Undef)
 6968 .addReg(Reg, RegState::ImplicitDefine);
 6969 MI.addRegisterKilled(Reg, TRI, true);
6970 } else if (X86::VR128XRegClass.contains(Reg)) {
6971 // Only handle VLX targets.
6972 if (!Subtarget.hasVLX())
6973 return;
6974 // Since vxorps requires AVX512DQ, vpxord should be the best choice.
6975 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
6976 .addReg(Reg, RegState::Undef)
6977 .addReg(Reg, RegState::Undef);
6978 MI.addRegisterKilled(Reg, TRI, true);
6979 } else if (X86::VR256XRegClass.contains(Reg) ||
6980 X86::VR512RegClass.contains(Reg)) {
6981 // Only handle VLX targets.
6982 if (!Subtarget.hasVLX())
6983 return;
6984 // Use vpxord to clear the full ymm/zmm register.
6985 // It wants to read and write the xmm sub-register.
6986 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
6987 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
6988 .addReg(XReg, RegState::Undef)
6989 .addReg(XReg, RegState::Undef)
 6990 .addReg(Reg, RegState::ImplicitDefine);
 6991 MI.addRegisterKilled(Reg, TRI, true);
6992 } else if (X86::GR64RegClass.contains(Reg)) {
 6993 // Use XOR32rr because it has a shorter encoding and also zeros the upper
 6994 // bits.
6995 Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
6996 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
6997 .addReg(XReg, RegState::Undef)
6998 .addReg(XReg, RegState::Undef)
 6999 .addReg(Reg, RegState::ImplicitDefine);
 7000 MI.addRegisterKilled(Reg, TRI, true);
7001 } else if (X86::GR32RegClass.contains(Reg)) {
7002 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
7003 .addReg(Reg, RegState::Undef)
7004 .addReg(Reg, RegState::Undef);
7005 MI.addRegisterKilled(Reg, TRI, true);
7006 }
7007}
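// An assembly-level sketch of the dependency break inserted above. If the
// clearance heuristics decide that
//
//   cvtsi2sdq %rax, %xmm0
//
// would stall on a stale value of %xmm0 (the conversion only writes the low
// bits of the destination), the pass inserts a zeroing idiom first:
//
//   xorps %xmm0, %xmm0
//   cvtsi2sdq %rax, %xmm0
//
// The assumption, reflected in the clearance comments earlier in this file,
// is that the xor-with-self idiom is recognized by the hardware and breaks
// the dependence cheaply.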
7008
7009static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
7010 int PtrOffset = 0) {
7011 unsigned NumAddrOps = MOs.size();
7012
7013 if (NumAddrOps < 4) {
 7014 // FrameIndex only - add an immediate offset (whether it's zero or not).
7015 for (unsigned i = 0; i != NumAddrOps; ++i)
7016 MIB.add(MOs[i]);
7017 addOffset(MIB, PtrOffset);
7018 } else {
7019 // General Memory Addressing - we need to add any offset to an existing
7020 // offset.
7021 assert(MOs.size() == 5 && "Unexpected memory operand list length");
7022 for (unsigned i = 0; i != NumAddrOps; ++i) {
7023 const MachineOperand &MO = MOs[i];
7024 if (i == 3 && PtrOffset != 0) {
7025 MIB.addDisp(MO, PtrOffset);
7026 } else {
7027 MIB.add(MO);
7028 }
7029 }
7030 }
7031}
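// Note on the general case above: the five memory operands follow the usual
// X86 addressing layout {base, scale, index, displacement, segment}, so
// PtrOffset is folded into the displacement (operand index 3), which is why
// only i == 3 is special-cased.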
7032
7033static void updateOperandRegConstraints(MachineFunction &MF,
7034 MachineInstr &NewMI,
7035 const TargetInstrInfo &TII) {
 7036 MachineRegisterInfo &MRI = MF.getRegInfo();
 7037 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
7038
7039 for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
7040 MachineOperand &MO = NewMI.getOperand(Idx);
7041 // We only need to update constraints on virtual register operands.
7042 if (!MO.isReg())
7043 continue;
7044 Register Reg = MO.getReg();
7045 if (!Reg.isVirtual())
7046 continue;
7047
7048 auto *NewRC = MRI.constrainRegClass(
7049 Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF));
7050 if (!NewRC) {
7051 LLVM_DEBUG(
7052 dbgs() << "WARNING: Unable to update register constraint for operand "
7053 << Idx << " of instruction:\n";
7054 NewMI.dump(); dbgs() << "\n");
7055 }
7056 }
7057}
7058
7059static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
 7060 ArrayRef<MachineOperand> MOs,
 7061 MachineBasicBlock::iterator InsertPt,
 7062 MachineInstr &MI,
 7063 const TargetInstrInfo &TII) {
7064 // Create the base instruction with the memory operand as the first part.
7065 // Omit the implicit operands, something BuildMI can't do.
7066 MachineInstr *NewMI =
7067 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7068 MachineInstrBuilder MIB(MF, NewMI);
7069 addOperands(MIB, MOs);
7070
7071 // Loop over the rest of the ri operands, converting them over.
7072 unsigned NumOps = MI.getDesc().getNumOperands() - 2;
7073 for (unsigned i = 0; i != NumOps; ++i) {
7074 MachineOperand &MO = MI.getOperand(i + 2);
7075 MIB.add(MO);
7076 }
7077 for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2))
7078 MIB.add(MO);
7079
7080 updateOperandRegConstraints(MF, *NewMI, TII);
7081
7082 MachineBasicBlock *MBB = InsertPt->getParent();
7083 MBB->insert(InsertPt, NewMI);
7084
7085 return MIB;
7086}
7087
7088static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
7089 unsigned OpNo, ArrayRef<MachineOperand> MOs,
 7090 MachineBasicBlock::iterator InsertPt,
 7091 MachineInstr &MI, const TargetInstrInfo &TII,
 7092 int PtrOffset = 0) {
7093 // Omit the implicit operands, something BuildMI can't do.
7094 MachineInstr *NewMI =
7095 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7096 MachineInstrBuilder MIB(MF, NewMI);
7097
7098 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
7099 MachineOperand &MO = MI.getOperand(i);
7100 if (i == OpNo) {
7101 assert(MO.isReg() && "Expected to fold into reg operand!");
7102 addOperands(MIB, MOs, PtrOffset);
7103 } else {
7104 MIB.add(MO);
7105 }
7106 }
7107
7108 updateOperandRegConstraints(MF, *NewMI, TII);
7109
7110 // Copy the NoFPExcept flag from the instruction we're fusing.
 7111 if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept))
 7112 NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept);
 7113
7114 MachineBasicBlock *MBB = InsertPt->getParent();
7115 MBB->insert(InsertPt, NewMI);
7116
7117 return MIB;
7118}
7119
7120static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
 7121 ArrayRef<MachineOperand> MOs,
 7122 MachineBasicBlock::iterator InsertPt,
 7123 MachineInstr &MI) {
7124 MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
7125 MI.getDebugLoc(), TII.get(Opcode));
7126 addOperands(MIB, MOs);
7127 return MIB.addImm(0);
7128}
7129
7130MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
7131 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
 7132 ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
 7133 unsigned Size, Align Alignment) const {
7134 switch (MI.getOpcode()) {
7135 case X86::INSERTPSrr:
7136 case X86::VINSERTPSrr:
7137 case X86::VINSERTPSZrr:
7138 // Attempt to convert the load of inserted vector into a fold load
7139 // of a single float.
7140 if (OpNum == 2) {
7141 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
7142 unsigned ZMask = Imm & 15;
7143 unsigned DstIdx = (Imm >> 4) & 3;
7144 unsigned SrcIdx = (Imm >> 6) & 3;
7145
7147 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7148 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7149 if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
7150 (MI.getOpcode() != X86::INSERTPSrr || Alignment >= Align(4))) {
7151 int PtrOffset = SrcIdx * 4;
7152 unsigned NewImm = (DstIdx << 4) | ZMask;
7153 unsigned NewOpCode =
7154 (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm
7155 : (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm
7156 : X86::INSERTPSrm;
7157 MachineInstr *NewMI =
7158 FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
7159 NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
7160 return NewMI;
7161 }
7162 }
7163 break;
7164 case X86::MOVHLPSrr:
7165 case X86::VMOVHLPSrr:
7166 case X86::VMOVHLPSZrr:
 7167 // Move the upper 64 bits of the second operand to the lower 64 bits.
 7168 // To fold the load, adjust the pointer to the upper half and use (V)MOVLPS.
 7169 // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
7170 if (OpNum == 2) {
 7171 const TargetRegisterInfo &TRI = getRegisterInfo();
 7172 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7173 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7174 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
7175 unsigned NewOpCode =
7176 (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm
7177 : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
7178 : X86::MOVLPSrm;
7179 MachineInstr *NewMI =
7180 FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
7181 return NewMI;
7182 }
7183 }
7184 break;
7185 case X86::UNPCKLPDrr:
7186 // If we won't be able to fold this to the memory form of UNPCKL, use
7187 // MOVHPD instead. Done as custom because we can't have this in the load
7188 // table twice.
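    // Illustrative note (not in the original source): an UNPCKLPDrr whose
    // second operand comes from a 16-byte load with alignment < 16 is folded
    // as MOVHPDrm, which loads the low 64 bits of that memory operand into the
    // upper half of the result.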
7189 if (OpNum == 2) {
7191 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7192 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7193 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
7194 MachineInstr *NewMI =
7195 FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
7196 return NewMI;
7197 }
7198 }
7199 break;
7200 }
7201
7202 return nullptr;
7203}
7204
7205 static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
7206 MachineInstr &MI) {
7207 if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
7208 !MI.getOperand(1).isReg())
7209 return false;
7210
7211 // There are two cases we need to handle depending on where in the pipeline
7212 // the folding attempt is being made.
7213 // -Register has the undef flag set.
7214 // -Register is produced by the IMPLICIT_DEF instruction.
7215
7216 if (MI.getOperand(1).isUndef())
7217 return true;
7218
7219 MachineRegisterInfo &RegInfo = MF.getRegInfo();
7220 MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
7221 return VRegDef && VRegDef->isImplicitDef();
7222}
7223
7224unsigned X86InstrInfo::commuteOperandsForFold(MachineInstr &MI,
7225 unsigned Idx1) const {
7226 unsigned Idx2 = CommuteAnyOperandIndex;
7227 if (!findCommutedOpIndices(MI, Idx1, Idx2))
7228 return Idx1;
7229
7230 bool HasDef = MI.getDesc().getNumDefs();
7231 Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
7232 Register Reg1 = MI.getOperand(Idx1).getReg();
7233 Register Reg2 = MI.getOperand(Idx2).getReg();
7234 bool Tied1 = 0 == MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO);
7235 bool Tied2 = 0 == MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO);
7236
7237 // If either of the commutable operands are tied to the destination
7238 // then we can not commute + fold.
7239 if ((HasDef && Reg0 == Reg1 && Tied1) || (HasDef && Reg0 == Reg2 && Tied2))
7240 return Idx1;
7241
7242 return commuteInstruction(MI, false, Idx1, Idx2) ? Idx2 : Idx1;
7243}
7244
7245static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx) {
7246 if (PrintFailedFusing && !MI.isCopy())
7247 dbgs() << "We failed to fuse operand " << Idx << " in " << MI;
7248}
7249
7250 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7251 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7252 ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
7253 unsigned Size, Align Alignment, bool AllowCommute) const {
7254 bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
7255 bool isTwoAddrFold = false;
7256
7257 // For CPUs that favor the register form of a call or push,
7258 // do not fold loads into calls or pushes, unless optimizing for size
7259 // aggressively.
7260 if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
7261 (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
7262 MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
7263 MI.getOpcode() == X86::PUSH64r))
7264 return nullptr;
7265
7266 // Avoid partial and undef register update stalls unless optimizing for size.
7267 if (!MF.getFunction().hasOptSize() &&
7268 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7269 shouldPreventUndefRegUpdateMemFold(MF, MI)))
7270 return nullptr;
7271
7272 unsigned NumOps = MI.getDesc().getNumOperands();
7273 bool isTwoAddr =
7274 NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
7275
7276 // FIXME: AsmPrinter doesn't know how to handle
7277 // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
7278 if (MI.getOpcode() == X86::ADD32ri &&
7279 MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
7280 return nullptr;
7281
7282 // GOTTPOFF relocation loads can only be folded into add instructions.
7283 // FIXME: Need to exclude other relocations that only support specific
7284 // instructions.
7285 if (MOs.size() == X86::AddrNumOperands &&
7286 MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
7287 MI.getOpcode() != X86::ADD64rr)
7288 return nullptr;
7289
7290 // Don't fold loads into indirect calls that need a KCFI check as we'll
7291 // have to unfold these in X86TargetLowering::EmitKCFICheck anyway.
7292 if (MI.isCall() && MI.getCFIType())
7293 return nullptr;
7294
7295 MachineInstr *NewMI = nullptr;
7296
7297 // Attempt to fold any custom cases we have.
7298 if (MachineInstr *CustomMI = foldMemoryOperandCustom(
7299 MF, MI, OpNum, MOs, InsertPt, Size, Alignment))
7300 return CustomMI;
7301
7302 const X86FoldTableEntry *I = nullptr;
7303
7304 // Folding a memory location into the two-address part of a two-address
7305 // instruction is different than folding it other places. It requires
7306 // replacing the *two* registers with the memory location.
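  // Illustrative note (not in the original source): %eax = ADD32rr %eax(tied),
  // %ecx folded at operand 0 becomes the read-modify-write form
  // ADD32mr <mem>, %ecx, with the memory location standing in for both the
  // tied def and its use.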
7307 if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
7308 MI.getOperand(1).isReg() &&
7309 MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
7310 I = lookupTwoAddrFoldTable(MI.getOpcode());
7311 isTwoAddrFold = true;
7312 } else {
7313 if (OpNum == 0) {
7314 if (MI.getOpcode() == X86::MOV32r0) {
7315 NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
7316 if (NewMI)
7317 return NewMI;
7318 }
7319 }
7320
7321 I = lookupFoldTable(MI.getOpcode(), OpNum);
7322 }
7323
7324 if (I != nullptr) {
7325 unsigned Opcode = I->DstOp;
7326 if (Alignment <
7327 Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
7328 return nullptr;
7329 bool NarrowToMOV32rm = false;
7330 if (Size) {
7332 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7333 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7334 // Check if it's safe to fold the load. If the size of the object is
7335 // narrower than the load width, then it's not.
7336 // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
7337 if ((I->Flags & TB_FOLDED_LOAD) && Size < RCSize) {
7338 // If this is a 64-bit load, but the spill slot is 32, then we can do
7339 // a 32-bit load which is implicitly zero-extended. This likely is
7340 // due to live interval analysis remat'ing a load from stack slot.
7341 if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
7342 return nullptr;
7343 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
7344 return nullptr;
7345 Opcode = X86::MOV32rm;
7346 NarrowToMOV32rm = true;
7347 }
7348 // For stores, make sure the size of the object is equal to the size of
7349 // the store. If the object is larger, the extra bits would be garbage. If
7350 // the object is smaller we might overwrite another object or fault.
7351 if ((I->Flags & TB_FOLDED_STORE) && Size != RCSize)
7352 return nullptr;
7353 }
7354
7355 if (isTwoAddrFold)
7356 NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
7357 else
7358 NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
7359
7360 if (NarrowToMOV32rm) {
7361 // This is the special case where we use a MOV32rm to load a 32-bit
7362 // value and zero-extend the top bits; change the destination register
7363 // to a 32-bit one.
7364 Register DstReg = NewMI->getOperand(0).getReg();
7365 if (DstReg.isPhysical())
7366 NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
7367 else
7368 NewMI->getOperand(0).setSubReg(X86::sub_32bit);
7369 }
7370 return NewMI;
7371 }
7372
7373 if (AllowCommute) {
7374 // If the instruction and target operand are commutable, commute the
7375 // instruction and try again.
7376 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
7377 if (CommuteOpIdx2 == OpNum) {
7378 printFailMsgforFold(MI, OpNum);
7379 return nullptr;
7380 }
7381 // Attempt to fold with the commuted version of the instruction.
7382 NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
7383 Alignment, /*AllowCommute=*/false);
7384 if (NewMI)
7385 return NewMI;
7386 // Folding failed again - undo the commute before returning.
7387 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
7388 }
7389
7390 printFailMsgforFold(MI, OpNum);
7391 return nullptr;
7392}
7393
7394 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7395 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
7396 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
7397 VirtRegMap *VRM) const {
7398 // Check switch flag
7399 if (NoFusing)
7400 return nullptr;
7401
7402 // Avoid partial and undef register update stalls unless optimizing for size.
7403 if (!MF.getFunction().hasOptSize() &&
7404 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7405 shouldPreventUndefRegUpdateMemFold(MF, MI)))
7406 return nullptr;
7407
7408 // Don't fold subreg spills, or reloads that use a high subreg.
7409 for (auto Op : Ops) {
7410 MachineOperand &MO = MI.getOperand(Op);
7411 auto SubReg = MO.getSubReg();
7412 if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
7413 return nullptr;
7414 }
7415
7416 const MachineFrameInfo &MFI = MF.getFrameInfo();
7417 unsigned Size = MFI.getObjectSize(FrameIndex);
7418 Align Alignment = MFI.getObjectAlign(FrameIndex);
7419 // If the function stack isn't realigned we don't want to fold instructions
7420 // that need increased alignment.
7421 if (!RI.hasStackRealignment(MF))
7422 Alignment =
7423 std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
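  // Illustrative note (not in the original source): if the incoming stack is
  // only 16-byte aligned and the spill slot nominally claims 32-byte
  // alignment, the folds below proceed as if just 16 bytes of alignment were
  // guaranteed.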
7424 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
7425 unsigned NewOpc = 0;
7426 unsigned RCSize = 0;
7427 switch (MI.getOpcode()) {
7428 default:
7429 return nullptr;
7430 case X86::TEST8rr:
7431 NewOpc = X86::CMP8ri;
7432 RCSize = 1;
7433 break;
7434 case X86::TEST16rr:
7435 NewOpc = X86::CMP16ri;
7436 RCSize = 2;
7437 break;
7438 case X86::TEST32rr:
7439 NewOpc = X86::CMP32ri;
7440 RCSize = 4;
7441 break;
7442 case X86::TEST64rr:
7443 NewOpc = X86::CMP64ri32;
7444 RCSize = 8;
7445 break;
7446 }
7447 // Check if it's safe to fold the load. If the size of the object is
7448 // narrower than the load width, then it's not.
7449 if (Size < RCSize)
7450 return nullptr;
7451 // Change to CMPXXri r, 0 first.
7452 MI.setDesc(get(NewOpc));
7453 MI.getOperand(1).ChangeToImmediate(0);
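    // Illustrative note (not in the original source): e.g. TEST32rr %eax, %eax
    // becomes CMP32ri %eax, 0 here, and the fold below can then turn it into
    // CMP32mi <fi#N>, 0 against the frame index.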
7454 } else if (Ops.size() != 1)
7455 return nullptr;
7456
7457 return foldMemoryOperandImpl(MF, MI, Ops[0],
7458 MachineOperand::CreateFI(FrameIndex), InsertPt,
7459 Size, Alignment, /*AllowCommute=*/true);
7460}
7461
7462/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
7463/// because the latter uses contents that wouldn't be defined in the folded
7464/// version. For instance, this transformation isn't legal:
7465/// movss (%rdi), %xmm0
7466/// addps %xmm0, %xmm0
7467/// ->
7468/// addps (%rdi), %xmm0
7469///
7470/// But this one is:
7471/// movss (%rdi), %xmm0
7472/// addss %xmm0, %xmm0
7473/// ->
7474/// addss (%rdi), %xmm0
7475///
7476 static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
7477 const MachineInstr &UserMI,
7478 const MachineFunction &MF) {
7479 unsigned Opc = LoadMI.getOpcode();
7480 unsigned UserOpc = UserMI.getOpcode();
7482 const TargetRegisterClass *RC =
7483 MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
7484 unsigned RegSize = TRI.getRegSizeInBits(*RC);
7485
7486 if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
7487 Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
7488 Opc == X86::VMOVSSZrm_alt) &&
7489 RegSize > 32) {
7490 // These instructions only load 32 bits; we can't fold them if the
7491 // destination register is wider than 32 bits (4 bytes) and the user
7492 // instruction isn't scalar (SS).
7493 switch (UserOpc) {
7494 case X86::CVTSS2SDrr_Int:
7495 case X86::VCVTSS2SDrr_Int:
7496 case X86::VCVTSS2SDZrr_Int:
7497 case X86::VCVTSS2SDZrr_Intk:
7498 case X86::VCVTSS2SDZrr_Intkz:
7499 case X86::CVTSS2SIrr_Int:
7500 case X86::CVTSS2SI64rr_Int:
7501 case X86::VCVTSS2SIrr_Int:
7502 case X86::VCVTSS2SI64rr_Int:
7503 case X86::VCVTSS2SIZrr_Int:
7504 case X86::VCVTSS2SI64Zrr_Int:
7505 case X86::CVTTSS2SIrr_Int:
7506 case X86::CVTTSS2SI64rr_Int:
7507 case X86::VCVTTSS2SIrr_Int:
7508 case X86::VCVTTSS2SI64rr_Int:
7509 case X86::VCVTTSS2SIZrr_Int:
7510 case X86::VCVTTSS2SI64Zrr_Int:
7511 case X86::VCVTSS2USIZrr_Int:
7512 case X86::VCVTSS2USI64Zrr_Int:
7513 case X86::VCVTTSS2USIZrr_Int:
7514 case X86::VCVTTSS2USI64Zrr_Int:
7515 case X86::RCPSSr_Int:
7516 case X86::VRCPSSr_Int:
7517 case X86::RSQRTSSr_Int:
7518 case X86::VRSQRTSSr_Int:
7519 case X86::ROUNDSSri_Int:
7520 case X86::VROUNDSSri_Int:
7521 case X86::COMISSrr_Int:
7522 case X86::VCOMISSrr_Int:
7523 case X86::VCOMISSZrr_Int:
7524 case X86::UCOMISSrr_Int:
7525 case X86::VUCOMISSrr_Int:
7526 case X86::VUCOMISSZrr_Int:
7527 case X86::ADDSSrr_Int:
7528 case X86::VADDSSrr_Int:
7529 case X86::VADDSSZrr_Int:
7530 case X86::CMPSSrri_Int:
7531 case X86::VCMPSSrri_Int:
7532 case X86::VCMPSSZrri_Int:
7533 case X86::DIVSSrr_Int:
7534 case X86::VDIVSSrr_Int:
7535 case X86::VDIVSSZrr_Int:
7536 case X86::MAXSSrr_Int:
7537 case X86::VMAXSSrr_Int:
7538 case X86::VMAXSSZrr_Int:
7539 case X86::MINSSrr_Int:
7540 case X86::VMINSSrr_Int:
7541 case X86::VMINSSZrr_Int:
7542 case X86::MULSSrr_Int:
7543 case X86::VMULSSrr_Int:
7544 case X86::VMULSSZrr_Int:
7545 case X86::SQRTSSr_Int:
7546 case X86::VSQRTSSr_Int:
7547 case X86::VSQRTSSZr_Int:
7548 case X86::SUBSSrr_Int:
7549 case X86::VSUBSSrr_Int:
7550 case X86::VSUBSSZrr_Int:
7551 case X86::VADDSSZrr_Intk:
7552 case X86::VADDSSZrr_Intkz:
7553 case X86::VCMPSSZrri_Intk:
7554 case X86::VDIVSSZrr_Intk:
7555 case X86::VDIVSSZrr_Intkz:
7556 case X86::VMAXSSZrr_Intk:
7557 case X86::VMAXSSZrr_Intkz:
7558 case X86::VMINSSZrr_Intk:
7559 case X86::VMINSSZrr_Intkz:
7560 case X86::VMULSSZrr_Intk:
7561 case X86::VMULSSZrr_Intkz:
7562 case X86::VSQRTSSZr_Intk:
7563 case X86::VSQRTSSZr_Intkz:
7564 case X86::VSUBSSZrr_Intk:
7565 case X86::VSUBSSZrr_Intkz:
7566 case X86::VFMADDSS4rr_Int:
7567 case X86::VFNMADDSS4rr_Int:
7568 case X86::VFMSUBSS4rr_Int:
7569 case X86::VFNMSUBSS4rr_Int:
7570 case X86::VFMADD132SSr_Int:
7571 case X86::VFNMADD132SSr_Int:
7572 case X86::VFMADD213SSr_Int:
7573 case X86::VFNMADD213SSr_Int:
7574 case X86::VFMADD231SSr_Int:
7575 case X86::VFNMADD231SSr_Int:
7576 case X86::VFMSUB132SSr_Int:
7577 case X86::VFNMSUB132SSr_Int:
7578 case X86::VFMSUB213SSr_Int:
7579 case X86::VFNMSUB213SSr_Int:
7580 case X86::VFMSUB231SSr_Int:
7581 case X86::VFNMSUB231SSr_Int:
7582 case X86::VFMADD132SSZr_Int:
7583 case X86::VFNMADD132SSZr_Int:
7584 case X86::VFMADD213SSZr_Int:
7585 case X86::VFNMADD213SSZr_Int:
7586 case X86::VFMADD231SSZr_Int:
7587 case X86::VFNMADD231SSZr_Int:
7588 case X86::VFMSUB132SSZr_Int:
7589 case X86::VFNMSUB132SSZr_Int:
7590 case X86::VFMSUB213SSZr_Int:
7591 case X86::VFNMSUB213SSZr_Int:
7592 case X86::VFMSUB231SSZr_Int:
7593 case X86::VFNMSUB231SSZr_Int:
7594 case X86::VFMADD132SSZr_Intk:
7595 case X86::VFNMADD132SSZr_Intk:
7596 case X86::VFMADD213SSZr_Intk:
7597 case X86::VFNMADD213SSZr_Intk:
7598 case X86::VFMADD231SSZr_Intk:
7599 case X86::VFNMADD231SSZr_Intk:
7600 case X86::VFMSUB132SSZr_Intk:
7601 case X86::VFNMSUB132SSZr_Intk:
7602 case X86::VFMSUB213SSZr_Intk:
7603 case X86::VFNMSUB213SSZr_Intk:
7604 case X86::VFMSUB231SSZr_Intk:
7605 case X86::VFNMSUB231SSZr_Intk:
7606 case X86::VFMADD132SSZr_Intkz:
7607 case X86::VFNMADD132SSZr_Intkz:
7608 case X86::VFMADD213SSZr_Intkz:
7609 case X86::VFNMADD213SSZr_Intkz:
7610 case X86::VFMADD231SSZr_Intkz:
7611 case X86::VFNMADD231SSZr_Intkz:
7612 case X86::VFMSUB132SSZr_Intkz:
7613 case X86::VFNMSUB132SSZr_Intkz:
7614 case X86::VFMSUB213SSZr_Intkz:
7615 case X86::VFNMSUB213SSZr_Intkz:
7616 case X86::VFMSUB231SSZr_Intkz:
7617 case X86::VFNMSUB231SSZr_Intkz:
7618 case X86::VFIXUPIMMSSZrri:
7619 case X86::VFIXUPIMMSSZrrik:
7620 case X86::VFIXUPIMMSSZrrikz:
7621 case X86::VFPCLASSSSZrr:
7622 case X86::VFPCLASSSSZrrk:
7623 case X86::VGETEXPSSZr:
7624 case X86::VGETEXPSSZrk:
7625 case X86::VGETEXPSSZrkz:
7626 case X86::VGETMANTSSZrri:
7627 case X86::VGETMANTSSZrrik:
7628 case X86::VGETMANTSSZrrikz:
7629 case X86::VRANGESSZrri:
7630 case X86::VRANGESSZrrik:
7631 case X86::VRANGESSZrrikz:
7632 case X86::VRCP14SSZrr:
7633 case X86::VRCP14SSZrrk:
7634 case X86::VRCP14SSZrrkz:
7635 case X86::VRCP28SSZr:
7636 case X86::VRCP28SSZrk:
7637 case X86::VRCP28SSZrkz:
7638 case X86::VREDUCESSZrri:
7639 case X86::VREDUCESSZrrik:
7640 case X86::VREDUCESSZrrikz:
7641 case X86::VRNDSCALESSZr_Int:
7642 case X86::VRNDSCALESSZr_Intk:
7643 case X86::VRNDSCALESSZr_Intkz:
7644 case X86::VRSQRT14SSZrr:
7645 case X86::VRSQRT14SSZrrk:
7646 case X86::VRSQRT14SSZrrkz:
7647 case X86::VRSQRT28SSZr:
7648 case X86::VRSQRT28SSZrk:
7649 case X86::VRSQRT28SSZrkz:
7650 case X86::VSCALEFSSZrr:
7651 case X86::VSCALEFSSZrrk:
7652 case X86::VSCALEFSSZrrkz:
7653 return false;
7654 default:
7655 return true;
7656 }
7657 }
7658
7659 if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
7660 Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
7661 Opc == X86::VMOVSDZrm_alt) &&
7662 RegSize > 64) {
7663 // These instructions only load 64 bits; we can't fold them if the
7664 // destination register is wider than 64 bits (8 bytes) and the user
7665 // instruction isn't scalar (SD).
7666 switch (UserOpc) {
7667 case X86::CVTSD2SSrr_Int:
7668 case X86::VCVTSD2SSrr_Int:
7669 case X86::VCVTSD2SSZrr_Int:
7670 case X86::VCVTSD2SSZrr_Intk:
7671 case X86::VCVTSD2SSZrr_Intkz:
7672 case X86::CVTSD2SIrr_Int:
7673 case X86::CVTSD2SI64rr_Int:
7674 case X86::VCVTSD2SIrr_Int:
7675 case X86::VCVTSD2SI64rr_Int:
7676 case X86::VCVTSD2SIZrr_Int:
7677 case X86::VCVTSD2SI64Zrr_Int:
7678 case X86::CVTTSD2SIrr_Int:
7679 case X86::CVTTSD2SI64rr_Int:
7680 case X86::VCVTTSD2SIrr_Int:
7681 case X86::VCVTTSD2SI64rr_Int:
7682 case X86::VCVTTSD2SIZrr_Int:
7683 case X86::VCVTTSD2SI64Zrr_Int:
7684 case X86::VCVTSD2USIZrr_Int:
7685 case X86::VCVTSD2USI64Zrr_Int:
7686 case X86::VCVTTSD2USIZrr_Int:
7687 case X86::VCVTTSD2USI64Zrr_Int:
7688 case X86::ROUNDSDri_Int:
7689 case X86::VROUNDSDri_Int:
7690 case X86::COMISDrr_Int:
7691 case X86::VCOMISDrr_Int:
7692 case X86::VCOMISDZrr_Int:
7693 case X86::UCOMISDrr_Int:
7694 case X86::VUCOMISDrr_Int:
7695 case X86::VUCOMISDZrr_Int:
7696 case X86::ADDSDrr_Int:
7697 case X86::VADDSDrr_Int:
7698 case X86::VADDSDZrr_Int:
7699 case X86::CMPSDrri_Int:
7700 case X86::VCMPSDrri_Int:
7701 case X86::VCMPSDZrri_Int:
7702 case X86::DIVSDrr_Int:
7703 case X86::VDIVSDrr_Int:
7704 case X86::VDIVSDZrr_Int:
7705 case X86::MAXSDrr_Int:
7706 case X86::VMAXSDrr_Int:
7707 case X86::VMAXSDZrr_Int:
7708 case X86::MINSDrr_Int:
7709 case X86::VMINSDrr_Int:
7710 case X86::VMINSDZrr_Int:
7711 case X86::MULSDrr_Int:
7712 case X86::VMULSDrr_Int:
7713 case X86::VMULSDZrr_Int:
7714 case X86::SQRTSDr_Int:
7715 case X86::VSQRTSDr_Int:
7716 case X86::VSQRTSDZr_Int:
7717 case X86::SUBSDrr_Int:
7718 case X86::VSUBSDrr_Int:
7719 case X86::VSUBSDZrr_Int:
7720 case X86::VADDSDZrr_Intk:
7721 case X86::VADDSDZrr_Intkz:
7722 case X86::VCMPSDZrri_Intk:
7723 case X86::VDIVSDZrr_Intk:
7724 case X86::VDIVSDZrr_Intkz:
7725 case X86::VMAXSDZrr_Intk:
7726 case X86::VMAXSDZrr_Intkz:
7727 case X86::VMINSDZrr_Intk:
7728 case X86::VMINSDZrr_Intkz:
7729 case X86::VMULSDZrr_Intk:
7730 case X86::VMULSDZrr_Intkz:
7731 case X86::VSQRTSDZr_Intk:
7732 case X86::VSQRTSDZr_Intkz:
7733 case X86::VSUBSDZrr_Intk:
7734 case X86::VSUBSDZrr_Intkz:
7735 case X86::VFMADDSD4rr_Int:
7736 case X86::VFNMADDSD4rr_Int:
7737 case X86::VFMSUBSD4rr_Int:
7738 case X86::VFNMSUBSD4rr_Int:
7739 case X86::VFMADD132SDr_Int:
7740 case X86::VFNMADD132SDr_Int:
7741 case X86::VFMADD213SDr_Int:
7742 case X86::VFNMADD213SDr_Int:
7743 case X86::VFMADD231SDr_Int:
7744 case X86::VFNMADD231SDr_Int:
7745 case X86::VFMSUB132SDr_Int:
7746 case X86::VFNMSUB132SDr_Int:
7747 case X86::VFMSUB213SDr_Int:
7748 case X86::VFNMSUB213SDr_Int:
7749 case X86::VFMSUB231SDr_Int:
7750 case X86::VFNMSUB231SDr_Int:
7751 case X86::VFMADD132SDZr_Int:
7752 case X86::VFNMADD132SDZr_Int:
7753 case X86::VFMADD213SDZr_Int:
7754 case X86::VFNMADD213SDZr_Int:
7755 case X86::VFMADD231SDZr_Int:
7756 case X86::VFNMADD231SDZr_Int:
7757 case X86::VFMSUB132SDZr_Int:
7758 case X86::VFNMSUB132SDZr_Int:
7759 case X86::VFMSUB213SDZr_Int:
7760 case X86::VFNMSUB213SDZr_Int:
7761 case X86::VFMSUB231SDZr_Int:
7762 case X86::VFNMSUB231SDZr_Int:
7763 case X86::VFMADD132SDZr_Intk:
7764 case X86::VFNMADD132SDZr_Intk:
7765 case X86::VFMADD213SDZr_Intk:
7766 case X86::VFNMADD213SDZr_Intk:
7767 case X86::VFMADD231SDZr_Intk:
7768 case X86::VFNMADD231SDZr_Intk:
7769 case X86::VFMSUB132SDZr_Intk:
7770 case X86::VFNMSUB132SDZr_Intk:
7771 case X86::VFMSUB213SDZr_Intk:
7772 case X86::VFNMSUB213SDZr_Intk:
7773 case X86::VFMSUB231SDZr_Intk:
7774 case X86::VFNMSUB231SDZr_Intk:
7775 case X86::VFMADD132SDZr_Intkz:
7776 case X86::VFNMADD132SDZr_Intkz:
7777 case X86::VFMADD213SDZr_Intkz:
7778 case X86::VFNMADD213SDZr_Intkz:
7779 case X86::VFMADD231SDZr_Intkz:
7780 case X86::VFNMADD231SDZr_Intkz:
7781 case X86::VFMSUB132SDZr_Intkz:
7782 case X86::VFNMSUB132SDZr_Intkz:
7783 case X86::VFMSUB213SDZr_Intkz:
7784 case X86::VFNMSUB213SDZr_Intkz:
7785 case X86::VFMSUB231SDZr_Intkz:
7786 case X86::VFNMSUB231SDZr_Intkz:
7787 case X86::VFIXUPIMMSDZrri:
7788 case X86::VFIXUPIMMSDZrrik:
7789 case X86::VFIXUPIMMSDZrrikz:
7790 case X86::VFPCLASSSDZrr:
7791 case X86::VFPCLASSSDZrrk:
7792 case X86::VGETEXPSDZr:
7793 case X86::VGETEXPSDZrk:
7794 case X86::VGETEXPSDZrkz:
7795 case X86::VGETMANTSDZrri:
7796 case X86::VGETMANTSDZrrik:
7797 case X86::VGETMANTSDZrrikz:
7798 case X86::VRANGESDZrri:
7799 case X86::VRANGESDZrrik:
7800 case X86::VRANGESDZrrikz:
7801 case X86::VRCP14SDZrr:
7802 case X86::VRCP14SDZrrk:
7803 case X86::VRCP14SDZrrkz:
7804 case X86::VRCP28SDZr:
7805 case X86::VRCP28SDZrk:
7806 case X86::VRCP28SDZrkz:
7807 case X86::VREDUCESDZrri:
7808 case X86::VREDUCESDZrrik:
7809 case X86::VREDUCESDZrrikz:
7810 case X86::VRNDSCALESDZr_Int:
7811 case X86::VRNDSCALESDZr_Intk:
7812 case X86::VRNDSCALESDZr_Intkz:
7813 case X86::VRSQRT14SDZrr:
7814 case X86::VRSQRT14SDZrrk:
7815 case X86::VRSQRT14SDZrrkz:
7816 case X86::VRSQRT28SDZr:
7817 case X86::VRSQRT28SDZrk:
7818 case X86::VRSQRT28SDZrkz:
7819 case X86::VSCALEFSDZrr:
7820 case X86::VSCALEFSDZrrk:
7821 case X86::VSCALEFSDZrrkz:
7822 return false;
7823 default:
7824 return true;
7825 }
7826 }
7827
7828 if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) {
7829 // These instructions only load 16 bits; we can't fold them if the
7830 // destination register is wider than 16 bits (2 bytes) and the user
7831 // instruction isn't scalar (SH).
7832 switch (UserOpc) {
7833 case X86::VADDSHZrr_Int:
7834 case X86::VCMPSHZrri_Int:
7835 case X86::VDIVSHZrr_Int:
7836 case X86::VMAXSHZrr_Int:
7837 case X86::VMINSHZrr_Int:
7838 case X86::VMULSHZrr_Int:
7839 case X86::VSUBSHZrr_Int:
7840 case X86::VADDSHZrr_Intk:
7841 case X86::VADDSHZrr_Intkz:
7842 case X86::VCMPSHZrri_Intk:
7843 case X86::VDIVSHZrr_Intk:
7844 case X86::VDIVSHZrr_Intkz:
7845 case X86::VMAXSHZrr_Intk:
7846 case X86::VMAXSHZrr_Intkz:
7847 case X86::VMINSHZrr_Intk:
7848 case X86::VMINSHZrr_Intkz:
7849 case X86::VMULSHZrr_Intk:
7850 case X86::VMULSHZrr_Intkz:
7851 case X86::VSUBSHZrr_Intk:
7852 case X86::VSUBSHZrr_Intkz:
7853 case X86::VFMADD132SHZr_Int:
7854 case X86::VFNMADD132SHZr_Int:
7855 case X86::VFMADD213SHZr_Int:
7856 case X86::VFNMADD213SHZr_Int:
7857 case X86::VFMADD231SHZr_Int:
7858 case X86::VFNMADD231SHZr_Int:
7859 case X86::VFMSUB132SHZr_Int:
7860 case X86::VFNMSUB132SHZr_Int:
7861 case X86::VFMSUB213SHZr_Int:
7862 case X86::VFNMSUB213SHZr_Int:
7863 case X86::VFMSUB231SHZr_Int:
7864 case X86::VFNMSUB231SHZr_Int:
7865 case X86::VFMADD132SHZr_Intk:
7866 case X86::VFNMADD132SHZr_Intk:
7867 case X86::VFMADD213SHZr_Intk:
7868 case X86::VFNMADD213SHZr_Intk:
7869 case X86::VFMADD231SHZr_Intk:
7870 case X86::VFNMADD231SHZr_Intk:
7871 case X86::VFMSUB132SHZr_Intk:
7872 case X86::VFNMSUB132SHZr_Intk:
7873 case X86::VFMSUB213SHZr_Intk:
7874 case X86::VFNMSUB213SHZr_Intk:
7875 case X86::VFMSUB231SHZr_Intk:
7876 case X86::VFNMSUB231SHZr_Intk:
7877 case X86::VFMADD132SHZr_Intkz:
7878 case X86::VFNMADD132SHZr_Intkz:
7879 case X86::VFMADD213SHZr_Intkz:
7880 case X86::VFNMADD213SHZr_Intkz:
7881 case X86::VFMADD231SHZr_Intkz:
7882 case X86::VFNMADD231SHZr_Intkz:
7883 case X86::VFMSUB132SHZr_Intkz:
7884 case X86::VFNMSUB132SHZr_Intkz:
7885 case X86::VFMSUB213SHZr_Intkz:
7886 case X86::VFNMSUB213SHZr_Intkz:
7887 case X86::VFMSUB231SHZr_Intkz:
7888 case X86::VFNMSUB231SHZr_Intkz:
7889 return false;
7890 default:
7891 return true;
7892 }
7893 }
7894
7895 return false;
7896}
7897
7898 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7899 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
7900 MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
7901 LiveIntervals *LIS) const {
7902
7903 // TODO: Support the case where LoadMI loads a wide register, but MI
7904 // only uses a subreg.
7905 for (auto Op : Ops) {
7906 if (MI.getOperand(Op).getSubReg())
7907 return nullptr;
7908 }
7909
7910 // If loading from a FrameIndex, fold directly from the FrameIndex.
7911 unsigned NumOps = LoadMI.getDesc().getNumOperands();
7912 int FrameIndex;
7913 if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
7914 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
7915 return nullptr;
7916 return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
7917 }
7918
7919 // Check switch flag
7920 if (NoFusing)
7921 return nullptr;
7922
7923 // Avoid partial and undef register update stalls unless optimizing for size.
7924 if (!MF.getFunction().hasOptSize() &&
7925 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7926 shouldPreventUndefRegUpdateMemFold(MF, MI)))
7927 return nullptr;
7928
7929 // Determine the alignment of the load.
7930 Align Alignment;
7931 unsigned LoadOpc = LoadMI.getOpcode();
7932 if (LoadMI.hasOneMemOperand())
7933 Alignment = (*LoadMI.memoperands_begin())->getAlign();
7934 else
7935 switch (LoadOpc) {
7936 case X86::AVX512_512_SET0:
7937 case X86::AVX512_512_SETALLONES:
7938 Alignment = Align(64);
7939 break;
7940 case X86::AVX2_SETALLONES:
7941 case X86::AVX1_SETALLONES:
7942 case X86::AVX_SET0:
7943 case X86::AVX512_256_SET0:
7944 Alignment = Align(32);
7945 break;
7946 case X86::V_SET0:
7947 case X86::V_SETALLONES:
7948 case X86::AVX512_128_SET0:
7949 case X86::FsFLD0F128:
7950 case X86::AVX512_FsFLD0F128:
7951 Alignment = Align(16);
7952 break;
7953 case X86::MMX_SET0:
7954 case X86::FsFLD0SD:
7955 case X86::AVX512_FsFLD0SD:
7956 Alignment = Align(8);
7957 break;
7958 case X86::FsFLD0SS:
7959 case X86::AVX512_FsFLD0SS:
7960 Alignment = Align(4);
7961 break;
7962 case X86::FsFLD0SH:
7963 case X86::AVX512_FsFLD0SH:
7964 Alignment = Align(2);
7965 break;
7966 default:
7967 return nullptr;
7968 }
7969 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
7970 unsigned NewOpc = 0;
7971 switch (MI.getOpcode()) {
7972 default:
7973 return nullptr;
7974 case X86::TEST8rr:
7975 NewOpc = X86::CMP8ri;
7976 break;
7977 case X86::TEST16rr:
7978 NewOpc = X86::CMP16ri;
7979 break;
7980 case X86::TEST32rr:
7981 NewOpc = X86::CMP32ri;
7982 break;
7983 case X86::TEST64rr:
7984 NewOpc = X86::CMP64ri32;
7985 break;
7986 }
7987 // Change to CMPXXri r, 0 first.
7988 MI.setDesc(get(NewOpc));
7989 MI.getOperand(1).ChangeToImmediate(0);
7990 } else if (Ops.size() != 1)
7991 return nullptr;
7992
7993 // Make sure the subregisters match.
7994 // Otherwise we risk changing the size of the load.
7995 if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
7996 return nullptr;
7997
7999 switch (LoadOpc) {
8000 case X86::MMX_SET0:
8001 case X86::V_SET0:
8002 case X86::V_SETALLONES:
8003 case X86::AVX2_SETALLONES:
8004 case X86::AVX1_SETALLONES:
8005 case X86::AVX_SET0:
8006 case X86::AVX512_128_SET0:
8007 case X86::AVX512_256_SET0:
8008 case X86::AVX512_512_SET0:
8009 case X86::AVX512_512_SETALLONES:
8010 case X86::FsFLD0SH:
8011 case X86::AVX512_FsFLD0SH:
8012 case X86::FsFLD0SD:
8013 case X86::AVX512_FsFLD0SD:
8014 case X86::FsFLD0SS:
8015 case X86::AVX512_FsFLD0SS:
8016 case X86::FsFLD0F128:
8017 case X86::AVX512_FsFLD0F128: {
8018 // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
8019 // Create a constant-pool entry and operands to load from it.
8020
8021 // Large code model can't fold loads this way.
8022 if (MF.getTarget().getCodeModel() == CodeModel::Large)
8023 return nullptr;
8024
8025 // x86-32 PIC requires a PIC base register for constant pools.
8026 unsigned PICBase = 0;
8027 // Since we're using Small or Kernel code model, we can always use
8028 // RIP-relative addressing for a smaller encoding.
8029 if (Subtarget.is64Bit()) {
8030 PICBase = X86::RIP;
8031 } else if (MF.getTarget().isPositionIndependent()) {
8032 // FIXME: PICBase = getGlobalBaseReg(&MF);
8033 // This doesn't work for several reasons.
8034 // 1. GlobalBaseReg may have been spilled.
8035 // 2. It may not be live at MI.
8036 return nullptr;
8037 }
8038
8039 // Create a constant-pool entry.
8041 Type *Ty;
8042 bool IsAllOnes = false;
8043 switch (LoadOpc) {
8044 case X86::FsFLD0SS:
8045 case X86::AVX512_FsFLD0SS:
8047 break;
8048 case X86::FsFLD0SD:
8049 case X86::AVX512_FsFLD0SD:
8051 break;
8052 case X86::FsFLD0F128:
8053 case X86::AVX512_FsFLD0F128:
8055 break;
8056 case X86::FsFLD0SH:
8057 case X86::AVX512_FsFLD0SH:
8059 break;
8060 case X86::AVX512_512_SETALLONES:
8061 IsAllOnes = true;
8062 [[fallthrough]];
8063 case X86::AVX512_512_SET0:
8065 16);
8066 break;
8067 case X86::AVX1_SETALLONES:
8068 case X86::AVX2_SETALLONES:
8069 IsAllOnes = true;
8070 [[fallthrough]];
8071 case X86::AVX512_256_SET0:
8072 case X86::AVX_SET0:
8074 8);
8075
8076 break;
8077 case X86::MMX_SET0:
8079 2);
8080 break;
8081 case X86::V_SETALLONES:
8082 IsAllOnes = true;
8083 [[fallthrough]];
8084 case X86::V_SET0:
8085 case X86::AVX512_128_SET0:
8087 4);
8088 break;
8089 }
8090
8091 const Constant *C =
8093 unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
8094
8095 // Create operands to load from the constant pool entry.
8096 MOs.push_back(MachineOperand::CreateReg(PICBase, false));
8098 MOs.push_back(MachineOperand::CreateReg(0, false));
8100 MOs.push_back(MachineOperand::CreateReg(0, false));
8101 break;
8102 }
8103 case X86::VPBROADCASTBZ128rm:
8104 case X86::VPBROADCASTBZ256rm:
8105 case X86::VPBROADCASTBZrm:
8106 case X86::VBROADCASTF32X2Z256rm:
8107 case X86::VBROADCASTF32X2Zrm:
8108 case X86::VBROADCASTI32X2Z128rm:
8109 case X86::VBROADCASTI32X2Z256rm:
8110 case X86::VBROADCASTI32X2Zrm:
8111 // No instructions currently fuse with 8-bit or 32-bit x 2 broadcasts.
8112 return nullptr;
8113
8114#define FOLD_BROADCAST(SIZE) \
8115 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \
8116 LoadMI.operands_begin() + NumOps); \
8117 return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \
8118 /*AllowCommute=*/true);
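// Illustrative note (not in the original source): FOLD_BROADCAST(32), for
// example, copies the five x86 address operands of a VPBROADCASTDZrm or
// VBROADCASTSSZrm LoadMI into MOs and asks foldMemoryBroadcast to fold MI
// against a 32-bit broadcast element.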
8119 case X86::VPBROADCASTWZ128rm:
8120 case X86::VPBROADCASTWZ256rm:
8121 case X86::VPBROADCASTWZrm:
8122 FOLD_BROADCAST(16);
8123 case X86::VPBROADCASTDZ128rm:
8124 case X86::VPBROADCASTDZ256rm:
8125 case X86::VPBROADCASTDZrm:
8126 case X86::VBROADCASTSSZ128rm:
8127 case X86::VBROADCASTSSZ256rm:
8128 case X86::VBROADCASTSSZrm:
8129 FOLD_BROADCAST(32);
8130 case X86::VPBROADCASTQZ128rm:
8131 case X86::VPBROADCASTQZ256rm:
8132 case X86::VPBROADCASTQZrm:
8133 case X86::VBROADCASTSDZ256rm:
8134 case X86::VBROADCASTSDZrm:
8135 FOLD_BROADCAST(64);
8136 default: {
8137 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8138 return nullptr;
8139
8140 // Folding a normal load. Just copy the load's address operands.
8141 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
8142 LoadMI.operands_begin() + NumOps);
8143 break;
8144 }
8145 }
8146 return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
8147 /*Size=*/0, Alignment, /*AllowCommute=*/true);
8148}
8149
8151X86InstrInfo::foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
8152 unsigned OpNum, ArrayRef<MachineOperand> MOs,
8154 unsigned BitsSize, bool AllowCommute) const {
8155
8156 if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum))
8157 return matchBroadcastSize(*I, BitsSize)
8158 ? FuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
8159 : nullptr;
8160
8161 if (AllowCommute) {
8162 // If the instruction and target operand are commutable, commute the
8163 // instruction and try again.
8164 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
8165 if (CommuteOpIdx2 == OpNum) {
8166 printFailMsgforFold(MI, OpNum);
8167 return nullptr;
8168 }
8169 MachineInstr *NewMI =
8170 foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs, InsertPt, BitsSize,
8171 /*AllowCommute=*/false);
8172 if (NewMI)
8173 return NewMI;
8174 // Folding failed again - undo the commute before returning.
8175 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
8176 }
8177
8178 printFailMsgforFold(MI, OpNum);
8179 return nullptr;
8180}
8181
8185
8186 for (MachineMemOperand *MMO : MMOs) {
8187 if (!MMO->isLoad())
8188 continue;
8189
8190 if (!MMO->isStore()) {
8191 // Reuse the MMO.
8192 LoadMMOs.push_back(MMO);
8193 } else {
8194 // Clone the MMO and unset the store flag.
8195 LoadMMOs.push_back(MF.getMachineMemOperand(
8196 MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
8197 }
8198 }
8199
8200 return LoadMMOs;
8201}
8202
8206
8207 for (MachineMemOperand *MMO : MMOs) {
8208 if (!MMO->isStore())
8209 continue;
8210
8211 if (!MMO->isLoad()) {
8212 // Reuse the MMO.
8213 StoreMMOs.push_back(MMO);
8214 } else {
8215 // Clone the MMO and unset the load flag.
8216 StoreMMOs.push_back(MF.getMachineMemOperand(
8217 MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
8218 }
8219 }
8220
8221 return StoreMMOs;
8222}
8223
8224 static unsigned getBroadcastOpcode(const X86FoldTableEntry *I,
8225 const TargetRegisterClass *RC,
8226 const X86Subtarget &STI) {
8227 assert(STI.hasAVX512() && "Expected at least AVX512!");
8228 unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
8229 assert((SpillSize == 64 || STI.hasVLX()) &&
8230 "Can't broadcast less than 64 bytes without AVX512VL!");
8231
8232#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64) \
8233 case TYPE: \
8234 switch (SpillSize) { \
8235 default: \
8236 llvm_unreachable("Unknown spill size"); \
8237 case 16: \
8238 return X86::OP16; \
8239 case 32: \
8240 return X86::OP32; \
8241 case 64: \
8242 return X86::OP64; \
8243 } \
8244 break;
8245
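// Illustrative note (not in the original source): a fold-table entry tagged
// TB_BCAST_SS with a 32-byte spill size resolves to X86::VBROADCASTSSZ256rm
// via the table below.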
8246 switch (I->Flags & TB_BCAST_MASK) {
8247 default:
8248 llvm_unreachable("Unexpected broadcast type!");
8249 CASE_BCAST_TYPE_OPC(TB_BCAST_W, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8250 VPBROADCASTWZrm)
8251 CASE_BCAST_TYPE_OPC(TB_BCAST_D, VPBROADCASTDZ128rm, VPBROADCASTDZ256rm,
8252 VPBROADCASTDZrm)
8253 CASE_BCAST_TYPE_OPC(TB_BCAST_Q, VPBROADCASTQZ128rm, VPBROADCASTQZ256rm,
8254 VPBROADCASTQZrm)
8255 CASE_BCAST_TYPE_OPC(TB_BCAST_SH, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8256 VPBROADCASTWZrm)
8257 CASE_BCAST_TYPE_OPC(TB_BCAST_SS, VBROADCASTSSZ128rm, VBROADCASTSSZ256rm,
8258 VBROADCASTSSZrm)
8259 CASE_BCAST_TYPE_OPC(TB_BCAST_SD, VMOVDDUPZ128rm, VBROADCASTSDZ256rm,
8260 VBROADCASTSDZrm)
8261 }
8262}
8263
8264 bool X86InstrInfo::unfoldMemoryOperand(
8265 MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
8266 bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
8267 const X86FoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
8268 if (I == nullptr)
8269 return false;
8270 unsigned Opc = I->DstOp;
8271 unsigned Index = I->Flags & TB_INDEX_MASK;
8272 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8273 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8274 if (UnfoldLoad && !FoldedLoad)
8275 return false;
8276 UnfoldLoad &= FoldedLoad;
8277 if (UnfoldStore && !FoldedStore)
8278 return false;
8279 UnfoldStore &= FoldedStore;
8280
8281 const MCInstrDesc &MCID = get(Opc);
8282
8283 const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
8285 // TODO: Check if 32-byte or greater accesses are slow too?
8286 if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
8287 Subtarget.isUnalignedMem16Slow())
8288 // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
8289 // conservatively assume the address is unaligned. That's bad for
8290 // performance.
8291 return false;
8296 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
8297 MachineOperand &Op = MI.getOperand(i);
8298 if (i >= Index && i < Index + X86::AddrNumOperands)
8299 AddrOps.push_back(Op);
8300 else if (Op.isReg() && Op.isImplicit())
8301 ImpOps.push_back(Op);
8302 else if (i < Index)
8303 BeforeOps.push_back(Op);
8304 else if (i > Index)
8305 AfterOps.push_back(Op);
8306 }
8307
8308 // Emit the load or broadcast instruction.
8309 if (UnfoldLoad) {
8310 auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
8311
8312 unsigned Opc;
8313 if (I->Flags & TB_BCAST_MASK) {
8314 Opc = getBroadcastOpcode(I, RC, Subtarget);
8315 } else {
8316 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8317 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8318 Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
8319 }
8320
8321 DebugLoc DL;
8322 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
8323 for (const MachineOperand &AddrOp : AddrOps)
8324 MIB.add(AddrOp);
8325 MIB.setMemRefs(MMOs);
8326 NewMIs.push_back(MIB);
8327
8328 if (UnfoldStore) {
8329 // Address operands cannot be marked isKill.
8330 for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
8331 MachineOperand &MO = NewMIs[0]->getOperand(i);
8332 if (MO.isReg())
8333 MO.setIsKill(false);
8334 }
8335 }
8336 }
8337
8338 // Emit the data processing instruction.
8339 MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
8340 MachineInstrBuilder MIB(MF, DataMI);
8341
8342 if (FoldedStore)
8343 MIB.addReg(Reg, RegState::Define);
8344 for (MachineOperand &BeforeOp : BeforeOps)
8345 MIB.add(BeforeOp);
8346 if (FoldedLoad)
8347 MIB.addReg(Reg);
8348 for (MachineOperand &AfterOp : AfterOps)
8349 MIB.add(AfterOp);
8350 for (MachineOperand &ImpOp : ImpOps) {
8351 MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) |
8352 RegState::Implicit |
8353 getKillRegState(ImpOp.isKill()) |
8354 getDeadRegState(ImpOp.isDead()) |
8355 getUndefRegState(ImpOp.isUndef()));
8356 }
8357 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8358 switch (DataMI->getOpcode()) {
8359 default:
8360 break;
8361 case X86::CMP64ri32:
8362 case X86::CMP32ri:
8363 case X86::CMP16ri:
8364 case X86::CMP8ri: {
8365 MachineOperand &MO0 = DataMI->getOperand(0);
8366 MachineOperand &MO1 = DataMI->getOperand(1);
8367 if (MO1.isImm() && MO1.getImm() == 0) {
8368 unsigned NewOpc;
8369 switch (DataMI->getOpcode()) {
8370 default:
8371 llvm_unreachable("Unreachable!");
8372 case X86::CMP64ri32:
8373 NewOpc = X86::TEST64rr;
8374 break;
8375 case X86::CMP32ri:
8376 NewOpc = X86::TEST32rr;
8377 break;
8378 case X86::CMP16ri:
8379 NewOpc = X86::TEST16rr;
8380 break;
8381 case X86::CMP8ri:
8382 NewOpc = X86::TEST8rr;
8383 break;
8384 }
8385 DataMI->setDesc(get(NewOpc));
8386 MO1.ChangeToRegister(MO0.getReg(), false);
8387 }
8388 }
8389 }
8390 NewMIs.push_back(DataMI);
8391
8392 // Emit the store instruction.
8393 if (UnfoldStore) {
8394 const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
8395 auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
8396 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
8397 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8398 unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
8399 DebugLoc DL;
8400 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
8401 for (const MachineOperand &AddrOp : AddrOps)
8402 MIB.add(AddrOp);
8403 MIB.addReg(Reg, RegState::Kill);
8404 MIB.setMemRefs(MMOs);
8405 NewMIs.push_back(MIB);
8406 }
8407
8408 return true;
8409}
8410
8411 bool X86InstrInfo::unfoldMemoryOperand(
8412 SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const {
8413 if (!N->isMachineOpcode())
8414 return false;
8415
8416 const X86FoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
8417 if (I == nullptr)
8418 return false;
8419 unsigned Opc = I->DstOp;
8420 unsigned Index = I->Flags & TB_INDEX_MASK;
8421 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8422 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8423 const MCInstrDesc &MCID = get(Opc);
8426 const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
8427 unsigned NumDefs = MCID.NumDefs;
8428 std::vector<SDValue> AddrOps;
8429 std::vector<SDValue> BeforeOps;
8430 std::vector<SDValue> AfterOps;
8431 SDLoc dl(N);
8432 unsigned NumOps = N->getNumOperands();
8433 for (unsigned i = 0; i != NumOps - 1; ++i) {
8434 SDValue Op = N->getOperand(i);
8435 if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands)
8436 AddrOps.push_back(Op);
8437 else if (i < Index - NumDefs)
8438 BeforeOps.push_back(Op);
8439 else if (i > Index - NumDefs)
8440 AfterOps.push_back(Op);
8441 }
8442 SDValue Chain = N->getOperand(NumOps - 1);
8443 AddrOps.push_back(Chain);
8444
8445 // Emit the load instruction.
8446 SDNode *Load = nullptr;
8447 if (FoldedLoad) {
8448 EVT VT = *TRI.legalclasstypes_begin(*RC);
8449 auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8450 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8451 Subtarget.isUnalignedMem16Slow())
8452 // Do not introduce a slow unaligned load.
8453 return false;
8454 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8455 // memory access is slow above.
8456
8457 unsigned Opc;
8458 if (I->Flags & TB_BCAST_MASK) {
8459 Opc = getBroadcastOpcode(I, RC, Subtarget);
8460 } else {
8461 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8462 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8463 Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
8464 }
8465
8466 Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
8467 NewNodes.push_back(Load);
8468
8469 // Preserve memory reference information.
8470 DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs);
8471 }
8472
8473 // Emit the data processing instruction.
8474 std::vector<EVT> VTs;
8475 const TargetRegisterClass *DstRC = nullptr;
8476 if (MCID.getNumDefs() > 0) {
8477 DstRC = getRegClass(MCID, 0, &RI, MF);
8478 VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
8479 }
8480 for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
8481 EVT VT = N->getValueType(i);
8482 if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
8483 VTs.push_back(VT);
8484 }
8485 if (Load)
8486 BeforeOps.push_back(SDValue(Load, 0));
8487 llvm::append_range(BeforeOps, AfterOps);
8488 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8489 switch (Opc) {
8490 default:
8491 break;
8492 case X86::CMP64ri32:
8493 case X86::CMP32ri:
8494 case X86::CMP16ri:
8495 case X86::CMP8ri:
8496 if (isNullConstant(BeforeOps[1])) {
8497 switch (Opc) {
8498 default:
8499 llvm_unreachable("Unreachable!");
8500 case X86::CMP64ri32:
8501 Opc = X86::TEST64rr;
8502 break;
8503 case X86::CMP32ri:
8504 Opc = X86::TEST32rr;
8505 break;
8506 case X86::CMP16ri:
8507 Opc = X86::TEST16rr;
8508 break;
8509 case X86::CMP8ri:
8510 Opc = X86::TEST8rr;
8511 break;
8512 }
8513 BeforeOps[1] = BeforeOps[0];
8514 }
8515 }
8516 SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
8517 NewNodes.push_back(NewNode);
8518
8519 // Emit the store instruction.
8520 if (FoldedStore) {
8521 AddrOps.pop_back();
8522 AddrOps.push_back(SDValue(NewNode, 0));
8523 AddrOps.push_back(Chain);
8524 auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8525 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8526 Subtarget.isUnalignedMem16Slow())
8527 // Do not introduce a slow unaligned store.
8528 return false;
8529 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8530 // memory access is slow above.
8531 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8532 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8533 SDNode *Store =
8534 DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
8535 dl, MVT::Other, AddrOps);
8536 NewNodes.push_back(Store);
8537
8538 // Preserve memory reference information.
8539 DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs);
8540 }
8541
8542 return true;
8543}
8544
8545unsigned
8546X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad,
8547 bool UnfoldStore,
8548 unsigned *LoadRegIndex) const {
8549 const X86FoldTableEntry *I = lookupUnfoldTable(Opc);
8550 if (I == nullptr)
8551 return 0;
8552 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8553 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8554 if (UnfoldLoad && !FoldedLoad)
8555 return 0;
8556 if (UnfoldStore && !FoldedStore)
8557 return 0;
8558 if (LoadRegIndex)
8559 *LoadRegIndex = I->Flags & TB_INDEX_MASK;
8560 return I->DstOp;
8561}
8562
8564 int64_t &Offset1,
8565 int64_t &Offset2) const {
8566 if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
8567 return false;
8568
8569 auto IsLoadOpcode = [&](unsigned Opcode) {
8570 switch (Opcode) {
8571 default:
8572 return false;
8573 case X86::MOV8rm:
8574 case X86::MOV16rm:
8575 case X86::MOV32rm:
8576 case X86::MOV64rm:
8577 case X86::LD_Fp32m:
8578 case X86::LD_Fp64m:
8579 case X86::LD_Fp80m:
8580 case X86::MOVSSrm:
8581 case X86::MOVSSrm_alt:
8582 case X86::MOVSDrm:
8583 case X86::MOVSDrm_alt:
8584 case X86::MMX_MOVD64rm:
8585 case X86::MMX_MOVQ64rm:
8586 case X86::MOVAPSrm:
8587 case X86::MOVUPSrm:
8588 case X86::MOVAPDrm:
8589 case X86::MOVUPDrm:
8590 case X86::MOVDQArm:
8591 case X86::MOVDQUrm:
8592 // AVX load instructions
8593 case X86::VMOVSSrm:
8594 case X86::VMOVSSrm_alt:
8595 case X86::VMOVSDrm:
8596 case X86::VMOVSDrm_alt:
8597 case X86::VMOVAPSrm:
8598 case X86::VMOVUPSrm:
8599 case X86::VMOVAPDrm:
8600 case X86::VMOVUPDrm:
8601 case X86::VMOVDQArm:
8602 case X86::VMOVDQUrm:
8603 case X86::VMOVAPSYrm:
8604 case X86::VMOVUPSYrm:
8605 case X86::VMOVAPDYrm:
8606 case X86::VMOVUPDYrm:
8607 case X86::VMOVDQAYrm:
8608 case X86::VMOVDQUYrm:
8609 // AVX512 load instructions
8610 case X86::VMOVSSZrm:
8611 case X86::VMOVSSZrm_alt:
8612 case X86::VMOVSDZrm:
8613 case X86::VMOVSDZrm_alt:
8614 case X86::VMOVAPSZ128rm:
8615 case X86::VMOVUPSZ128rm:
8616 case X86::VMOVAPSZ128rm_NOVLX:
8617 case X86::VMOVUPSZ128rm_NOVLX:
8618 case X86::VMOVAPDZ128rm:
8619 case X86::VMOVUPDZ128rm:
8620 case X86::VMOVDQU8Z128rm:
8621 case X86::VMOVDQU16Z128rm:
8622 case X86::VMOVDQA32Z128rm:
8623 case X86::VMOVDQU32Z128rm:
8624 case X86::VMOVDQA64Z128rm:
8625 case X86::VMOVDQU64Z128rm:
8626 case X86::VMOVAPSZ256rm:
8627 case X86::VMOVUPSZ256rm:
8628 case X86::VMOVAPSZ256rm_NOVLX:
8629 case X86::VMOVUPSZ256rm_NOVLX:
8630 case X86::VMOVAPDZ256rm:
8631 case X86::VMOVUPDZ256rm:
8632 case X86::VMOVDQU8Z256rm:
8633 case X86::VMOVDQU16Z256rm:
8634 case X86::VMOVDQA32Z256rm:
8635 case X86::VMOVDQU32Z256rm:
8636 case X86::VMOVDQA64Z256rm:
8637 case X86::VMOVDQU64Z256rm:
8638 case X86::VMOVAPSZrm:
8639 case X86::VMOVUPSZrm:
8640 case X86::VMOVAPDZrm:
8641 case X86::VMOVUPDZrm:
8642 case X86::VMOVDQU8Zrm:
8643 case X86::VMOVDQU16Zrm:
8644 case X86::VMOVDQA32Zrm:
8645 case X86::VMOVDQU32Zrm:
8646 case X86::VMOVDQA64Zrm:
8647 case X86::VMOVDQU64Zrm:
8648 case X86::KMOVBkm:
8649 case X86::KMOVBkm_EVEX:
8650 case X86::KMOVWkm:
8651 case X86::KMOVWkm_EVEX:
8652 case X86::KMOVDkm:
8653 case X86::KMOVDkm_EVEX:
8654 case X86::KMOVQkm:
8655 case X86::KMOVQkm_EVEX:
8656 return true;
8657 }
8658 };
8659
8660 if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
8661 !IsLoadOpcode(Load2->getMachineOpcode()))
8662 return false;
8663
8664 // Lambda to check if both the loads have the same value for an operand index.
8665 auto HasSameOp = [&](int I) {
8666 return Load1->getOperand(I) == Load2->getOperand(I);
8667 };
8668
8669 // All operands except the displacement should match.
8670 if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
8671 !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
8672 return false;
8673
8674 // Chain Operand must be the same.
8675 if (!HasSameOp(5))
8676 return false;
8677
8678 // Now let's examine if the displacements are constants.
8679 auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp));
8680 auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp));
8681 if (!Disp1 || !Disp2)
8682 return false;
8683
8684 Offset1 = Disp1->getSExtValue();
8685 Offset2 = Disp2->getSExtValue();
8686 return true;
8687}
8688
8690 int64_t Offset1, int64_t Offset2,
8691 unsigned NumLoads) const {
8692 assert(Offset2 > Offset1);
8693 if ((Offset2 - Offset1) / 8 > 64)
8694 return false;
8695
8696 unsigned Opc1 = Load1->getMachineOpcode();
8697 unsigned Opc2 = Load2->getMachineOpcode();
8698 if (Opc1 != Opc2)
8699 return false; // FIXME: overly conservative?
8700
8701 switch (Opc1) {
8702 default:
8703 break;
8704 case X86::LD_Fp32m:
8705 case X86::LD_Fp64m:
8706 case X86::LD_Fp80m:
8707 case X86::MMX_MOVD64rm:
8708 case X86::MMX_MOVQ64rm:
8709 return false;
8710 }
8711
8712 EVT VT = Load1->getValueType(0);
8713 switch (VT.getSimpleVT().SimpleTy) {
8714 default:
8715 // XMM registers. In 64-bit mode we can be a bit more aggressive since we
8716 // have 16 of them to play with.
8717 if (Subtarget.is64Bit()) {
8718 if (NumLoads >= 3)
8719 return false;
8720 } else if (NumLoads) {
8721 return false;
8722 }
8723 break;
8724 case MVT::i8:
8725 case MVT::i16:
8726 case MVT::i32:
8727 case MVT::i64:
8728 case MVT::f32:
8729 case MVT::f64:
8730 if (NumLoads)
8731 return false;
8732 break;
8733 }
8734
8735 return true;
8736}
8737
8738 bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
8739 const MachineBasicBlock *MBB,
8740 const MachineFunction &MF) const {
8741
8742 // ENDBR instructions should not be scheduled around.
8743 unsigned Opcode = MI.getOpcode();
8744 if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
8745 Opcode == X86::PLDTILECFGV)
8746 return true;
8747
8747
8748 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
8749}
8750
8751 bool X86InstrInfo::reverseBranchCondition(
8752 SmallVectorImpl<MachineOperand> &Cond) const {
8753 assert(Cond.size() == 1 && "Invalid X86 branch condition!");
8754 X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
8755 Cond[0].setImm(GetOppositeBranchCondition(CC));
8756 return false;
8757}
8758
8759 bool X86InstrInfo::isSafeToMoveRegClassDefs(
8760 const TargetRegisterClass *RC) const {
8761 // FIXME: Return false for x87 stack register classes for now. We can't
8762 // allow any loads of these registers before FpGet_ST0_80.
8763 return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
8764 RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
8765 RC == &X86::RFP80RegClass);
8766}
8767
8768/// Return a virtual register initialized with the global base register
8769/// value. Output instructions required to initialize the register in the
8770/// function entry block, if necessary.
8771///
8772/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
8773///
8774 unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
8775 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
8776 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
8777 if (GlobalBaseReg != 0)
8778 return GlobalBaseReg;
8779
8780 // Create the register. The code to initialize it is inserted
8781 // later, by the CGBR pass (below).
8782 MachineRegisterInfo &RegInfo = MF->getRegInfo();
8783 GlobalBaseReg = RegInfo.createVirtualRegister(
8784 Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
8785 X86FI->setGlobalBaseReg(GlobalBaseReg);
8786 return GlobalBaseReg;
8787}
8788
8789// FIXME: Some shuffle and unpack instructions have equivalents in different
8790// domains, but they require a bit more work than just switching opcodes.
8791
8792static const uint16_t *lookup(unsigned opcode, unsigned domain,
8793 ArrayRef<uint16_t[3]> Table) {
8794 for (const uint16_t(&Row)[3] : Table)
8795 if (Row[domain - 1] == opcode)
8796 return Row;
8797 return nullptr;
8798}
8799
8800static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
8801 ArrayRef<uint16_t[4]> Table) {
8802 // If this is the integer domain make sure to check both integer columns.
8803 for (const uint16_t(&Row)[4] : Table)
8804 if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
8805 return Row;
8806 return nullptr;
8807}
8808
8809// Helper to attempt to widen/narrow blend masks.
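// Worked example (annotation, not in the original source): widening mask
// 0b0101 from width 4 to width 8 yields 0b00110011; narrowing 0b1100 from
// width 4 to width 2 yields 0b10; narrowing 0b0110 fails (returns false)
// because one sub-group is only partially set.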
8810static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
8811 unsigned NewWidth, unsigned *pNewMask = nullptr) {
8812 assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
8813 "Illegal blend mask scale");
8814 unsigned NewMask = 0;
8815
8816 if ((OldWidth % NewWidth) == 0) {
8817 unsigned Scale = OldWidth / NewWidth;
8818 unsigned SubMask = (1u << Scale) - 1;
8819 for (unsigned i = 0; i != NewWidth; ++i) {
8820 unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
8821 if (Sub == SubMask)
8822 NewMask |= (1u << i);
8823 else if (Sub != 0x0)
8824 return false;
8825 }
8826 } else {
8827 unsigned Scale = NewWidth / OldWidth;
8828 unsigned SubMask = (1u << Scale) - 1;
8829 for (unsigned i = 0; i != OldWidth; ++i) {
8830 if (OldMask & (1 << i)) {
8831 NewMask |= (SubMask << (i * Scale));
8832 }
8833 }
8834 }
8835
8836 if (pNewMask)
8837 *pNewMask = NewMask;
8838 return true;
8839}
8840
8841 uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
8842 unsigned Opcode = MI.getOpcode();
8843 unsigned NumOperands = MI.getDesc().getNumOperands();
8844
8845 auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
8846 uint16_t validDomains = 0;
8847 if (MI.getOperand(NumOperands - 1).isImm()) {
8848 unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
8849 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
8850 validDomains |= 0x2; // PackedSingle
8851 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
8852 validDomains |= 0x4; // PackedDouble
8853 if (!Is256 || Subtarget.hasAVX2())
8854 validDomains |= 0x8; // PackedInt
8855 }
8856 return validDomains;
8857 };
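  // Illustrative note (not in the original source): for BLENDPDrri with
  // Imm = 0b11, the 2-lane mask widens to a 4-lane PS mask (0b1111) and is
  // also representable as an integer blend, so the lambda reports
  // 0x2 | 0x4 | 0x8 = 0xe as the valid domains.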
8858
8859 switch (Opcode) {
8860 case X86::BLENDPDrmi:
8861 case X86::BLENDPDrri:
8862 case X86::VBLENDPDrmi:
8863 case X86::VBLENDPDrri:
8864 return GetBlendDomains(2, false);
8865 case X86::VBLENDPDYrmi:
8866 case X86::VBLENDPDYrri:
8867 return GetBlendDomains(4, true);
8868 case X86::BLENDPSrmi:
8869 case X86::BLENDPSrri:
8870 case X86::VBLENDPSrmi:
8871 case X86::VBLENDPSrri:
8872 case X86::VPBLENDDrmi:
8873 case X86::VPBLENDDrri:
8874 return GetBlendDomains(4, false);
8875 case X86::VBLENDPSYrmi:
8876 case X86::VBLENDPSYrri:
8877 case X86::VPBLENDDYrmi:
8878 case X86::VPBLENDDYrri:
8879 return GetBlendDomains(8, true);
8880 case X86::PBLENDWrmi:
8881 case X86::PBLENDWrri:
8882 case X86::VPBLENDWrmi:
8883 case X86::VPBLENDWrri:
8884 // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
8885 case X86::VPBLENDWYrmi:
8886 case X86::VPBLENDWYrri:
8887 return GetBlendDomains(8, false);
8888 case X86::VPANDDZ128rr:
8889 case X86::VPANDDZ128rm:
8890 case X86::VPANDDZ256rr:
8891 case X86::VPANDDZ256rm:
8892 case X86::VPANDQZ128rr:
8893 case X86::VPANDQZ128rm:
8894 case X86::VPANDQZ256rr:
8895 case X86::VPANDQZ256rm:
8896 case X86::VPANDNDZ128rr:
8897 case X86::VPANDNDZ128rm:
8898 case X86::VPANDNDZ256rr:
8899 case X86::VPANDNDZ256rm:
8900 case X86::VPANDNQZ128rr:
8901 case X86::VPANDNQZ128rm:
8902 case X86::VPANDNQZ256rr:
8903 case X86::VPANDNQZ256rm:
8904 case X86::VPORDZ128rr:
8905 case X86::VPORDZ128rm:
8906 case X86::VPORDZ256rr:
8907 case X86::VPORDZ256rm:
8908 case X86::VPORQZ128rr:
8909 case X86::VPORQZ128rm:
8910 case X86::VPORQZ256rr:
8911 case X86::VPORQZ256rm:
8912 case X86::VPXORDZ128rr:
8913 case X86::VPXORDZ128rm:
8914 case X86::VPXORDZ256rr:
8915 case X86::VPXORDZ256rm:
8916 case X86::VPXORQZ128rr:
8917 case X86::VPXORQZ128rm:
8918 case X86::VPXORQZ256rr:
8919 case X86::VPXORQZ256rm:
8920 // If we don't have DQI see if we can still switch from an EVEX integer
8921 // instruction to a VEX floating point instruction.
8922 if (Subtarget.hasDQI())
8923 return 0;
8924
8925 if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
8926 return 0;
8927 if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
8928 return 0;
8929 // Register forms will have 3 operands. Memory form will have more.
8930 if (NumOperands == 3 &&
8931 RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
8932 return 0;
8933
8934 // All domains are valid.
8935 return 0xe;
8936 case X86::MOVHLPSrr:
8937 // We can swap domains when both inputs are the same register.
8938 // FIXME: This doesn't catch all the cases we would like. If the input
8939 // register isn't KILLed by the instruction, the two address instruction
8940 // pass puts a COPY on one input. The other input uses the original
8941 // register. This prevents the same physical register from being used by
8942 // both inputs.
8943 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
8944 MI.getOperand(0).getSubReg() == 0 &&
8945 MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0)
8946 return 0x6;
8947 return 0;
8948 case X86::SHUFPDrri:
8949 return 0x6;
8950 }
8951 return 0;
8952}
8953
8954#include "X86ReplaceableInstrs.def"
8955
8957 unsigned Domain) const {
8958 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
8959 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
8960 assert(dom && "Not an SSE instruction");
8961
8962 unsigned Opcode = MI.getOpcode();
8963 unsigned NumOperands = MI.getDesc().getNumOperands();
8964
8965 auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
8966 if (MI.getOperand(NumOperands - 1).isImm()) {
8967 unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
8968 Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
8969 unsigned NewImm = Imm;
8970
8971 const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
8972 if (!table)
8973 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
8974
8975 if (Domain == 1) { // PackedSingle
8976 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
8977 } else if (Domain == 2) { // PackedDouble
8978 AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
8979 } else if (Domain == 3) { // PackedInt
8980 if (Subtarget.hasAVX2()) {
8981 // If we are already VPBLENDW use that, else use VPBLENDD.
8982 if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
8983 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
8984 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
8985 }
8986 } else {
8987 assert(!Is256 && "128-bit vector expected");
8988 AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
8989 }
8990 }
8991
8992 assert(table && table[Domain - 1] && "Unknown domain op");
8993 MI.setDesc(get(table[Domain - 1]));
8994 MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
8995 }
8996 return true;
8997 };
8998
8999 switch (Opcode) {
9000 case X86::BLENDPDrmi:
9001 case X86::BLENDPDrri:
9002 case X86::VBLENDPDrmi:
9003 case X86::VBLENDPDrri:
9004 return SetBlendDomain(2, false);
9005 case X86::VBLENDPDYrmi:
9006 case X86::VBLENDPDYrri:
9007 return SetBlendDomain(4, true);
9008 case X86::BLENDPSrmi:
9009 case X86::BLENDPSrri:
9010 case X86::VBLENDPSrmi:
9011 case X86::VBLENDPSrri:
9012 case X86::VPBLENDDrmi:
9013 case X86::VPBLENDDrri:
9014 return SetBlendDomain(4, false);
9015 case X86::VBLENDPSYrmi:
9016 case X86::VBLENDPSYrri:
9017 case X86::VPBLENDDYrmi:
9018 case X86::VPBLENDDYrri:
9019 return SetBlendDomain(8, true);
9020 case X86::PBLENDWrmi:
9021 case X86::PBLENDWrri:
9022 case X86::VPBLENDWrmi:
9023 case X86::VPBLENDWrri:
9024 return SetBlendDomain(8, false);
9025 case X86::VPBLENDWYrmi:
9026 case X86::VPBLENDWYrri:
9027 return SetBlendDomain(16, true);
9028 case X86::VPANDDZ128rr:
9029 case X86::VPANDDZ128rm:
9030 case X86::VPANDDZ256rr:
9031 case X86::VPANDDZ256rm:
9032 case X86::VPANDQZ128rr:
9033 case X86::VPANDQZ128rm:
9034 case X86::VPANDQZ256rr:
9035 case X86::VPANDQZ256rm:
9036 case X86::VPANDNDZ128rr:
9037 case X86::VPANDNDZ128rm:
9038 case X86::VPANDNDZ256rr:
9039 case X86::VPANDNDZ256rm:
9040 case X86::VPANDNQZ128rr:
9041 case X86::VPANDNQZ128rm:
9042 case X86::VPANDNQZ256rr:
9043 case X86::VPANDNQZ256rm:
9044 case X86::VPORDZ128rr:
9045 case X86::VPORDZ128rm:
9046 case X86::VPORDZ256rr:
9047 case X86::VPORDZ256rm:
9048 case X86::VPORQZ128rr:
9049 case X86::VPORQZ128rm:
9050 case X86::VPORQZ256rr:
9051 case X86::VPORQZ256rm:
9052 case X86::VPXORDZ128rr:
9053 case X86::VPXORDZ128rm:
9054 case X86::VPXORDZ256rr:
9055 case X86::VPXORDZ256rm:
9056 case X86::VPXORQZ128rr:
9057 case X86::VPXORQZ128rm:
9058 case X86::VPXORQZ256rr:
9059 case X86::VPXORQZ256rm: {
9060 // Without DQI, convert EVEX instructions to VEX instructions.
9061 if (Subtarget.hasDQI())
9062 return false;
9063
9064 const uint16_t *table =
9065 lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs);
9066 assert(table && "Instruction not found in table?");
9067 // Don't change integer Q instructions to D instructions and
9068 // use D instructions if we started with a PS instruction.
9069 if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9070 Domain = 4;
9071 MI.setDesc(get(table[Domain - 1]));
9072 return true;
9073 }
9074 case X86::UNPCKHPDrr:
9075 case X86::MOVHLPSrr:
9076 // We just need to commute the instruction which will switch the domains.
9077 if (Domain != dom && Domain != 3 &&
9078 MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9079 MI.getOperand(0).getSubReg() == 0 &&
9080 MI.getOperand(1).getSubReg() == 0 &&
9081 MI.getOperand(2).getSubReg() == 0) {
9082 commuteInstruction(MI, false);
9083 return true;
9084 }
9085 // We must always return true for MOVHLPSrr.
9086 if (Opcode == X86::MOVHLPSrr)
9087 return true;
9088 break;
9089 case X86::SHUFPDrri: {
9090 if (Domain == 1) {
9091 unsigned Imm = MI.getOperand(3).getImm();
9092 unsigned NewImm = 0x44;
9093 if (Imm & 1)
9094 NewImm |= 0x0a;
9095 if (Imm & 2)
9096 NewImm |= 0xa0;
9097 MI.getOperand(3).setImm(NewImm);
9098 MI.setDesc(get(X86::SHUFPSrri));
9099 }
9100 return true;
9101 }
9102 }
9103 return false;
9104}
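// Example (illustrative sketch, register names are arbitrary): moving a
// 128-bit BLENDPD into the PackedSingle domain widens its 2-bit lane mask to
// the 4-bit BLENDPS mask, because each double lane covers two float lanes:
//   blendpd $0b10, %xmm1, %xmm0   -->   blendps $0b1100, %xmm1, %xmm0
// AdjustBlendMask performs that bit expansion and the replacement opcode is
// taken from ReplaceableBlendInstrs / ReplaceableBlendAVX2Instrs above.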
9105
9106std::pair<uint16_t, uint16_t>
9107X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
9108 uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9109 unsigned opcode = MI.getOpcode();
9110 uint16_t validDomains = 0;
9111 if (domain) {
9112 // Attempt to match for custom instructions.
9113 validDomains = getExecutionDomainCustom(MI);
9114 if (validDomains)
9115 return std::make_pair(domain, validDomains);
9116
9117 if (lookup(opcode, domain, ReplaceableInstrs)) {
9118 validDomains = 0xe;
9119 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
9120 validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
9121 } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
9122 validDomains = 0x6;
9123 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
9124 // Insert/extract instructions should only affect the domain if AVX2
9125 // is enabled.
9126 if (!Subtarget.hasAVX2())
9127 return std::make_pair(0, 0);
9128 validDomains = 0xe;
9129 } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
9130 validDomains = 0xe;
9131 } else if (Subtarget.hasDQI() &&
9132 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
9133 validDomains = 0xe;
9134 } else if (Subtarget.hasDQI()) {
9135 if (const uint16_t *table =
9136 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) {
9137 if (domain == 1 || (domain == 3 && table[3] == opcode))
9138 validDomains = 0xa;
9139 else
9140 validDomains = 0xc;
9141 }
9142 }
9143 }
9144 return std::make_pair(domain, validDomains);
9145}
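// The second element of the returned pair is a bitmask of legal domains, with
// bit N set when domain N is available (1 = PackedSingle, 2 = PackedDouble,
// 3 = PackedInt). Hence 0xe means all three vector domains, 0x6 only the two
// FP domains, and 0xa/0xc the masked AVX-512DQ cases. Minimal caller sketch
// (illustrative only, using the hooks defined in this file):
//   auto [Dom, Valid] = TII->getExecutionDomain(MI);
//   if (Dom && (Valid & (1u << 1)))
//     TII->setExecutionDomain(MI, 1); // e.g. force PackedSingle to avoid a
//                                     // domain-crossing bypass penalty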
9146
9147void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
9148 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9149 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9150 assert(dom && "Not an SSE instruction");
9151
9152 // Attempt to match for custom instructions.
9153 if (setExecutionDomainCustom(MI, Domain))
9154 return;
9155
9156 const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
9157 if (!table) { // try the other table
9158 assert((Subtarget.hasAVX2() || Domain < 3) &&
9159 "256-bit vector operations only available in AVX2");
9160 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
9161 }
9162 if (!table) { // try the FP table
9163 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
9164 assert((!table || Domain < 3) &&
9165 "Can only select PackedSingle or PackedDouble");
9166 }
9167 if (!table) { // try the other table
9168 assert(Subtarget.hasAVX2() &&
9169 "256-bit insert/extract only available in AVX2");
9170 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
9171 }
9172 if (!table) { // try the AVX512 table
9173 assert(Subtarget.hasAVX512() && "Requires AVX-512");
9174 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
9175 // Don't change integer Q instructions to D instructions.
9176 if (table && Domain == 3 && table[3] == MI.getOpcode())
9177 Domain = 4;
9178 }
9179 if (!table) { // try the AVX512DQ table
9180 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9181 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
9182 // Don't change integer Q instructions to D instructions and
9183 // use D instructions if we started with a PS instruction.
9184 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9185 Domain = 4;
9186 }
9187 if (!table) { // try the AVX512DQMasked table
9188 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9189 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
9190 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9191 Domain = 4;
9192 }
9193 assert(table && "Cannot change domain");
9194 MI.setDesc(get(table[Domain - 1]));
9195}
9196
9197void X86InstrInfo::insertNoop(MachineBasicBlock &MBB,
9198 MachineBasicBlock::iterator MI) const {
9199 DebugLoc DL;
9200 BuildMI(MBB, MI, DL, get(X86::NOOP));
9201}
9202
9203/// Return the noop instruction to use for a noop.
9204MCInst X86InstrInfo::getNop() const {
9205 MCInst Nop;
9206 Nop.setOpcode(X86::NOOP);
9207 return Nop;
9208}
9209
9210bool X86InstrInfo::isHighLatencyDef(int opc) const {
9211 switch (opc) {
9212 default:
9213 return false;
9214 case X86::DIVPDrm:
9215 case X86::DIVPDrr:
9216 case X86::DIVPSrm:
9217 case X86::DIVPSrr:
9218 case X86::DIVSDrm:
9219 case X86::DIVSDrm_Int:
9220 case X86::DIVSDrr:
9221 case X86::DIVSDrr_Int:
9222 case X86::DIVSSrm:
9223 case X86::DIVSSrm_Int:
9224 case X86::DIVSSrr:
9225 case X86::DIVSSrr_Int:
9226 case X86::SQRTPDm:
9227 case X86::SQRTPDr:
9228 case X86::SQRTPSm:
9229 case X86::SQRTPSr:
9230 case X86::SQRTSDm:
9231 case X86::SQRTSDm_Int:
9232 case X86::SQRTSDr:
9233 case X86::SQRTSDr_Int:
9234 case X86::SQRTSSm:
9235 case X86::SQRTSSm_Int:
9236 case X86::SQRTSSr:
9237 case X86::SQRTSSr_Int:
9238 // AVX instructions with high latency
9239 case X86::VDIVPDrm:
9240 case X86::VDIVPDrr:
9241 case X86::VDIVPDYrm:
9242 case X86::VDIVPDYrr:
9243 case X86::VDIVPSrm:
9244 case X86::VDIVPSrr:
9245 case X86::VDIVPSYrm:
9246 case X86::VDIVPSYrr:
9247 case X86::VDIVSDrm:
9248 case X86::VDIVSDrm_Int:
9249 case X86::VDIVSDrr:
9250 case X86::VDIVSDrr_Int:
9251 case X86::VDIVSSrm:
9252 case X86::VDIVSSrm_Int:
9253 case X86::VDIVSSrr:
9254 case X86::VDIVSSrr_Int:
9255 case X86::VSQRTPDm:
9256 case X86::VSQRTPDr:
9257 case X86::VSQRTPDYm:
9258 case X86::VSQRTPDYr:
9259 case X86::VSQRTPSm:
9260 case X86::VSQRTPSr:
9261 case X86::VSQRTPSYm:
9262 case X86::VSQRTPSYr:
9263 case X86::VSQRTSDm:
9264 case X86::VSQRTSDm_Int:
9265 case X86::VSQRTSDr:
9266 case X86::VSQRTSDr_Int:
9267 case X86::VSQRTSSm:
9268 case X86::VSQRTSSm_Int:
9269 case X86::VSQRTSSr:
9270 case X86::VSQRTSSr_Int:
9271 // AVX512 instructions with high latency
9272 case X86::VDIVPDZ128rm:
9273 case X86::VDIVPDZ128rmb:
9274 case X86::VDIVPDZ128rmbk:
9275 case X86::VDIVPDZ128rmbkz:
9276 case X86::VDIVPDZ128rmk:
9277 case X86::VDIVPDZ128rmkz:
9278 case X86::VDIVPDZ128rr:
9279 case X86::VDIVPDZ128rrk:
9280 case X86::VDIVPDZ128rrkz:
9281 case X86::VDIVPDZ256rm:
9282 case X86::VDIVPDZ256rmb:
9283 case X86::VDIVPDZ256rmbk:
9284 case X86::VDIVPDZ256rmbkz:
9285 case X86::VDIVPDZ256rmk:
9286 case X86::VDIVPDZ256rmkz:
9287 case X86::VDIVPDZ256rr:
9288 case X86::VDIVPDZ256rrk:
9289 case X86::VDIVPDZ256rrkz:
9290 case X86::VDIVPDZrrb:
9291 case X86::VDIVPDZrrbk:
9292 case X86::VDIVPDZrrbkz:
9293 case X86::VDIVPDZrm:
9294 case X86::VDIVPDZrmb:
9295 case X86::VDIVPDZrmbk:
9296 case X86::VDIVPDZrmbkz:
9297 case X86::VDIVPDZrmk:
9298 case X86::VDIVPDZrmkz:
9299 case X86::VDIVPDZrr:
9300 case X86::VDIVPDZrrk:
9301 case X86::VDIVPDZrrkz:
9302 case X86::VDIVPSZ128rm:
9303 case X86::VDIVPSZ128rmb:
9304 case X86::VDIVPSZ128rmbk:
9305 case X86::VDIVPSZ128rmbkz:
9306 case X86::VDIVPSZ128rmk:
9307 case X86::VDIVPSZ128rmkz:
9308 case X86::VDIVPSZ128rr:
9309 case X86::VDIVPSZ128rrk:
9310 case X86::VDIVPSZ128rrkz:
9311 case X86::VDIVPSZ256rm:
9312 case X86::VDIVPSZ256rmb:
9313 case X86::VDIVPSZ256rmbk:
9314 case X86::VDIVPSZ256rmbkz:
9315 case X86::VDIVPSZ256rmk:
9316 case X86::VDIVPSZ256rmkz:
9317 case X86::VDIVPSZ256rr:
9318 case X86::VDIVPSZ256rrk:
9319 case X86::VDIVPSZ256rrkz:
9320 case X86::VDIVPSZrrb:
9321 case X86::VDIVPSZrrbk:
9322 case X86::VDIVPSZrrbkz:
9323 case X86::VDIVPSZrm:
9324 case X86::VDIVPSZrmb:
9325 case X86::VDIVPSZrmbk:
9326 case X86::VDIVPSZrmbkz:
9327 case X86::VDIVPSZrmk:
9328 case X86::VDIVPSZrmkz:
9329 case X86::VDIVPSZrr:
9330 case X86::VDIVPSZrrk:
9331 case X86::VDIVPSZrrkz:
9332 case X86::VDIVSDZrm:
9333 case X86::VDIVSDZrr:
9334 case X86::VDIVSDZrm_Int:
9335 case X86::VDIVSDZrm_Intk:
9336 case X86::VDIVSDZrm_Intkz:
9337 case X86::VDIVSDZrr_Int:
9338 case X86::VDIVSDZrr_Intk:
9339 case X86::VDIVSDZrr_Intkz:
9340 case X86::VDIVSDZrrb_Int:
9341 case X86::VDIVSDZrrb_Intk:
9342 case X86::VDIVSDZrrb_Intkz:
9343 case X86::VDIVSSZrm:
9344 case X86::VDIVSSZrr:
9345 case X86::VDIVSSZrm_Int:
9346 case X86::VDIVSSZrm_Intk:
9347 case X86::VDIVSSZrm_Intkz:
9348 case X86::VDIVSSZrr_Int:
9349 case X86::VDIVSSZrr_Intk:
9350 case X86::VDIVSSZrr_Intkz:
9351 case X86::VDIVSSZrrb_Int:
9352 case X86::VDIVSSZrrb_Intk:
9353 case X86::VDIVSSZrrb_Intkz:
9354 case X86::VSQRTPDZ128m:
9355 case X86::VSQRTPDZ128mb:
9356 case X86::VSQRTPDZ128mbk:
9357 case X86::VSQRTPDZ128mbkz:
9358 case X86::VSQRTPDZ128mk:
9359 case X86::VSQRTPDZ128mkz:
9360 case X86::VSQRTPDZ128r:
9361 case X86::VSQRTPDZ128rk:
9362 case X86::VSQRTPDZ128rkz:
9363 case X86::VSQRTPDZ256m:
9364 case X86::VSQRTPDZ256mb:
9365 case X86::VSQRTPDZ256mbk:
9366 case X86::VSQRTPDZ256mbkz:
9367 case X86::VSQRTPDZ256mk:
9368 case X86::VSQRTPDZ256mkz:
9369 case X86::VSQRTPDZ256r:
9370 case X86::VSQRTPDZ256rk:
9371 case X86::VSQRTPDZ256rkz:
9372 case X86::VSQRTPDZm:
9373 case X86::VSQRTPDZmb:
9374 case X86::VSQRTPDZmbk:
9375 case X86::VSQRTPDZmbkz:
9376 case X86::VSQRTPDZmk:
9377 case X86::VSQRTPDZmkz:
9378 case X86::VSQRTPDZr:
9379 case X86::VSQRTPDZrb:
9380 case X86::VSQRTPDZrbk:
9381 case X86::VSQRTPDZrbkz:
9382 case X86::VSQRTPDZrk:
9383 case X86::VSQRTPDZrkz:
9384 case X86::VSQRTPSZ128m:
9385 case X86::VSQRTPSZ128mb:
9386 case X86::VSQRTPSZ128mbk:
9387 case X86::VSQRTPSZ128mbkz:
9388 case X86::VSQRTPSZ128mk:
9389 case X86::VSQRTPSZ128mkz:
9390 case X86::VSQRTPSZ128r:
9391 case X86::VSQRTPSZ128rk:
9392 case X86::VSQRTPSZ128rkz:
9393 case X86::VSQRTPSZ256m:
9394 case X86::VSQRTPSZ256mb:
9395 case X86::VSQRTPSZ256mbk:
9396 case X86::VSQRTPSZ256mbkz:
9397 case X86::VSQRTPSZ256mk:
9398 case X86::VSQRTPSZ256mkz:
9399 case X86::VSQRTPSZ256r:
9400 case X86::VSQRTPSZ256rk:
9401 case X86::VSQRTPSZ256rkz:
9402 case X86::VSQRTPSZm:
9403 case X86::VSQRTPSZmb:
9404 case X86::VSQRTPSZmbk:
9405 case X86::VSQRTPSZmbkz:
9406 case X86::VSQRTPSZmk:
9407 case X86::VSQRTPSZmkz:
9408 case X86::VSQRTPSZr:
9409 case X86::VSQRTPSZrb:
9410 case X86::VSQRTPSZrbk:
9411 case X86::VSQRTPSZrbkz:
9412 case X86::VSQRTPSZrk:
9413 case X86::VSQRTPSZrkz:
9414 case X86::VSQRTSDZm:
9415 case X86::VSQRTSDZm_Int:
9416 case X86::VSQRTSDZm_Intk:
9417 case X86::VSQRTSDZm_Intkz:
9418 case X86::VSQRTSDZr:
9419 case X86::VSQRTSDZr_Int:
9420 case X86::VSQRTSDZr_Intk:
9421 case X86::VSQRTSDZr_Intkz:
9422 case X86::VSQRTSDZrb_Int:
9423 case X86::VSQRTSDZrb_Intk:
9424 case X86::VSQRTSDZrb_Intkz:
9425 case X86::VSQRTSSZm:
9426 case X86::VSQRTSSZm_Int:
9427 case X86::VSQRTSSZm_Intk:
9428 case X86::VSQRTSSZm_Intkz:
9429 case X86::VSQRTSSZr:
9430 case X86::VSQRTSSZr_Int:
9431 case X86::VSQRTSSZr_Intk:
9432 case X86::VSQRTSSZr_Intkz:
9433 case X86::VSQRTSSZrb_Int:
9434 case X86::VSQRTSSZrb_Intk:
9435 case X86::VSQRTSSZrb_Intkz:
9436
9437 case X86::VGATHERDPDYrm:
9438 case X86::VGATHERDPDZ128rm:
9439 case X86::VGATHERDPDZ256rm:
9440 case X86::VGATHERDPDZrm:
9441 case X86::VGATHERDPDrm:
9442 case X86::VGATHERDPSYrm:
9443 case X86::VGATHERDPSZ128rm:
9444 case X86::VGATHERDPSZ256rm:
9445 case X86::VGATHERDPSZrm:
9446 case X86::VGATHERDPSrm:
9447 case X86::VGATHERPF0DPDm:
9448 case X86::VGATHERPF0DPSm:
9449 case X86::VGATHERPF0QPDm:
9450 case X86::VGATHERPF0QPSm:
9451 case X86::VGATHERPF1DPDm:
9452 case X86::VGATHERPF1DPSm:
9453 case X86::VGATHERPF1QPDm:
9454 case X86::VGATHERPF1QPSm:
9455 case X86::VGATHERQPDYrm:
9456 case X86::VGATHERQPDZ128rm:
9457 case X86::VGATHERQPDZ256rm:
9458 case X86::VGATHERQPDZrm:
9459 case X86::VGATHERQPDrm:
9460 case X86::VGATHERQPSYrm:
9461 case X86::VGATHERQPSZ128rm:
9462 case X86::VGATHERQPSZ256rm:
9463 case X86::VGATHERQPSZrm:
9464 case X86::VGATHERQPSrm:
9465 case X86::VPGATHERDDYrm:
9466 case X86::VPGATHERDDZ128rm:
9467 case X86::VPGATHERDDZ256rm:
9468 case X86::VPGATHERDDZrm:
9469 case X86::VPGATHERDDrm:
9470 case X86::VPGATHERDQYrm:
9471 case X86::VPGATHERDQZ128rm:
9472 case X86::VPGATHERDQZ256rm:
9473 case X86::VPGATHERDQZrm:
9474 case X86::VPGATHERDQrm:
9475 case X86::VPGATHERQDYrm:
9476 case X86::VPGATHERQDZ128rm:
9477 case X86::VPGATHERQDZ256rm:
9478 case X86::VPGATHERQDZrm:
9479 case X86::VPGATHERQDrm:
9480 case X86::VPGATHERQQYrm:
9481 case X86::VPGATHERQQZ128rm:
9482 case X86::VPGATHERQQZ256rm:
9483 case X86::VPGATHERQQZrm:
9484 case X86::VPGATHERQQrm:
9485 case X86::VSCATTERDPDZ128mr:
9486 case X86::VSCATTERDPDZ256mr:
9487 case X86::VSCATTERDPDZmr:
9488 case X86::VSCATTERDPSZ128mr:
9489 case X86::VSCATTERDPSZ256mr:
9490 case X86::VSCATTERDPSZmr:
9491 case X86::VSCATTERPF0DPDm:
9492 case X86::VSCATTERPF0DPSm:
9493 case X86::VSCATTERPF0QPDm:
9494 case X86::VSCATTERPF0QPSm:
9495 case X86::VSCATTERPF1DPDm:
9496 case X86::VSCATTERPF1DPSm:
9497 case X86::VSCATTERPF1QPDm:
9498 case X86::VSCATTERPF1QPSm:
9499 case X86::VSCATTERQPDZ128mr:
9500 case X86::VSCATTERQPDZ256mr:
9501 case X86::VSCATTERQPDZmr:
9502 case X86::VSCATTERQPSZ128mr:
9503 case X86::VSCATTERQPSZ256mr:
9504 case X86::VSCATTERQPSZmr:
9505 case X86::VPSCATTERDDZ128mr:
9506 case X86::VPSCATTERDDZ256mr:
9507 case X86::VPSCATTERDDZmr:
9508 case X86::VPSCATTERDQZ128mr:
9509 case X86::VPSCATTERDQZ256mr:
9510 case X86::VPSCATTERDQZmr:
9511 case X86::VPSCATTERQDZ128mr:
9512 case X86::VPSCATTERQDZ256mr:
9513 case X86::VPSCATTERQDZmr:
9514 case X86::VPSCATTERQQZ128mr:
9515 case X86::VPSCATTERQQZ256mr:
9516 case X86::VPSCATTERQQZmr:
9517 return true;
9518 }
9519}
9520
9521bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
9522 const MachineRegisterInfo *MRI,
9523 const MachineInstr &DefMI,
9524 unsigned DefIdx,
9525 const MachineInstr &UseMI,
9526 unsigned UseIdx) const {
9527 return isHighLatencyDef(DefMI.getOpcode());
9528}
9529
9530bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
9531 const MachineBasicBlock *MBB) const {
9532 assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
9533 Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
9534
9535 // Integer binary math/logic instructions have a third source operand:
9536 // the EFLAGS register. That operand must be both defined here and never
9537 // used; i.e., it must be dead. If the EFLAGS operand is live, then we can
9538 // not change anything because rearranging the operands could affect other
9539 // instructions that depend on the exact status flags (zero, sign, etc.)
9540 // that are set by using these particular operands with this operation.
9541 const MachineOperand *FlagDef = Inst.findRegisterDefOperand(X86::EFLAGS);
9542 assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?");
9543 if (FlagDef && !FlagDef->isDead())
9544 return false;
9545
9546 return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
9547}
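// Illustrative MIR sketch of the constraint checked above (virtual register
// numbers are made up):
//   %2:gr32 = ADD32rr %0, %1, implicit-def dead $eflags   ; reassociable
//   %3:gr32 = ADD32rr %2, %4, implicit-def $eflags        ; EFLAGS live
//   JCC_1 %bb.1, 4, implicit $eflags
// Reordering the second ADD would change the flags the branch observes, so
// only instructions whose EFLAGS def is dead may be reassociated.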
9548
9549// TODO: There are many more machine instruction opcodes to match:
9550// 1. Other data types (integer, vectors)
9551// 2. Other math / logic operations (xor, or)
9552// 3. Other forms of the same operation (intrinsics and other variants)
9553bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
9554 bool Invert) const {
9555 if (Invert)
9556 return false;
9557 switch (Inst.getOpcode()) {
9558 CASE_ND(ADD8rr)
9559 CASE_ND(ADD16rr)
9560 CASE_ND(ADD32rr)
9561 CASE_ND(ADD64rr)
9562 CASE_ND(AND8rr)
9563 CASE_ND(AND16rr)
9564 CASE_ND(AND32rr)
9565 CASE_ND(AND64rr)
9566 CASE_ND(OR8rr)
9567 CASE_ND(OR16rr)
9568 CASE_ND(OR32rr)
9569 CASE_ND(OR64rr)
9570 CASE_ND(XOR8rr)
9571 CASE_ND(XOR16rr)
9572 CASE_ND(XOR32rr)
9573 CASE_ND(XOR64rr)
9574 CASE_ND(IMUL16rr)
9575 CASE_ND(IMUL32rr)
9576 CASE_ND(IMUL64rr)
9577 case X86::PANDrr:
9578 case X86::PORrr:
9579 case X86::PXORrr:
9580 case X86::ANDPDrr:
9581 case X86::ANDPSrr:
9582 case X86::ORPDrr:
9583 case X86::ORPSrr:
9584 case X86::XORPDrr:
9585 case X86::XORPSrr:
9586 case X86::PADDBrr:
9587 case X86::PADDWrr:
9588 case X86::PADDDrr:
9589 case X86::PADDQrr:
9590 case X86::PMULLWrr:
9591 case X86::PMULLDrr:
9592 case X86::PMAXSBrr:
9593 case X86::PMAXSDrr:
9594 case X86::PMAXSWrr:
9595 case X86::PMAXUBrr:
9596 case X86::PMAXUDrr:
9597 case X86::PMAXUWrr:
9598 case X86::PMINSBrr:
9599 case X86::PMINSDrr:
9600 case X86::PMINSWrr:
9601 case X86::PMINUBrr:
9602 case X86::PMINUDrr:
9603 case X86::PMINUWrr:
9604 case X86::VPANDrr:
9605 case X86::VPANDYrr:
9606 case X86::VPANDDZ128rr:
9607 case X86::VPANDDZ256rr:
9608 case X86::VPANDDZrr:
9609 case X86::VPANDQZ128rr:
9610 case X86::VPANDQZ256rr:
9611 case X86::VPANDQZrr:
9612 case X86::VPORrr:
9613 case X86::VPORYrr:
9614 case X86::VPORDZ128rr:
9615 case X86::VPORDZ256rr:
9616 case X86::VPORDZrr:
9617 case X86::VPORQZ128rr:
9618 case X86::VPORQZ256rr:
9619 case X86::VPORQZrr:
9620 case X86::VPXORrr:
9621 case X86::VPXORYrr:
9622 case X86::VPXORDZ128rr:
9623 case X86::VPXORDZ256rr:
9624 case X86::VPXORDZrr:
9625 case X86::VPXORQZ128rr:
9626 case X86::VPXORQZ256rr:
9627 case X86::VPXORQZrr:
9628 case X86::VANDPDrr:
9629 case X86::VANDPSrr:
9630 case X86::VANDPDYrr:
9631 case X86::VANDPSYrr:
9632 case X86::VANDPDZ128rr:
9633 case X86::VANDPSZ128rr:
9634 case X86::VANDPDZ256rr:
9635 case X86::VANDPSZ256rr:
9636 case X86::VANDPDZrr:
9637 case X86::VANDPSZrr:
9638 case X86::VORPDrr:
9639 case X86::VORPSrr:
9640 case X86::VORPDYrr:
9641 case X86::VORPSYrr:
9642 case X86::VORPDZ128rr:
9643 case X86::VORPSZ128rr:
9644 case X86::VORPDZ256rr:
9645 case X86::VORPSZ256rr:
9646 case X86::VORPDZrr:
9647 case X86::VORPSZrr:
9648 case X86::VXORPDrr:
9649 case X86::VXORPSrr:
9650 case X86::VXORPDYrr:
9651 case X86::VXORPSYrr:
9652 case X86::VXORPDZ128rr:
9653 case X86::VXORPSZ128rr:
9654 case X86::VXORPDZ256rr:
9655 case X86::VXORPSZ256rr:
9656 case X86::VXORPDZrr:
9657 case X86::VXORPSZrr:
9658 case X86::KADDBrr:
9659 case X86::KADDWrr:
9660 case X86::KADDDrr:
9661 case X86::KADDQrr:
9662 case X86::KANDBrr:
9663 case X86::KANDWrr:
9664 case X86::KANDDrr:
9665 case X86::KANDQrr:
9666 case X86::KORBrr:
9667 case X86::KORWrr:
9668 case X86::KORDrr:
9669 case X86::KORQrr:
9670 case X86::KXORBrr:
9671 case X86::KXORWrr:
9672 case X86::KXORDrr:
9673 case X86::KXORQrr:
9674 case X86::VPADDBrr:
9675 case X86::VPADDWrr:
9676 case X86::VPADDDrr:
9677 case X86::VPADDQrr:
9678 case X86::VPADDBYrr:
9679 case X86::VPADDWYrr:
9680 case X86::VPADDDYrr:
9681 case X86::VPADDQYrr:
9682 case X86::VPADDBZ128rr:
9683 case X86::VPADDWZ128rr:
9684 case X86::VPADDDZ128rr:
9685 case X86::VPADDQZ128rr:
9686 case X86::VPADDBZ256rr:
9687 case X86::VPADDWZ256rr:
9688 case X86::VPADDDZ256rr:
9689 case X86::VPADDQZ256rr:
9690 case X86::VPADDBZrr:
9691 case X86::VPADDWZrr:
9692 case X86::VPADDDZrr:
9693 case X86::VPADDQZrr:
9694 case X86::VPMULLWrr:
9695 case X86::VPMULLWYrr:
9696 case X86::VPMULLWZ128rr:
9697 case X86::VPMULLWZ256rr:
9698 case X86::VPMULLWZrr:
9699 case X86::VPMULLDrr:
9700 case X86::VPMULLDYrr:
9701 case X86::VPMULLDZ128rr:
9702 case X86::VPMULLDZ256rr:
9703 case X86::VPMULLDZrr:
9704 case X86::VPMULLQZ128rr:
9705 case X86::VPMULLQZ256rr:
9706 case X86::VPMULLQZrr:
9707 case X86::VPMAXSBrr:
9708 case X86::VPMAXSBYrr:
9709 case X86::VPMAXSBZ128rr:
9710 case X86::VPMAXSBZ256rr:
9711 case X86::VPMAXSBZrr:
9712 case X86::VPMAXSDrr:
9713 case X86::VPMAXSDYrr:
9714 case X86::VPMAXSDZ128rr:
9715 case X86::VPMAXSDZ256rr:
9716 case X86::VPMAXSDZrr:
9717 case X86::VPMAXSQZ128rr:
9718 case X86::VPMAXSQZ256rr:
9719 case X86::VPMAXSQZrr:
9720 case X86::VPMAXSWrr:
9721 case X86::VPMAXSWYrr:
9722 case X86::VPMAXSWZ128rr:
9723 case X86::VPMAXSWZ256rr:
9724 case X86::VPMAXSWZrr:
9725 case X86::VPMAXUBrr:
9726 case X86::VPMAXUBYrr:
9727 case X86::VPMAXUBZ128rr:
9728 case X86::VPMAXUBZ256rr:
9729 case X86::VPMAXUBZrr:
9730 case X86::VPMAXUDrr:
9731 case X86::VPMAXUDYrr:
9732 case X86::VPMAXUDZ128rr:
9733 case X86::VPMAXUDZ256rr:
9734 case X86::VPMAXUDZrr:
9735 case X86::VPMAXUQZ128rr:
9736 case X86::VPMAXUQZ256rr:
9737 case X86::VPMAXUQZrr:
9738 case X86::VPMAXUWrr:
9739 case X86::VPMAXUWYrr:
9740 case X86::VPMAXUWZ128rr:
9741 case X86::VPMAXUWZ256rr:
9742 case X86::VPMAXUWZrr:
9743 case X86::VPMINSBrr:
9744 case X86::VPMINSBYrr:
9745 case X86::VPMINSBZ128rr:
9746 case X86::VPMINSBZ256rr:
9747 case X86::VPMINSBZrr:
9748 case X86::VPMINSDrr:
9749 case X86::VPMINSDYrr:
9750 case X86::VPMINSDZ128rr:
9751 case X86::VPMINSDZ256rr:
9752 case X86::VPMINSDZrr:
9753 case X86::VPMINSQZ128rr:
9754 case X86::VPMINSQZ256rr:
9755 case X86::VPMINSQZrr:
9756 case X86::VPMINSWrr:
9757 case X86::VPMINSWYrr:
9758 case X86::VPMINSWZ128rr:
9759 case X86::VPMINSWZ256rr:
9760 case X86::VPMINSWZrr:
9761 case X86::VPMINUBrr:
9762 case X86::VPMINUBYrr:
9763 case X86::VPMINUBZ128rr:
9764 case X86::VPMINUBZ256rr:
9765 case X86::VPMINUBZrr:
9766 case X86::VPMINUDrr:
9767 case X86::VPMINUDYrr:
9768 case X86::VPMINUDZ128rr:
9769 case X86::VPMINUDZ256rr:
9770 case X86::VPMINUDZrr:
9771 case X86::VPMINUQZ128rr:
9772 case X86::VPMINUQZ256rr:
9773 case X86::VPMINUQZrr:
9774 case X86::VPMINUWrr:
9775 case X86::VPMINUWYrr:
9776 case X86::VPMINUWZ128rr:
9777 case X86::VPMINUWZ256rr:
9778 case X86::VPMINUWZrr:
9779 // Normal min/max instructions are not commutative because of NaN and signed
9780 // zero semantics, but these are. Thus, there's no need to check for global
9781 // relaxed math; the instructions themselves have the properties we need.
9782 case X86::MAXCPDrr:
9783 case X86::MAXCPSrr:
9784 case X86::MAXCSDrr:
9785 case X86::MAXCSSrr:
9786 case X86::MINCPDrr:
9787 case X86::MINCPSrr:
9788 case X86::MINCSDrr:
9789 case X86::MINCSSrr:
9790 case X86::VMAXCPDrr:
9791 case X86::VMAXCPSrr:
9792 case X86::VMAXCPDYrr:
9793 case X86::VMAXCPSYrr:
9794 case X86::VMAXCPDZ128rr:
9795 case X86::VMAXCPSZ128rr:
9796 case X86::VMAXCPDZ256rr:
9797 case X86::VMAXCPSZ256rr:
9798 case X86::VMAXCPDZrr:
9799 case X86::VMAXCPSZrr:
9800 case X86::VMAXCSDrr:
9801 case X86::VMAXCSSrr:
9802 case X86::VMAXCSDZrr:
9803 case X86::VMAXCSSZrr:
9804 case X86::VMINCPDrr:
9805 case X86::VMINCPSrr:
9806 case X86::VMINCPDYrr:
9807 case X86::VMINCPSYrr:
9808 case X86::VMINCPDZ128rr:
9809 case X86::VMINCPSZ128rr:
9810 case X86::VMINCPDZ256rr:
9811 case X86::VMINCPSZ256rr:
9812 case X86::VMINCPDZrr:
9813 case X86::VMINCPSZrr:
9814 case X86::VMINCSDrr:
9815 case X86::VMINCSSrr:
9816 case X86::VMINCSDZrr:
9817 case X86::VMINCSSZrr:
9818 case X86::VMAXCPHZ128rr:
9819 case X86::VMAXCPHZ256rr:
9820 case X86::VMAXCPHZrr:
9821 case X86::VMAXCSHZrr:
9822 case X86::VMINCPHZ128rr:
9823 case X86::VMINCPHZ256rr:
9824 case X86::VMINCPHZrr:
9825 case X86::VMINCSHZrr:
9826 return true;
9827 case X86::ADDPDrr:
9828 case X86::ADDPSrr:
9829 case X86::ADDSDrr:
9830 case X86::ADDSSrr:
9831 case X86::MULPDrr:
9832 case X86::MULPSrr:
9833 case X86::MULSDrr:
9834 case X86::MULSSrr:
9835 case X86::VADDPDrr:
9836 case X86::VADDPSrr:
9837 case X86::VADDPDYrr:
9838 case X86::VADDPSYrr:
9839 case X86::VADDPDZ128rr:
9840 case X86::VADDPSZ128rr:
9841 case X86::VADDPDZ256rr:
9842 case X86::VADDPSZ256rr:
9843 case X86::VADDPDZrr:
9844 case X86::VADDPSZrr:
9845 case X86::VADDSDrr:
9846 case X86::VADDSSrr:
9847 case X86::VADDSDZrr:
9848 case X86::VADDSSZrr:
9849 case X86::VMULPDrr:
9850 case X86::VMULPSrr:
9851 case X86::VMULPDYrr:
9852 case X86::VMULPSYrr:
9853 case X86::VMULPDZ128rr:
9854 case X86::VMULPSZ128rr:
9855 case X86::VMULPDZ256rr:
9856 case X86::VMULPSZ256rr:
9857 case X86::VMULPDZrr:
9858 case X86::VMULPSZrr:
9859 case X86::VMULSDrr:
9860 case X86::VMULSSrr:
9861 case X86::VMULSDZrr:
9862 case X86::VMULSSZrr:
9863 case X86::VADDPHZ128rr:
9864 case X86::VADDPHZ256rr:
9865 case X86::VADDPHZrr:
9866 case X86::VADDSHZrr:
9867 case X86::VMULPHZ128rr:
9868 case X86::VMULPHZ256rr:
9869 case X86::VMULPHZrr:
9870 case X86::VMULSHZrr:
9871 return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
9872 Inst.getFlag(MachineInstr::MIFlag::FmNsz);
9873 default:
9874 return false;
9875 }
9876}
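// Why this hook matters (sketch): the MachineCombiner uses it to rebalance
// long dependency chains, e.g.
//   t1 = add a, b;  t2 = add t1, c;  t3 = add t2, d      // depth 3
// can become
//   t1 = add a, b;  t2 = add c, d;   t3 = add t1, t2     // depth 2
// The plain FP add/mul cases at the end are only reassociable when the
// instruction carries the reassoc and nsz fast-math flags, since FP
// arithmetic is not associative in general.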
9877
9878/// If \p DescribedReg overlaps with the MOVrr instruction's destination
9879/// register then, if possible, describe the value in terms of the source
9880/// register.
9881static std::optional<ParamLoadedValue>
9882describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg,
9883 const TargetRegisterInfo *TRI) {
9884 Register DestReg = MI.getOperand(0).getReg();
9885 Register SrcReg = MI.getOperand(1).getReg();
9886
9887 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
9888
9889 // If the described register is the destination, just return the source.
9890 if (DestReg == DescribedReg)
9891 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9892
9893 // If the described register is a sub-register of the destination register,
9894 // then pick out the source register's corresponding sub-register.
9895 if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
9896 Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
9897 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9898 }
9899
9900 // The remaining case to consider is when the described register is a
9901 // super-register of the destination register. MOV8rr and MOV16rr do not
9902 // write to any of the other bytes in the register, meaning that we'd have to
9903 // describe the value using a combination of the source register and the
9904 // non-overlapping bits in the described register, which is not currently
9905 // possible.
9906 if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
9907 !TRI->isSuperRegister(DestReg, DescribedReg))
9908 return std::nullopt;
9909
9910 assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
9911 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9912}
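// Example (illustrative): for "$ebx = MOV32rr $ecx", a request to describe
// $rbx (a super-register of the destination) returns $ecx with an empty
// expression, because MOV32rr implicitly zeroes bits 32-63. The same request
// against a MOV16rr destination returns std::nullopt, since the untouched
// upper bits would also be needed to describe the value.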
9913
9914std::optional<ParamLoadedValue>
9915X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const {
9916 const MachineOperand *Op = nullptr;
9917 DIExpression *Expr = nullptr;
9918
9919 const TargetRegisterInfo *TRI = &getRegisterInfo();
9920
9921 switch (MI.getOpcode()) {
9922 case X86::LEA32r:
9923 case X86::LEA64r:
9924 case X86::LEA64_32r: {
9925 // We may need to describe a 64-bit parameter with a 32-bit LEA.
9926 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
9927 return std::nullopt;
9928
9929 // Operand 4 could be a global address. For now we do not support
9930 // such situations.
9931 if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
9932 return std::nullopt;
9933
9934 const MachineOperand &Op1 = MI.getOperand(1);
9935 const MachineOperand &Op2 = MI.getOperand(3);
9936 assert(Op2.isReg() &&
9937 (Op2.getReg() == X86::NoRegister || Op2.getReg().isPhysical()));
9938
9939 // Omit situations like:
9940 // %rsi = lea %rsi, 4, ...
9941 if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
9942 Op2.getReg() == MI.getOperand(0).getReg())
9943 return std::nullopt;
9944 else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
9945 TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
9946 (Op2.getReg() != X86::NoRegister &&
9947 TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
9948 return std::nullopt;
9949
9950 int64_t Coef = MI.getOperand(2).getImm();
9951 int64_t Offset = MI.getOperand(4).getImm();
9952 SmallVector<uint64_t, 8> Ops;
9953
9954 if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
9955 Op = &Op1;
9956 } else if (Op1.isFI())
9957 Op = &Op1;
9958
9959 if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
9960 Ops.push_back(dwarf::DW_OP_constu);
9961 Ops.push_back(Coef + 1);
9962 Ops.push_back(dwarf::DW_OP_mul);
9963 } else {
9964 if (Op && Op2.getReg() != X86::NoRegister) {
9965 int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
9966 if (dwarfReg < 0)
9967 return std::nullopt;
9968 else if (dwarfReg < 32) {
9969 Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
9970 Ops.push_back(0);
9971 } else {
9972 Ops.push_back(dwarf::DW_OP_bregx);
9973 Ops.push_back(dwarfReg);
9974 Ops.push_back(0);
9975 }
9976 } else if (!Op) {
9977 assert(Op2.getReg() != X86::NoRegister);
9978 Op = &Op2;
9979 }
9980
9981 if (Coef > 1) {
9982 assert(Op2.getReg() != X86::NoRegister);
9983 Ops.push_back(dwarf::DW_OP_constu);
9984 Ops.push_back(Coef);
9985 Ops.push_back(dwarf::DW_OP_mul);
9986 }
9987
9988 if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
9989 Op2.getReg() != X86::NoRegister) {
9990 Ops.push_back(dwarf::DW_OP_plus);
9991 }
9992 }
9993
9994 DIExpression::appendOffset(Ops, Offset);
9995 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
9996
9997 return ParamLoadedValue(*Op, Expr);
9998 }
9999 case X86::MOV8ri:
10000 case X86::MOV16ri:
10001 // TODO: Handle MOV8ri and MOV16ri.
10002 return std::nullopt;
10003 case X86::MOV32ri:
10004 case X86::MOV64ri:
10005 case X86::MOV64ri32:
10006 // MOV32ri may be used for producing zero-extended 32-bit immediates in
10007 // 64-bit parameters, so we need to consider super-registers.
10008 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10009 return std::nullopt;
10010 return ParamLoadedValue(MI.getOperand(1), Expr);
10011 case X86::MOV8rr:
10012 case X86::MOV16rr:
10013 case X86::MOV32rr:
10014 case X86::MOV64rr:
10015 return describeMOVrrLoadedValue(MI, Reg, TRI);
10016 case X86::XOR32rr: {
10017 // 64-bit parameters are zero-materialized using XOR32rr, so also consider
10018 // super-registers.
10019 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10020 return std::nullopt;
10021 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
10022 return ParamLoadedValue(MachineOperand::CreateImm(0), Expr);
10023 return std::nullopt;
10024 }
10025 case X86::MOVSX64rr32: {
10026 // We may need to describe the lower 32 bits of the MOVSX; for example, in
10027 // cases like this:
10028 //
10029 // $ebx = [...]
10030 // $rdi = MOVSX64rr32 $ebx
10031 // $esi = MOV32rr $edi
10032 if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
10033 return std::nullopt;
10034
10035 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10036
10037 // If the described register is the destination register we need to
10038 // sign-extend the source register from 32 bits. The other case we handle
10039 // is when the described register is the 32-bit sub-register of the
10040 // destination register, in case we just need to return the source
10041 // register.
10042 if (Reg == MI.getOperand(0).getReg())
10043 Expr = DIExpression::appendExt(Expr, 32, 64, true);
10044 else
10045 assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) &&
10046 "Unhandled sub-register case for MOVSX64rr32");
10047
10048 return ParamLoadedValue(MI.getOperand(1), Expr);
10049 }
10050 default:
10051 assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
10052 return TargetInstrInfo::describeLoadedValue(MI, Reg);
10053 }
10054}
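// Worked example for the LEA case above (illustrative, arbitrary registers):
//   $rdi = LEA64r $rbx, 1, $noreg, 8, $noreg
// has base $rbx, scale 1, no index and displacement 8, so the value of $rdi
// is described as register $rbx with the expression DW_OP_plus_uconst 8
// produced by DIExpression::appendOffset.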
10055
10056/// This is an architecture-specific helper function of reassociateOps.
10057/// Set special operand attributes for new instructions after reassociation.
10058void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
10059 MachineInstr &OldMI2,
10060 MachineInstr &NewMI1,
10061 MachineInstr &NewMI2) const {
10062 // Integer instructions may define an implicit EFLAGS dest register operand.
10063 MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS);
10064 MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS);
10065
10066 assert(!OldFlagDef1 == !OldFlagDef2 &&
10067 "Unexpected instruction type for reassociation");
10068
10069 if (!OldFlagDef1 || !OldFlagDef2)
10070 return;
10071
10072 assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
10073 "Must have dead EFLAGS operand in reassociable instruction");
10074
10075 MachineOperand *NewFlagDef1 = NewMI1.findRegisterDefOperand(X86::EFLAGS);
10076 MachineOperand *NewFlagDef2 = NewMI2.findRegisterDefOperand(X86::EFLAGS);
10077
10078 assert(NewFlagDef1 && NewFlagDef2 &&
10079 "Unexpected operand in reassociable instruction");
10080
10081 // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
10082 // of this pass or other passes. The EFLAGS operands must be dead in these new
10083 // instructions because the EFLAGS operands in the original instructions must
10084 // be dead in order for reassociation to occur.
10085 NewFlagDef1->setIsDead();
10086 NewFlagDef2->setIsDead();
10087}
10088
10089std::pair<unsigned, unsigned>
10090X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10091 return std::make_pair(TF, 0u);
10092}
10093
10094ArrayRef<std::pair<unsigned, const char *>>
10095X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
10096 using namespace X86II;
10097 static const std::pair<unsigned, const char *> TargetFlags[] = {
10098 {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
10099 {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
10100 {MO_GOT, "x86-got"},
10101 {MO_GOTOFF, "x86-gotoff"},
10102 {MO_GOTPCREL, "x86-gotpcrel"},
10103 {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"},
10104 {MO_PLT, "x86-plt"},
10105 {MO_TLSGD, "x86-tlsgd"},
10106 {MO_TLSLD, "x86-tlsld"},
10107 {MO_TLSLDM, "x86-tlsldm"},
10108 {MO_GOTTPOFF, "x86-gottpoff"},
10109 {MO_INDNTPOFF, "x86-indntpoff"},
10110 {MO_TPOFF, "x86-tpoff"},
10111 {MO_DTPOFF, "x86-dtpoff"},
10112 {MO_NTPOFF, "x86-ntpoff"},
10113 {MO_GOTNTPOFF, "x86-gotntpoff"},
10114 {MO_DLLIMPORT, "x86-dllimport"},
10115 {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
10116 {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
10117 {MO_TLVP, "x86-tlvp"},
10118 {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
10119 {MO_SECREL, "x86-secrel"},
10120 {MO_COFFSTUB, "x86-coffstub"}};
10121 return ArrayRef(TargetFlags);
10122}
10123
10124namespace {
10125/// Create Global Base Reg pass. This initializes the PIC
10126/// global base register for x86-32.
10127struct CGBR : public MachineFunctionPass {
10128 static char ID;
10129 CGBR() : MachineFunctionPass(ID) {}
10130
10131 bool runOnMachineFunction(MachineFunction &MF) override {
10132 const X86TargetMachine *TM =
10133 static_cast<const X86TargetMachine *>(&MF.getTarget());
10134 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
10135
10136 // Only emit a global base reg in PIC mode.
10137 if (!TM->isPositionIndependent())
10138 return false;
10139
10140 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
10141 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
10142
10143 // If we didn't need a GlobalBaseReg, don't insert code.
10144 if (GlobalBaseReg == 0)
10145 return false;
10146
10147 // Insert the set of GlobalBaseReg into the first MBB of the function
10148 MachineBasicBlock &FirstMBB = MF.front();
10149 MachineBasicBlock::iterator MBBI = FirstMBB.begin();
10150 DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
10151 MachineRegisterInfo &RegInfo = MF.getRegInfo();
10152 const X86InstrInfo *TII = STI.getInstrInfo();
10153
10154 Register PC;
10155 if (STI.isPICStyleGOT())
10156 PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
10157 else
10158 PC = GlobalBaseReg;
10159
10160 if (STI.is64Bit()) {
10161 if (TM->getCodeModel() == CodeModel::Large) {
10162 // In the large code model, we are aiming for this code, though the
10163 // register allocation may vary:
10164 // leaq .LN$pb(%rip), %rax
10165 // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
10166 // addq %rcx, %rax
10167 // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
10168 Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10169 Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10170 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
10171 .addReg(X86::RIP)
10172 .addImm(0)
10173 .addReg(0)
10174 .addSym(MF.getPICBaseSymbol())
10175 .addReg(0);
10176 std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
10177 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
10178 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10179 X86II::MO_PIC_BASE_OFFSET);
10180 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
10181 .addReg(PBReg, RegState::Kill)
10182 .addReg(GOTReg, RegState::Kill);
10183 } else {
10184 // In other code models, use a RIP-relative LEA to materialize the
10185 // GOT.
10186 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
10187 .addReg(X86::RIP)
10188 .addImm(0)
10189 .addReg(0)
10190 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
10191 .addReg(0);
10192 }
10193 } else {
10194 // The operand of MovePCtoStack is completely ignored by the asm printer;
10195 // it is only used in JIT code emission as a displacement relative to the PC.
10196 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
10197
10198 // If we're using vanilla 'GOT' PIC style, we should use relative
10199 // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
10200 if (STI.isPICStyleGOT()) {
10201 // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel],
10202 // %some_register
10203 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
10204 .addReg(PC)
10205 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10206 X86II::MO_GOT_ABSOLUTE_ADDRESS);
10207 }
10208 }
10209
10210 return true;
10211 }
10212
10213 StringRef getPassName() const override {
10214 return "X86 PIC Global Base Reg Initialization";
10215 }
10216
10217 void getAnalysisUsage(AnalysisUsage &AU) const override {
10218 AU.setPreservesCFG();
10219 MachineFunctionPass::getAnalysisUsage(AU);
10220 }
10221};
10222} // namespace
10223
10224char CGBR::ID = 0;
10225FunctionPass *llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
10226
10227namespace {
10228struct LDTLSCleanup : public MachineFunctionPass {
10229 static char ID;
10230 LDTLSCleanup() : MachineFunctionPass(ID) {}
10231
10232 bool runOnMachineFunction(MachineFunction &MF) override {
10233 if (skipFunction(MF.getFunction()))
10234 return false;
10235
10236 X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
10237 if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
10238 // No point folding accesses if there aren't at least two.
10239 return false;
10240 }
10241
10242 MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
10243 return VisitNode(DT->getRootNode(), 0);
10244 }
10245
10246 // Visit the dominator subtree rooted at Node in pre-order.
10247 // If TLSBaseAddrReg is non-null, then use that to replace any
10248 // TLS_base_addr instructions. Otherwise, create the register
10249 // when the first such instruction is seen, and then use it
10250 // as we encounter more instructions.
10251 bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
10252 MachineBasicBlock *BB = Node->getBlock();
10253 bool Changed = false;
10254
10255 // Traverse the current block.
10256 for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
10257 ++I) {
10258 switch (I->getOpcode()) {
10259 case X86::TLS_base_addr32:
10260 case X86::TLS_base_addr64:
10261 if (TLSBaseAddrReg)
10262 I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
10263 else
10264 I = SetRegister(*I, &TLSBaseAddrReg);
10265 Changed = true;
10266 break;
10267 default:
10268 break;
10269 }
10270 }
10271
10272 // Visit the children of this block in the dominator tree.
10273 for (auto &I : *Node) {
10274 Changed |= VisitNode(I, TLSBaseAddrReg);
10275 }
10276
10277 return Changed;
10278 }
10279
10280 // Replace the TLS_base_addr instruction I with a copy from
10281 // TLSBaseAddrReg, returning the new instruction.
10282 MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
10283 unsigned TLSBaseAddrReg) {
10284 MachineFunction *MF = I.getParent()->getParent();
10285 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10286 const bool is64Bit = STI.is64Bit();
10287 const X86InstrInfo *TII = STI.getInstrInfo();
10288
10289 // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
10290 MachineInstr *Copy =
10291 BuildMI(*I.getParent(), I, I.getDebugLoc(),
10292 TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
10293 .addReg(TLSBaseAddrReg);
10294
10295 // Erase the TLS_base_addr instruction.
10296 I.eraseFromParent();
10297
10298 return Copy;
10299 }
10300
10301 // Create a virtual register in *TLSBaseAddrReg, and populate it by
10302 // inserting a copy instruction after I. Returns the new instruction.
10303 MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
10304 MachineFunction *MF = I.getParent()->getParent();
10305 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10306 const bool is64Bit = STI.is64Bit();
10307 const X86InstrInfo *TII = STI.getInstrInfo();
10308
10309 // Create a virtual register for the TLS base address.
10310 MachineRegisterInfo &RegInfo = MF->getRegInfo();
10311 *TLSBaseAddrReg = RegInfo.createVirtualRegister(
10312 is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass);
10313
10314 // Insert a copy from RAX/EAX to TLSBaseAddrReg.
10315 MachineInstr *Next = I.getNextNode();
10316 MachineInstr *Copy = BuildMI(*I.getParent(), Next, I.getDebugLoc(),
10317 TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
10318 .addReg(is64Bit ? X86::RAX : X86::EAX);
10319
10320 return Copy;
10321 }
10322
10323 StringRef getPassName() const override {
10324 return "Local Dynamic TLS Access Clean-up";
10325 }
10326
10327 void getAnalysisUsage(AnalysisUsage &AU) const override {
10328 AU.setPreservesCFG();
10329 AU.addRequired<MachineDominatorTree>();
10330 MachineFunctionPass::getAnalysisUsage(AU);
10331 }
10332};
10333} // namespace
10334
10335char LDTLSCleanup::ID = 0;
10336FunctionPass *llvm::createCleanupLocalDynamicTLSPass() {
10337 return new LDTLSCleanup();
10338}
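// Effect of the pass (illustrative sketch, the virtual register name is made
// up):
//   bb.0:  $rax = TLS_base_addr64 ...        ; kept, result saved:
//          %tls:gr64 = COPY $rax
//   bb.1:  $rax = TLS_base_addr64 ...        ; dominated, becomes:
//          $rax = COPY %tls
// so the expensive __tls_get_addr call sequence is emitted only once.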
10339
10340/// Constants defining how certain sequences should be outlined.
10341///
10342/// \p MachineOutlinerDefault implies that the function is called with a call
10343/// instruction, and a return must be emitted for the outlined function frame.
10344///
10345/// That is,
10346///
10347/// I1 OUTLINED_FUNCTION:
10348/// I2 --> call OUTLINED_FUNCTION I1
10349/// I3 I2
10350/// I3
10351/// ret
10352///
10353/// * Call construction overhead: 1 (call instruction)
10354/// * Frame construction overhead: 1 (return instruction)
10355///
10356/// \p MachineOutlinerTailCall implies that the function is being tail called.
10357/// A jump is emitted instead of a call, and the return is already present in
10358/// the outlined sequence. That is,
10359///
10360/// I1 OUTLINED_FUNCTION:
10361/// I2 --> jmp OUTLINED_FUNCTION I1
10362/// ret I2
10363/// ret
10364///
10365/// * Call construction overhead: 1 (jump instruction)
10366/// * Frame construction overhead: 0 (don't need to return)
10367///
10368enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall };
10369
10370std::optional<outliner::OutlinedFunction>
10371X86InstrInfo::getOutliningCandidateInfo(
10372 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
10373 unsigned SequenceSize = 0;
10374 for (auto &MI : RepeatedSequenceLocs[0]) {
10375 // FIXME: x86 doesn't implement getInstSizeInBytes, so
10376 // we can't tell the cost. Just assume each instruction
10377 // is one byte.
10378 if (MI.isDebugInstr() || MI.isKill())
10379 continue;
10380 SequenceSize += 1;
10381 }
10382
10383 // We check to see if CFI Instructions are present, and if they are
10384 // we find the number of CFI Instructions in the candidates.
10385 unsigned CFICount = 0;
10386 for (auto &I : RepeatedSequenceLocs[0]) {
10387 if (I.isCFIInstruction())
10388 CFICount++;
10389 }
10390
10391 // We compare the number of found CFI Instructions to the number of CFI
10392 // instructions in the parent function for each candidate. We must check this
10393 // since if we outline one of the CFI instructions in a function, we have to
10394 // outline them all for correctness. If we do not, the address offsets will be
10395 // incorrect between the two sections of the program.
10396 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10397 std::vector<MCCFIInstruction> CFIInstructions =
10398 C.getMF()->getFrameInstructions();
10399
10400 if (CFICount > 0 && CFICount != CFIInstructions.size())
10401 return std::nullopt;
10402 }
10403
10404 // FIXME: Use real size in bytes for call and ret instructions.
10405 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10406 for (outliner::Candidate &C : RepeatedSequenceLocs)
10407 C.setCallInfo(MachineOutlinerTailCall, 1);
10408
10409 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
10410 0, // Number of bytes to emit frame.
10411 MachineOutlinerTailCall // Type of frame.
10412 );
10413 }
10414
10415 if (CFICount > 0)
10416 return std::nullopt;
10417
10418 for (outliner::Candidate &C : RepeatedSequenceLocs)
10419 C.setCallInfo(MachineOutlinerDefault, 1);
10420
10421 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 1,
10422 MachineOutlinerDefault);
10423}
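// Cost accounting sketch (the 0/1 values are the byte-size placeholders used
// above, not real encodings): a candidate that ends in a terminator is
// outlined as a tail call, so every call site is charged 1 for the JMP and
// the outlined frame is charged 0; any other candidate is charged 1 per site
// for the CALL plus 1 for the RET64 added by buildOutlinedFrame.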
10424
10425bool X86InstrInfo::isFunctionSafeToOutlineFrom(
10426 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10427 const Function &F = MF.getFunction();
10428
10429 // Does the function use a red zone? If it does, then we can't risk messing
10430 // with the stack.
10431 if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
10432 // It could have a red zone. If it does, then we don't want to touch it.
10433 const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
10434 if (!X86FI || X86FI->getUsesRedZone())
10435 return false;
10436 }
10437
10438 // If we *don't* want to outline from things that could potentially be deduped,
10439 // then return false.
10440 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10441 return false;
10442
10443 // This function is viable for outlining, so return true.
10444 return true;
10445}
10446
10447outliner::InstrType
10448X86InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
10449 unsigned Flags) const {
10450 MachineInstr &MI = *MIT;
10451
10452 // Is this a terminator for a basic block?
10453 if (MI.isTerminator())
10454 // TargetInstrInfo::getOutliningType has already filtered out anything
10455 // that would break this, so we can allow it here.
10456 return outliner::InstrType::Legal;
10457
10458 // Don't outline anything that modifies or reads from the stack pointer.
10459 //
10460 // FIXME: There are instructions which are being manually built without
10461 // explicit uses/defs so we also have to check the MCInstrDesc. We should be
10462 // able to remove the extra checks once those are fixed up. For example,
10463 // sometimes we might get something like %rax = POP64r 1. This won't be
10464 // caught by modifiesRegister or readsRegister even though the instruction
10465 // really ought to be formed so that modifiesRegister/readsRegister would
10466 // catch it.
10467 if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
10468 MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
10469 MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
10470 return outliner::InstrType::Illegal;
10471
10472 // Outlined calls change the instruction pointer, so don't read from it.
10473 if (MI.readsRegister(X86::RIP, &RI) ||
10474 MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
10475 MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
10476 return outliner::InstrType::Illegal;
10477
10478 // Don't outline CFI instructions.
10479 if (MI.isCFIInstruction())
10480 return outliner::InstrType::Illegal;
10481
10482 return outliner::InstrType::Legal;
10483}
10484
10485void X86InstrInfo::buildOutlinedFrame(MachineBasicBlock &MBB,
10486 MachineFunction &MF,
10487 const outliner::OutlinedFunction &OF) const {
10488 // If we're a tail call, we already have a return, so don't do anything.
10489 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10490 return;
10491
10492 // We're a normal call, so our sequence doesn't have a return instruction.
10493 // Add it in.
10494 MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
10495 MBB.insert(MBB.end(), retq);
10496}
10497
10498MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(
10499 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10500 MachineFunction &MF, outliner::Candidate &C) const {
10501 // Is it a tail call?
10502 if (C.CallConstructionID == MachineOutlinerTailCall) {
10503 // Yes, just insert a JMP.
10504 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
10505 .addGlobalAddress(M.getNamedValue(MF.getName())));
10506 } else {
10507 // No, insert a call.
10508 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
10509 .addGlobalAddress(M.getNamedValue(MF.getName())));
10510 }
10511
10512 return It;
10513}
10514
10515void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10516 MachineBasicBlock::iterator Iter,
10517 DebugLoc &DL,
10518 bool AllowSideEffects) const {
10519 const MachineFunction &MF = *MBB.getParent();
10520 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
10521 const TargetRegisterInfo &TRI = getRegisterInfo();
10522
10523 if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
10524 // FIXME: Should we ignore MMX registers?
10525 return;
10526
10527 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10528 // Convert register to the 32-bit version. Both 'movl' and 'xorl' clear the
10529 // upper bits of a 64-bit register automagically.
10530 Reg = getX86SubSuperRegister(Reg, 32);
10531
10532 if (!AllowSideEffects)
10533 // XOR affects flags, so use a MOV instead.
10534 BuildMI(MBB, Iter, DL, get(X86::MOV32ri), Reg).addImm(0);
10535 else
10536 BuildMI(MBB, Iter, DL, get(X86::XOR32rr), Reg)
10537 .addReg(Reg, RegState::Undef)
10538 .addReg(Reg, RegState::Undef);
10539 } else if (X86::VR128RegClass.contains(Reg)) {
10540 // XMM#
10541 if (!ST.hasSSE1())
10542 return;
10543
10544 // PXOR is safe to use because it doesn't affect flags.
10545 BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
10546 .addReg(Reg, RegState::Undef)
10547 .addReg(Reg, RegState::Undef);
10548 } else if (X86::VR256RegClass.contains(Reg)) {
10549 // YMM#
10550 if (!ST.hasAVX())
10551 return;
10552
10553 // VPXOR is safe to use because it doesn't affect flags.
10554 BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
10555 .addReg(Reg, RegState::Undef)
10556 .addReg(Reg, RegState::Undef);
10557 } else if (X86::VR512RegClass.contains(Reg)) {
10558 // ZMM#
10559 if (!ST.hasAVX512())
10560 return;
10561
10562 // VPXORY is safe to use because it doesn't affect flags.
10563 BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
10564 .addReg(Reg, RegState::Undef)
10565 .addReg(Reg, RegState::Undef);
10566 } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
10567 X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
10568 X86::VK16RegClass.contains(Reg)) {
10569 if (!ST.hasVLX())
10570 return;
10571
10572 // KXOR is safe to use because it doesn't affect flags.
10573 unsigned Op = ST.hasBWI() ? X86::KXORQrr : X86::KXORWrr;
10574 BuildMI(MBB, Iter, DL, get(Op), Reg)
10575 .addReg(Reg, RegState::Undef)
10576 .addReg(Reg, RegState::Undef);
10577 }
10578}
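// Summary of the zeroing idioms above (illustrative): general-purpose
// registers are cleared through their 32-bit alias, e.g.
//   xorl %eax, %eax        ; when clobbering EFLAGS is acceptable
//   movl $0, %eax          ; when EFLAGS must be preserved
// (writing the 32-bit register implicitly zeroes bits 32-63), vector
// registers use the flag-neutral PXOR/VPXOR forms, and AVX-512 mask
// registers use KXORW/KXORQ.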
10579
10580bool X86InstrInfo::getMachineCombinerPatterns(
10581 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
10582 bool DoRegPressureReduce) const {
10583 unsigned Opc = Root.getOpcode();
10584 switch (Opc) {
10585 case X86::VPDPWSSDrr:
10586 case X86::VPDPWSSDrm:
10587 case X86::VPDPWSSDYrr:
10588 case X86::VPDPWSSDYrm: {
10589 if (!Subtarget.hasFastDPWSSD()) {
10590 Patterns.push_back(X86MachineCombinerPattern::DPWSSD);
10591 return true;
10592 }
10593 break;
10594 }
10595 case X86::VPDPWSSDZ128r:
10596 case X86::VPDPWSSDZ128m:
10597 case X86::VPDPWSSDZ256r:
10598 case X86::VPDPWSSDZ256m:
10599 case X86::VPDPWSSDZr:
10600 case X86::VPDPWSSDZm: {
10601 if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
10602 Patterns.push_back(X86MachineCombinerPattern::DPWSSD);
10603 return true;
10604 }
10605 break;
10606 }
10607 }
10608 return TargetInstrInfo::getMachineCombinerPatterns(Root,
10609 Patterns, DoRegPressureReduce);
10610}
10611
10612static void
10613genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII,
10614 SmallVectorImpl<MachineInstr *> &InsInstrs,
10615 SmallVectorImpl<MachineInstr *> &DelInstrs,
10616 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
10617 MachineFunction *MF = Root.getMF();
10618 MachineRegisterInfo &RegInfo = MF->getRegInfo();
10619
10620 unsigned Opc = Root.getOpcode();
10621 unsigned AddOpc = 0;
10622 unsigned MaddOpc = 0;
10623 switch (Opc) {
10624 default:
10625 assert(false && "It should not reach here");
10626 break;
10627 // vpdpwssd xmm2,xmm3,xmm1
10628 // -->
10629 // vpmaddwd xmm3,xmm3,xmm1
10630 // vpaddd xmm2,xmm2,xmm3
10631 case X86::VPDPWSSDrr:
10632 MaddOpc = X86::VPMADDWDrr;
10633 AddOpc = X86::VPADDDrr;
10634 break;
10635 case X86::VPDPWSSDrm:
10636 MaddOpc = X86::VPMADDWDrm;
10637 AddOpc = X86::VPADDDrr;
10638 break;
10639 case X86::VPDPWSSDZ128r:
10640 MaddOpc = X86::VPMADDWDZ128rr;
10641 AddOpc = X86::VPADDDZ128rr;
10642 break;
10643 case X86::VPDPWSSDZ128m:
10644 MaddOpc = X86::VPMADDWDZ128rm;
10645 AddOpc = X86::VPADDDZ128rr;
10646 break;
10647 // vpdpwssd ymm2,ymm3,ymm1
10648 // -->
10649 // vpmaddwd ymm3,ymm3,ymm1
10650 // vpaddd ymm2,ymm2,ymm3
10651 case X86::VPDPWSSDYrr:
10652 MaddOpc = X86::VPMADDWDYrr;
10653 AddOpc = X86::VPADDDYrr;
10654 break;
10655 case X86::VPDPWSSDYrm:
10656 MaddOpc = X86::VPMADDWDYrm;
10657 AddOpc = X86::VPADDDYrr;
10658 break;
10659 case X86::VPDPWSSDZ256r:
10660 MaddOpc = X86::VPMADDWDZ256rr;
10661 AddOpc = X86::VPADDDZ256rr;
10662 break;
10663 case X86::VPDPWSSDZ256m:
10664 MaddOpc = X86::VPMADDWDZ256rm;
10665 AddOpc = X86::VPADDDZ256rr;
10666 break;
10667 // vpdpwssd zmm2,zmm3,zmm1
10668 // -->
10669 // vpmaddwd zmm3,zmm3,zmm1
10670 // vpaddd zmm2,zmm2,zmm3
10671 case X86::VPDPWSSDZr:
10672 MaddOpc = X86::VPMADDWDZrr;
10673 AddOpc = X86::VPADDDZrr;
10674 break;
10675 case X86::VPDPWSSDZm:
10676 MaddOpc = X86::VPMADDWDZrm;
10677 AddOpc = X86::VPADDDZrr;
10678 break;
10679 }
10680 // Create vpmaddwd.
10681 const TargetRegisterClass *RC =
10682 RegInfo.getRegClass(Root.getOperand(0).getReg());
10683 Register NewReg = RegInfo.createVirtualRegister(RC);
10684 MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
10685 Madd->setDesc(TII.get(MaddOpc));
10686 Madd->untieRegOperand(1);
10687 Madd->removeOperand(1);
10688 Madd->getOperand(0).setReg(NewReg);
10689 InstrIdxForVirtReg.insert(std::make_pair(NewReg, 0));
10690 // Create vpaddd.
10691 Register DstReg = Root.getOperand(0).getReg();
10692 bool IsKill = Root.getOperand(1).isKill();
10693 MachineInstr *Add =
10694 BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
10695 .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
10696 .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
10697 InsInstrs.push_back(Madd);
10698 InsInstrs.push_back(Add);
10699 DelInstrs.push_back(&Root);
10700}
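// Design note (sketch): splitting the fused dot-product keeps only the cheap
// add on the accumulator's dependency chain, e.g.
//   %acc = VPDPWSSDrr %acc, %a, %b
// becomes
//   %t   = VPMADDWDrr %a, %b
//   %acc = VPADDDrr %acc, %t
// which is profitable on subtargets without the fast-DPWSSD tuning flag, as
// the asm comments above illustrate for the 128/256/512-bit forms.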
10701
10702void X86InstrInfo::genAlternativeCodeSequence(
10703 MachineInstr &Root, unsigned Pattern,
10704 SmallVectorImpl<MachineInstr *> &InsInstrs,
10705 SmallVectorImpl<MachineInstr *> &DelInstrs,
10706 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
10707 switch (Pattern) {
10708 default:
10709 // Reassociate instructions.
10710 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
10711 DelInstrs, InstrIdxForVirtReg);
10712 return;
10713 case X86MachineCombinerPattern::DPWSSD:
10714 genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
10715 InstrIdxForVirtReg);
10716 return;
10717 }
10718}
10719
10720// See also: X86DAGToDAGISel::SelectInlineAsmMemoryOperand().
10721void X86InstrInfo::getFrameIndexOperands(SmallVectorImpl<MachineOperand> &Ops,
10722 int FI) const {
10723 X86AddressMode M;
10724 M.BaseType = X86AddressMode::FrameIndexBase;
10725 M.Base.FrameIndex = FI;
10726 M.getFullAddress(Ops);
10727}
10728
10729#define GET_INSTRINFO_HELPERS
10730#include "X86GenInstrInfo.inc"
static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes)
static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, Register Reg)
Expand a single-def pseudo instruction to a two-addr instruction with two k0 reads.
#define VPERM_CASES_BROADCAST(Suffix)
static X86::CondCode isUseDefConvertible(const MachineInstr &MI)
Check whether the use can be converted to remove a comparison against zero.
static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
static unsigned getLoadRegOpcode(Register DestReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold=false)
static MachineInstr * MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI)
#define GET_ND_IF_ENABLED(OPC)
static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, bool ForLoadFold=false)
Return true for all instructions that only update the first 32 or 64-bits of the destination register...
static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, const X86Subtarget &Subtarget)
static const uint16_t * lookupAVX512(unsigned opcode, unsigned domain, ArrayRef< uint16_t[4]> Table)
static MachineInstr * FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII, int PtrOffset=0)
static unsigned getLoadStoreRegOpcode(Register Reg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI, bool Load)
#define VPERM_CASES(Suffix)
#define FROM_TO_SIZE(A, B, S)
static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes)
static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, bool &ClearsOverflowFlag)
Check whether the definition can be converted to remove a comparison against zero.
static bool isHReg(unsigned Reg)
Test if the given register is a physical h register.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode)
static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static MachineBasicBlock * getFallThroughMBB(MachineBasicBlock *MBB, MachineBasicBlock *TBB)
static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineInstr &UserMI, const MachineFunction &MF)
Check if LoadMI is a partial register load that we can't fold into MI because the latter uses content...
static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI)
static cl::opt< bool > PrintFailedFusing("print-failed-fuse-candidates", cl::desc("Print instructions that the allocator wants to" " fuse, but the X86 backend currently can't"), cl::Hidden)
static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx)
static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
This determines which of three possible cases of a three source commute the source indexes correspond...
static bool isX87Reg(unsigned Reg)
Return true if the Reg is X87 register.
static void genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< unsigned, unsigned > &InstrIdxForVirtReg)
static unsigned getTruncatedShiftCount(const MachineInstr &MI, unsigned ShiftAmtOperandIdx)
Check whether the shift count for a machine operand is non-zero.
static SmallVector< MachineMemOperand *, 2 > extractStoreMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static unsigned getBroadcastOpcode(const X86FoldTableEntry *I, const TargetRegisterClass *RC, const X86Subtarget &STI)
static unsigned convertALUrr2ALUri(unsigned Opc)
Convert an ALUrr opcode to corresponding ALUri opcode.
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI)
Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
static bool isCommutableVPERMV3Instruction(unsigned Opcode)
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:184
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:269
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:996
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:1023
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:999
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition: InstrTypes.h:1008
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:997
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:998
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:1017
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition: InstrTypes.h:1007
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:1001
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:1004
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition: InstrTypes.h:1005
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:1000
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:1002
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:1021
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:1009
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition: InstrTypes.h:1006
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:1003
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
DWARF expression.
static void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
static DIExpression * appendExt(const DIExpression *Expr, unsigned FromSize, unsigned ToSize, bool Signed)
Append a zero- or sign-extension to Expr.
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Base class for the actual dominator tree node.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:681
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:678
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:687
SlotIndex InsertMachineInstrInMaps(MachineInstr &MI)
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
A set of physical registers with utility functions to track liveness when walking backward/forward th...
Definition: LivePhysRegs.h:52
void stepForward(const MachineInstr &MI, SmallVectorImpl< std::pair< MCPhysReg, const MachineOperand * > > &Clobbers)
Simulates liveness when stepping forward over an instruction(bundle).
void addLiveOuts(const MachineBasicBlock &MBB)
Adds all live-out registers of basic block MBB.
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
Definition: LiveInterval.h:408
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:799
static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int Adjustment, SMLoc Loc={})
.cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but Offset is a relative value that is added/subt...
Definition: MCDwarf.h:564
Instances of this class represent a single low-level machine instruction.
Definition: MCInst.h:184
void setOpcode(unsigned Op)
Definition: MCInst.h:197
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned getOpcode() const
Return the opcode number for this descriptor.
Definition: MCInstrDesc.h:230
unsigned char NumDefs
Definition: MCInstrDesc.h:207
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
Set of metadata that should be preserved when using BuildMI().
SimpleValueType SimpleTy
unsigned pred_size() const
MachineInstrBundleIterator< const MachineInstr > const_iterator
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
bool isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
reverse_iterator rbegin()
@ LQR_Dead
Register is known to be fully dead.
This class is a data container for one entry in a MachineConstantPool.
bool isMachineConstantPoolEntry() const
isMachineConstantPoolEntry - Return true if the MachineConstantPoolEntry is indeed a target specific ...
union llvm::MachineConstantPoolEntry::@196 Val
The constant itself.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * getRootNode() const
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MCSymbol * getPICBaseSymbol() const
getPICBaseSymbol - Return a function-local symbol to represent the PIC base.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineInstr * CreateMachineInstr(const MCInstrDesc &MCID, DebugLoc DL, bool NoImplicit=false)
CreateMachineInstr - Allocate a new MachineInstr.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
bool needsFrameMoves() const
True if this function needs frame moves for debug or exceptions.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineBasicBlock & front() const
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
Definition: MachineInstr.h:69
mop_iterator operands_begin()
Definition: MachineInstr.h:656
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:546
bool isSafeToMove(AAResults *AA, bool &SawStore) const
Return true if it is safe to move this instruction.
bool isImplicitDef() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:329
void dropDebugNumber()
Drop any variable location debugging information associated with this instruction.
Definition: MachineInstr.h:532
void setPreInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just prior to the instruction itself.
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
Definition: MachineInstr.h:379
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:549
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI=nullptr) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:543
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
unsigned getNumExplicitDefs() const
Returns the number of non-implicit definitions.
void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:792
void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:777
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
void setFlag(MIFlag Flag)
Set a MI flag.
Definition: MachineInstr.h:386
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:475
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:556
unsigned getNumDefs() const
Returns the total number of definitions.
Definition: MachineInstr.h:615
MachineOperand * findRegisterDefOperand(Register Reg, bool isDead=false, bool Overlap=false, const TargetRegisterInfo *TRI=nullptr)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
void setDebugLoc(DebugLoc DL)
Replace current source information with new such.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImplicit(bool Val=true)
void setImm(int64_t immVal)
int64_t getImm() const
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isCPI() const
isCPI - Tests if this is a MO_ConstantPoolIndex operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
bool isJTI() const
isJTI - Tests if this is a MO_JumpTableIndex operand.
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateCPI(unsigned Idx, int Offset, unsigned TargetFlags=0)
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
const TargetRegisterInfo * getTargetRegisterInfo() const
const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:468
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:68
SlotIndex getBaseIndex() const
Returns the base index for associated with this index.
Definition: SlotIndexes.h:227
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:240
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
virtual bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const
Returns true iff the routine could find two commutable operands in the given machine instruction.
virtual bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const
Return true when \P Inst has reassociable operands in the same \P MBB.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< unsigned, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
bool isPositionIndependent() const
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TypeSize getRegSizeInBits(const TargetRegisterClass &RC) const
Return the size in bits of a register from class RC.
Provide an instruction scheduling machine model to CodeGen passes.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
virtual const TargetFrameLowering * getFrameLowering() const
Target - Wrapper for Target specific information.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
static Type * getFP128Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
SlotIndex def
The index of the defining instruction.
Definition: LiveInterval.h:61
LLVM Value Representation.
Definition: Value.h:74
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool hasFP(const MachineFunction &MF) const override
hasFP - Return true if the specified function should have a dedicated frame pointer register.
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst, MachineInstr::MIFlag Flag=MachineInstr::NoFlags) const
Wraps up getting a CFI index and building a MachineInstr for it.
void getFrameIndexOperands(SmallVectorImpl< MachineOperand > &Ops, int FI) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
Check if there exists an earlier instruction that operates on the same source operands and sets eflag...
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
Overrides the isSchedulingBoundary from Codegen/TargetInstrInfo.cpp to make it capable of identifying...
MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, outliner::Candidate &C) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
void replaceBranchWithTailCall(MachineBasicBlock &MBB, SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the the global base register value.
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex=nullptr) const override
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override
Returns true iff the routine could find two commutable operands in the given machine instruction.
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override
static bool isDataInvariantLoad(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value l...
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override
bool isFunctionSafeToOutlineFrom(MachineFunction &MF, bool OutlineFromLinkOnceODRs) const override
const X86RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
Definition: X86InstrInfo.h:199
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool hasCommutePreference(MachineInstr &MI, bool &Commute) const override
Returns true if we have preference on the operands order in MI, the commute decision is returned in C...
bool hasLiveCondCodeDef(MachineInstr &MI) const
True if MI has a condition code def, e.g.
std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const override
bool canMakeTailCallConditional(SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
convertToThreeAddress - This method must be implemented by targets that set the M_CONVERTIBLE_TO_3_AD...
X86InstrInfo(X86Subtarget &STI)
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool expandPostRAPseudo(MachineInstr &MI) const override
bool isAssociativeAndCommutative(const MachineInstr &Inst, bool Invert) const override
MCInst getNop() const override
Return the noop instruction to use for a noop.
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override
This is a used by the pre-regalloc scheduler to determine (in conjunction with areLoadsFromSameBasePt...
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl< MachineInstr * > &NewMIs) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
Fold a load or store of the specified stack slot into the specified machine instruction for the speci...
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< unsigned, unsigned > &InstrIdxForVirtReg) const override
When getMachineCombinerPatterns() finds potential patterns, this function generates the instructions ...
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isStoreToStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
bool isUnconditionalTailCall(const MachineInstr &MI) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isLoadFromStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const
int getSPAdjust(const MachineInstr &MI) const override
getSPAdjust - This returns the stack pointer adjustment made by this instruction.
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
outliner::InstrType getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, unsigned Flags) const override
int getJumpTableIndex(const MachineInstr &MI) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const override
This is an architecture-specific helper function of reassociateOps.
std::pair< uint16_t, uint16_t > getExecutionDomain(const MachineInstr &MI) const override
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
isCoalescableExtInstr - Return true if the instruction is a "coalescable" extension instruction.
void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Opc, Register Reg, int FrameIdx, bool isKill=false) const
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, Register &NewSrc, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV, LiveIntervals *LIS) const
Given an operand within a MachineInstr, insert preceding code to put it into the right format for a p...
bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, TargetInstrInfo::MachineBranchPredicate &MBP, bool AllowModify=false) const override
static bool isDataInvariant(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value o...
unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before certain undef register...
void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
void buildClearRegister(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator Iter, DebugLoc &DL, bool AllowSideEffects=true) const override
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
int64_t getFrameAdjustment(const MachineInstr &I) const
Returns the stack pointer adjustment that happens inside the frame setup..destroy sequence (e....
Definition: X86InstrInfo.h:203
bool hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr &DefMI, unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const override
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override
uint16_t getExecutionDomainCustom(const MachineInstr &MI) const
bool isHighLatencyDef(int opc) const override
void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const override
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const override
foldImmediate - 'Reg' is known to be defined by a move immediate instruction, try to fold the immedia...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, const X86InstrFMA3Group &FMA3Group) const
Returns an adjusted FMA opcode that must be used in FMA instruction that performs the same computatio...
std::optional< outliner::OutlinedFunction > getOutliningCandidateInfo(std::vector< outliner::Candidate > &RepeatedSequenceLocs) const override
bool preservesZeroValueInReg(const MachineInstr *MI, const Register NullValueReg, const TargetRegisterInfo *TRI) const override
unsigned getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before a partial register upd...
MachineInstr * optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, Register &FoldAsLoadDefReg, MachineInstr *&DefMI) const override
Try to remove the load by folding it to a register operand at the use.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
unsigned getNumLocalDynamicTLSAccesses() const
bool canRealignStack(const MachineFunction &MF) const override
bool isPICStyleGOT() const
Definition: X86Subtarget.h:341
bool canUseCMOV() const
Definition: X86Subtarget.h:199
bool isTargetWin64() const
Definition: X86Subtarget.h:337
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:129
bool hasAVX512() const
Definition: X86Subtarget.h:208
bool hasSSE41() const
Definition: X86Subtarget.h:204
bool hasSSE2() const
Definition: X86Subtarget.h:201
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:139
bool hasAVX() const
Definition: X86Subtarget.h:206
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:131
bool hasAVX2() const
Definition: X86Subtarget.h:207
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:316
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1523
@ OPERAND_MEMORY
Definition: MCInstrDesc.h:62
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ X86
Windows x64, Windows Itanium (IA-64)
Reg
All possible values of the reg field in the ModR/M byte.
bool isKMergeMasked(uint64_t TSFlags)
Definition: X86BaseInfo.h:1329
@ MO_GOT_ABSOLUTE_ADDRESS
MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a relocation of: SYMBOL_LABEL + [.
Definition: X86BaseInfo.h:381
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:446
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:470
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:439
@ MO_PIC_BASE_OFFSET
MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the immediate should get the value of th...
Definition: X86BaseInfo.h:385
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:401
bool canUseApxExtendedReg(const MCInstrDesc &Desc)
Definition: X86BaseInfo.h:1270
bool isPseudo(uint64_t TSFlags)
Definition: X86BaseInfo.h:898
bool isKMasked(uint64_t TSFlags)
Definition: X86BaseInfo.h:1324
@ EVEX
EVEX - Specifies that this instruction use EVEX form which provides syntax support up to 32 512-bit r...
Definition: X86BaseInfo.h:839
@ SSEDomainShift
Execution domain for SSE instructions.
Definition: X86BaseInfo.h:825
int getMemoryOperandNo(uint64_t TSFlags)
Definition: X86BaseInfo.h:1022
unsigned getOperandBias(const MCInstrDesc &Desc)
Compute whether all of the def operands are repeated in the uses and therefore should be skipped.
Definition: X86BaseInfo.h:979
CondCode getCondFromBranch(const MachineInstr &MI)
CondCode getCondFromCFCMov(const MachineInstr &MI)
@ LAST_VALID_COND
Definition: X86BaseInfo.h:92
@ COND_E_AND_NP
Definition: X86BaseInfo.h:99
@ COND_NE_OR_P
Definition: X86BaseInfo.h:98
CondCode getCondFromMI(const MachineInstr &MI)
Return the condition code of the instruction.
int getFirstAddrOperandIdx(const MachineInstr &MI)
Return the index of the instruction's first address operand, if it has a memory reference,...
unsigned getSwappedVCMPImm(unsigned Imm)
Get the VCMP immediate if the opcodes are swapped.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
@ AddrScaleAmt
Definition: X86BaseInfo.h:30
@ AddrSegmentReg
Definition: X86BaseInfo.h:34
@ AddrIndexReg
Definition: X86BaseInfo.h:31
@ AddrNumOperands
Definition: X86BaseInfo.h:36
unsigned getSwappedVPCOMImm(unsigned Imm)
Get the VPCOM immediate if the opcodes are swapped.
bool isX87Instruction(MachineInstr &MI)
Check if the instruction is X87 instruction.
unsigned getVPCMPImmForCond(ISD::CondCode CC)
Get the VPCMP immediate for the given condition.
std::pair< CondCode, bool > getX86ConditionCode(CmpInst::Predicate Predicate)
Return a pair of condition code for the given predicate and whether the instruction operands should b...
CondCode getCondFromSETCC(const MachineInstr &MI)
unsigned getSwappedVPCMPImm(unsigned Imm)
Get the VPCMP immediate if the opcodes are swapped.
int getCondSrcNoFromDesc(const MCInstrDesc &MCID)
Return the source operand # for condition code by MCID.
const Constant * getConstantFromPool(const MachineInstr &MI, unsigned OpNo)
Find any constant pool entry associated with a specific instruction operand.
unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand=false, bool HasNDD=false)
Return a cmov opcode for the given register size in bytes, and operand type.
unsigned getVectorRegisterWidth(const MCOperandInfo &Info)
Get the width of the vector register operand.
CondCode getCondFromCMov(const MachineInstr &MI)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
InstrType
Represents how an instruction should be mapped by the outliner.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
static bool isMem(const MachineInstr &MI, unsigned Op)
Definition: X86InstrInfo.h:158
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition: Alignment.h:145
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
bool getAlign(const Function &F, unsigned index, unsigned &align)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
FunctionPass * createX86GlobalBaseRegPass()
This pass initializes a global base register for PIC on x86-32.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
unsigned getDeadRegState(bool B)
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
FunctionPass * createCleanupLocalDynamicTLSPass()
This pass combines multiple accesses to local-dynamic TLS variables so that the TLS base address for ...
const X86FoldTableEntry * lookupBroadcastFoldTable(unsigned RegOp, unsigned OpNum)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
const X86InstrFMA3Group * getFMA3Group(unsigned Opcode, uint64_t TSFlags)
Returns a reference to a group of FMA3 opcodes to where the given Opcode is included.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
const X86FoldTableEntry * lookupTwoAddrFoldTable(unsigned RegOp)
static const MachineInstrBuilder & addRegReg(const MachineInstrBuilder &MIB, unsigned Reg1, bool isKill1, unsigned Reg2, bool isKill2)
addRegReg - This function is used to add a memory reference of the form: [Reg + Reg].
static const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset)
unsigned getUndefRegState(bool B)
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
unsigned getDefRegState(bool B)
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
@ DPWSSD
Definition: X86InstrInfo.h:32
const X86FoldTableEntry * lookupUnfoldTable(unsigned MemOp)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool matchBroadcastSize(const X86FoldTableEntry &Entry, unsigned BroadcastBits)
const X86FoldTableEntry * lookupFoldTable(unsigned RegOp, unsigned OpNum)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Description of the encoding of one expression Op.
Extended Value Type.
Definition: ValueTypes.h:34
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
This represents a simple continuous liveness interval for a value.
Definition: LiveInterval.h:162
std::vector< MachineInstr * > Kills
Kills - List of MachineInstruction's which are the last use of this virtual register (kill it) in the...
Definition: LiveVariables.h:90
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
X86AddressMode - This struct holds a generalized full x86 address mode.
This class is used to group {132, 213, 231} forms of FMA opcodes together.
unsigned get213Opcode() const
Returns the 213 form of FMA opcode.
unsigned get231Opcode() const
Returns the 231 form of FMA opcode.
bool isIntrinsic() const
Returns true iff the group of FMA opcodes holds intrinsic opcodes.
unsigned get132Opcode() const
Returns the 132 form of FMA opcode.
An individual sequence of instructions to be replaced with a call to an outlined function.
The information necessary to create an outlined function for some class of candidate.
unsigned FrameConstructionID
Target-defined identifier for constructing a frame for this function.