X86InstrInfo.cpp
1//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the X86 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "X86InstrInfo.h"
14#include "X86.h"
15#include "X86InstrBuilder.h"
16#include "X86InstrFoldTables.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/Sequence.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Module.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCExpr.h"
41#include "llvm/MC/MCInst.h"
43#include "llvm/Support/Debug.h"
47#include <atomic>
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "x86-instr-info"
53
54#define GET_INSTRINFO_CTOR_DTOR
55#include "X86GenInstrInfo.inc"
56
58
59static cl::opt<bool>
60 NoFusing("disable-spill-fusing",
61 cl::desc("Disable fusing of spill code into instructions"),
63static cl::opt<bool>
64 PrintFailedFusing("print-failed-fuse-candidates",
65 cl::desc("Print instructions that the allocator wants to"
66 " fuse, but the X86 backend currently can't"),
68static cl::opt<bool>
69 ReMatPICStubLoad("remat-pic-stub-load",
70 cl::desc("Re-materialize load from stub in PIC mode"),
71 cl::init(false), cl::Hidden);
72static cl::opt<unsigned>
73 PartialRegUpdateClearance("partial-reg-update-clearance",
74 cl::desc("Clearance between two register writes "
75 "for inserting XOR to avoid partial "
76 "register update"),
77 cl::init(64), cl::Hidden);
79 "undef-reg-clearance",
80 cl::desc("How many idle instructions we would like before "
81 "certain undef register reads"),
82 cl::init(128), cl::Hidden);
83
84// Pin the vtable to this file.
85void X86InstrInfo::anchor() {}
86
87X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
88 : X86GenInstrInfo(STI,
89 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
90 : X86::ADJCALLSTACKDOWN32),
91 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
92 : X86::ADJCALLSTACKUP32),
93 X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
94 Subtarget(STI), RI(STI.getTargetTriple()) {}
95
96const TargetRegisterClass *
97X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
98 const TargetRegisterInfo *TRI) const {
99 auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI);
100 // If the target does not have EGPR, then R16-R31 will be reserved for all
101 // instructions.
102 if (!RC || !Subtarget.hasEGPR())
103 return RC;
104
105 if (X86II::canUseApxExtendedReg(MCID))
106 return RC;
107
108 const X86RegisterInfo *RI = Subtarget.getRegisterInfo();
109 return RI->constrainRegClassToNonRex2(RC);
110}
111
113 Register &SrcReg, Register &DstReg,
114 unsigned &SubIdx) const {
115 switch (MI.getOpcode()) {
116 default:
117 break;
118 case X86::MOVSX16rr8:
119 case X86::MOVZX16rr8:
120 case X86::MOVSX32rr8:
121 case X86::MOVZX32rr8:
122 case X86::MOVSX64rr8:
123 if (!Subtarget.is64Bit())
124 // It's not always legal to reference the low 8 bits of the larger
125 // register in 32-bit mode.
126 return false;
127 [[fallthrough]];
128 case X86::MOVSX32rr16:
129 case X86::MOVZX32rr16:
130 case X86::MOVSX64rr16:
131 case X86::MOVSX64rr32: {
132 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
133 // Be conservative.
134 return false;
135 SrcReg = MI.getOperand(1).getReg();
136 DstReg = MI.getOperand(0).getReg();
137 switch (MI.getOpcode()) {
138 default:
139 llvm_unreachable("Unreachable!");
140 case X86::MOVSX16rr8:
141 case X86::MOVZX16rr8:
142 case X86::MOVSX32rr8:
143 case X86::MOVZX32rr8:
144 case X86::MOVSX64rr8:
145 SubIdx = X86::sub_8bit;
146 break;
147 case X86::MOVSX32rr16:
148 case X86::MOVZX32rr16:
149 case X86::MOVSX64rr16:
150 SubIdx = X86::sub_16bit;
151 break;
152 case X86::MOVSX64rr32:
153 SubIdx = X86::sub_32bit;
154 break;
155 }
156 return true;
157 }
158 }
159 return false;
160}
161
162bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
163 if (MI.mayLoad() || MI.mayStore())
164 return false;
165
166 // Some target-independent operations that trivially lower to data-invariant
167 // instructions.
168 if (MI.isCopyLike() || MI.isInsertSubreg())
169 return true;
170
171 unsigned Opcode = MI.getOpcode();
172 using namespace X86;
173 // On x86 it is believed that imul is constant time w.r.t. its inputs.
174 // However, it sets flags and is perhaps the most surprisingly constant-time
175 // operation, so we call it out here separately.
176 if (isIMUL(Opcode))
177 return true;
178 // Bit scanning and counting instructions that are somewhat surprisingly
179 // constant time as they scan across bits and do other fairly complex
180 // operations like popcnt, but are believed to be constant time on x86.
181 // However, these set flags.
182 if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
183 isTZCNT(Opcode))
184 return true;
185 // Bit manipulation instructions are effectively combinations of basic
186 // arithmetic ops, and should still execute in constant time. These also
187 // set flags.
188 if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
189 isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
190 isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
191 isTZMSK(Opcode))
192 return true;
193 // Bit extracting and clearing instructions should execute in constant time,
194 // and set flags.
195 if (isBEXTR(Opcode) || isBZHI(Opcode))
196 return true;
197 // Shift and rotate.
198 if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
199 isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
200 return true;
201 // Basic arithmetic is constant time on the input but does set flags.
202 if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
203 isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
204 return true;
205 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
206 if (isANDN(Opcode))
207 return true;
208 // Unary arithmetic operations.
209 if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
210 return true;
211 // Unlike other arithmetic, NOT doesn't set EFLAGS.
212 if (isNOT(Opcode))
213 return true;
214 // Various move instructions used to zero or sign extend things. Note that we
215 // intentionally don't support the _NOREX variants as we can't handle that
216 // register constraint anyway.
217 if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
218 return true;
219 // Arithmetic instructions that are both constant time and don't set flags.
220 if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
221 return true;
222 // LEA doesn't actually access memory, and its arithmetic is constant time.
223 if (isLEA(Opcode))
224 return true;
225 // By default, assume that the instruction is not data invariant.
226 return false;
227}
228
229bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
230 switch (MI.getOpcode()) {
231 default:
232 // By default, assume that the load will immediately leak.
233 return false;
234
235 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
236 // However, they set flags and are perhaps the most surprisingly constant
237 // time operations so we call them out here separately.
238 case X86::IMUL16rm:
239 case X86::IMUL16rmi:
240 case X86::IMUL32rm:
241 case X86::IMUL32rmi:
242 case X86::IMUL64rm:
243 case X86::IMUL64rmi32:
244
245 // Bit scanning and counting instructions that are somewhat surprisingly
246 // constant time as they scan across bits and do other fairly complex
247 // operations like popcnt, but are believed to be constant time on x86.
248 // However, these set flags.
249 case X86::BSF16rm:
250 case X86::BSF32rm:
251 case X86::BSF64rm:
252 case X86::BSR16rm:
253 case X86::BSR32rm:
254 case X86::BSR64rm:
255 case X86::LZCNT16rm:
256 case X86::LZCNT32rm:
257 case X86::LZCNT64rm:
258 case X86::POPCNT16rm:
259 case X86::POPCNT32rm:
260 case X86::POPCNT64rm:
261 case X86::TZCNT16rm:
262 case X86::TZCNT32rm:
263 case X86::TZCNT64rm:
264
265 // Bit manipulation instructions are effectively combinations of basic
266 // arithmetic ops, and should still execute in constant time. These also
267 // set flags.
268 case X86::BLCFILL32rm:
269 case X86::BLCFILL64rm:
270 case X86::BLCI32rm:
271 case X86::BLCI64rm:
272 case X86::BLCIC32rm:
273 case X86::BLCIC64rm:
274 case X86::BLCMSK32rm:
275 case X86::BLCMSK64rm:
276 case X86::BLCS32rm:
277 case X86::BLCS64rm:
278 case X86::BLSFILL32rm:
279 case X86::BLSFILL64rm:
280 case X86::BLSI32rm:
281 case X86::BLSI64rm:
282 case X86::BLSIC32rm:
283 case X86::BLSIC64rm:
284 case X86::BLSMSK32rm:
285 case X86::BLSMSK64rm:
286 case X86::BLSR32rm:
287 case X86::BLSR64rm:
288 case X86::TZMSK32rm:
289 case X86::TZMSK64rm:
290
291 // Bit extracting and clearing instructions should execute in constant time,
292 // and set flags.
293 case X86::BEXTR32rm:
294 case X86::BEXTR64rm:
295 case X86::BEXTRI32mi:
296 case X86::BEXTRI64mi:
297 case X86::BZHI32rm:
298 case X86::BZHI64rm:
299
300 // Basic arithmetic is constant time on the input but does set flags.
301 case X86::ADC8rm:
302 case X86::ADC16rm:
303 case X86::ADC32rm:
304 case X86::ADC64rm:
305 case X86::ADD8rm:
306 case X86::ADD16rm:
307 case X86::ADD32rm:
308 case X86::ADD64rm:
309 case X86::AND8rm:
310 case X86::AND16rm:
311 case X86::AND32rm:
312 case X86::AND64rm:
313 case X86::ANDN32rm:
314 case X86::ANDN64rm:
315 case X86::OR8rm:
316 case X86::OR16rm:
317 case X86::OR32rm:
318 case X86::OR64rm:
319 case X86::SBB8rm:
320 case X86::SBB16rm:
321 case X86::SBB32rm:
322 case X86::SBB64rm:
323 case X86::SUB8rm:
324 case X86::SUB16rm:
325 case X86::SUB32rm:
326 case X86::SUB64rm:
327 case X86::XOR8rm:
328 case X86::XOR16rm:
329 case X86::XOR32rm:
330 case X86::XOR64rm:
331
332 // Integer multiply w/o affecting flags is still believed to be constant
333 // time on x86. Called out separately as this is among the most surprising
334 // instructions to exhibit that behavior.
335 case X86::MULX32rm:
336 case X86::MULX64rm:
337
338 // Arithmetic instructions that are both constant time and don't set flags.
339 case X86::RORX32mi:
340 case X86::RORX64mi:
341 case X86::SARX32rm:
342 case X86::SARX64rm:
343 case X86::SHLX32rm:
344 case X86::SHLX64rm:
345 case X86::SHRX32rm:
346 case X86::SHRX64rm:
347
348 // Conversions are believed to be constant time and don't set flags.
349 case X86::CVTTSD2SI64rm:
350 case X86::VCVTTSD2SI64rm:
351 case X86::VCVTTSD2SI64Zrm:
352 case X86::CVTTSD2SIrm:
353 case X86::VCVTTSD2SIrm:
354 case X86::VCVTTSD2SIZrm:
355 case X86::CVTTSS2SI64rm:
356 case X86::VCVTTSS2SI64rm:
357 case X86::VCVTTSS2SI64Zrm:
358 case X86::CVTTSS2SIrm:
359 case X86::VCVTTSS2SIrm:
360 case X86::VCVTTSS2SIZrm:
361 case X86::CVTSI2SDrm:
362 case X86::VCVTSI2SDrm:
363 case X86::VCVTSI2SDZrm:
364 case X86::CVTSI2SSrm:
365 case X86::VCVTSI2SSrm:
366 case X86::VCVTSI2SSZrm:
367 case X86::CVTSI642SDrm:
368 case X86::VCVTSI642SDrm:
369 case X86::VCVTSI642SDZrm:
370 case X86::CVTSI642SSrm:
371 case X86::VCVTSI642SSrm:
372 case X86::VCVTSI642SSZrm:
373 case X86::CVTSS2SDrm:
374 case X86::VCVTSS2SDrm:
375 case X86::VCVTSS2SDZrm:
376 case X86::CVTSD2SSrm:
377 case X86::VCVTSD2SSrm:
378 case X86::VCVTSD2SSZrm:
379 // AVX512 added unsigned integer conversions.
380 case X86::VCVTTSD2USI64Zrm:
381 case X86::VCVTTSD2USIZrm:
382 case X86::VCVTTSS2USI64Zrm:
383 case X86::VCVTTSS2USIZrm:
384 case X86::VCVTUSI2SDZrm:
385 case X86::VCVTUSI642SDZrm:
386 case X86::VCVTUSI2SSZrm:
387 case X86::VCVTUSI642SSZrm:
388
389 // Loads to register don't set flags.
390 case X86::MOV8rm:
391 case X86::MOV8rm_NOREX:
392 case X86::MOV16rm:
393 case X86::MOV32rm:
394 case X86::MOV64rm:
395 case X86::MOVSX16rm8:
396 case X86::MOVSX32rm16:
397 case X86::MOVSX32rm8:
398 case X86::MOVSX32rm8_NOREX:
399 case X86::MOVSX64rm16:
400 case X86::MOVSX64rm32:
401 case X86::MOVSX64rm8:
402 case X86::MOVZX16rm8:
403 case X86::MOVZX32rm16:
404 case X86::MOVZX32rm8:
405 case X86::MOVZX32rm8_NOREX:
406 case X86::MOVZX64rm16:
407 case X86::MOVZX64rm8:
408 return true;
409 }
410}
411
412int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
413 const MachineFunction *MF = MI.getParent()->getParent();
414 const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
415
416 if (isFrameInstr(MI)) {
417 int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
418 SPAdj -= getFrameAdjustment(MI);
419 if (!isFrameSetup(MI))
420 SPAdj = -SPAdj;
421 return SPAdj;
422 }
423
424 // To know whether a call adjusts the stack, we need information
425 // that is bound to the following ADJCALLSTACKUP pseudo.
426 // Look for the next ADJCALLSTACKUP that follows the call.
427 if (MI.isCall()) {
428 const MachineBasicBlock *MBB = MI.getParent();
429 auto I = ++MachineBasicBlock::const_iterator(MI);
430 for (auto E = MBB->end(); I != E; ++I) {
431 if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
432 break;
433 }
434
435 // If we could not find a frame destroy opcode, then it has already
436 // been simplified, so we don't care.
437 if (I->getOpcode() != getCallFrameDestroyOpcode())
438 return 0;
439
440 return -(I->getOperand(1).getImm());
441 }
442
443 // Currently handle only PUSHes we can reasonably expect to see
444 // in call sequences.
445 switch (MI.getOpcode()) {
446 default:
447 return 0;
448 case X86::PUSH32r:
449 case X86::PUSH32rmm:
450 case X86::PUSH32rmr:
451 case X86::PUSH32i:
452 return 4;
453 case X86::PUSH64r:
454 case X86::PUSH64rmm:
455 case X86::PUSH64rmr:
456 case X86::PUSH64i32:
457 return 8;
458 }
459}
460
461/// Return true and the FrameIndex if the specified
462/// operand and following operands form a reference to the stack frame.
463bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
464 int &FrameIndex) const {
465 if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
466 MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
467 MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
468 MI.getOperand(Op + X86::AddrDisp).isImm() &&
469 MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
470 MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
471 MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
472 FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
473 return true;
474 }
475 return false;
476}
477
478static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
479 switch (Opcode) {
480 default:
481 return false;
482 case X86::MOV8rm:
483 case X86::KMOVBkm:
484 case X86::KMOVBkm_EVEX:
485 MemBytes = TypeSize::getFixed(1);
486 return true;
487 case X86::MOV16rm:
488 case X86::KMOVWkm:
489 case X86::KMOVWkm_EVEX:
490 case X86::VMOVSHZrm:
491 case X86::VMOVSHZrm_alt:
492 MemBytes = TypeSize::getFixed(2);
493 return true;
494 case X86::MOV32rm:
495 case X86::MOVSSrm:
496 case X86::MOVSSrm_alt:
497 case X86::VMOVSSrm:
498 case X86::VMOVSSrm_alt:
499 case X86::VMOVSSZrm:
500 case X86::VMOVSSZrm_alt:
501 case X86::KMOVDkm:
502 case X86::KMOVDkm_EVEX:
503 MemBytes = TypeSize::getFixed(4);
504 return true;
505 case X86::MOV64rm:
506 case X86::LD_Fp64m:
507 case X86::MOVSDrm:
508 case X86::MOVSDrm_alt:
509 case X86::VMOVSDrm:
510 case X86::VMOVSDrm_alt:
511 case X86::VMOVSDZrm:
512 case X86::VMOVSDZrm_alt:
513 case X86::MMX_MOVD64rm:
514 case X86::MMX_MOVQ64rm:
515 case X86::KMOVQkm:
516 case X86::KMOVQkm_EVEX:
517 MemBytes = TypeSize::getFixed(8);
518 return true;
519 case X86::MOVAPSrm:
520 case X86::MOVUPSrm:
521 case X86::MOVAPDrm:
522 case X86::MOVUPDrm:
523 case X86::MOVDQArm:
524 case X86::MOVDQUrm:
525 case X86::VMOVAPSrm:
526 case X86::VMOVUPSrm:
527 case X86::VMOVAPDrm:
528 case X86::VMOVUPDrm:
529 case X86::VMOVDQArm:
530 case X86::VMOVDQUrm:
531 case X86::VMOVAPSZ128rm:
532 case X86::VMOVUPSZ128rm:
533 case X86::VMOVAPSZ128rm_NOVLX:
534 case X86::VMOVUPSZ128rm_NOVLX:
535 case X86::VMOVAPDZ128rm:
536 case X86::VMOVUPDZ128rm:
537 case X86::VMOVDQU8Z128rm:
538 case X86::VMOVDQU16Z128rm:
539 case X86::VMOVDQA32Z128rm:
540 case X86::VMOVDQU32Z128rm:
541 case X86::VMOVDQA64Z128rm:
542 case X86::VMOVDQU64Z128rm:
543 MemBytes = TypeSize::getFixed(16);
544 return true;
545 case X86::VMOVAPSYrm:
546 case X86::VMOVUPSYrm:
547 case X86::VMOVAPDYrm:
548 case X86::VMOVUPDYrm:
549 case X86::VMOVDQAYrm:
550 case X86::VMOVDQUYrm:
551 case X86::VMOVAPSZ256rm:
552 case X86::VMOVUPSZ256rm:
553 case X86::VMOVAPSZ256rm_NOVLX:
554 case X86::VMOVUPSZ256rm_NOVLX:
555 case X86::VMOVAPDZ256rm:
556 case X86::VMOVUPDZ256rm:
557 case X86::VMOVDQU8Z256rm:
558 case X86::VMOVDQU16Z256rm:
559 case X86::VMOVDQA32Z256rm:
560 case X86::VMOVDQU32Z256rm:
561 case X86::VMOVDQA64Z256rm:
562 case X86::VMOVDQU64Z256rm:
563 MemBytes = TypeSize::getFixed(32);
564 return true;
565 case X86::VMOVAPSZrm:
566 case X86::VMOVUPSZrm:
567 case X86::VMOVAPDZrm:
568 case X86::VMOVUPDZrm:
569 case X86::VMOVDQU8Zrm:
570 case X86::VMOVDQU16Zrm:
571 case X86::VMOVDQA32Zrm:
572 case X86::VMOVDQU32Zrm:
573 case X86::VMOVDQA64Zrm:
574 case X86::VMOVDQU64Zrm:
575 MemBytes = TypeSize::getFixed(64);
576 return true;
577 }
578}
579
580static bool isFrameStoreOpcode(int Opcode, TypeSize &MemBytes) {
581 switch (Opcode) {
582 default:
583 return false;
584 case X86::MOV8mr:
585 case X86::KMOVBmk:
586 case X86::KMOVBmk_EVEX:
587 MemBytes = TypeSize::getFixed(1);
588 return true;
589 case X86::MOV16mr:
590 case X86::KMOVWmk:
591 case X86::KMOVWmk_EVEX:
592 case X86::VMOVSHZmr:
593 MemBytes = TypeSize::getFixed(2);
594 return true;
595 case X86::MOV32mr:
596 case X86::MOVSSmr:
597 case X86::VMOVSSmr:
598 case X86::VMOVSSZmr:
599 case X86::KMOVDmk:
600 case X86::KMOVDmk_EVEX:
601 MemBytes = TypeSize::getFixed(4);
602 return true;
603 case X86::MOV64mr:
604 case X86::ST_FpP64m:
605 case X86::MOVSDmr:
606 case X86::VMOVSDmr:
607 case X86::VMOVSDZmr:
608 case X86::MMX_MOVD64mr:
609 case X86::MMX_MOVQ64mr:
610 case X86::MMX_MOVNTQmr:
611 case X86::KMOVQmk:
612 case X86::KMOVQmk_EVEX:
613 MemBytes = TypeSize::getFixed(8);
614 return true;
615 case X86::MOVAPSmr:
616 case X86::MOVUPSmr:
617 case X86::MOVAPDmr:
618 case X86::MOVUPDmr:
619 case X86::MOVDQAmr:
620 case X86::MOVDQUmr:
621 case X86::VMOVAPSmr:
622 case X86::VMOVUPSmr:
623 case X86::VMOVAPDmr:
624 case X86::VMOVUPDmr:
625 case X86::VMOVDQAmr:
626 case X86::VMOVDQUmr:
627 case X86::VMOVUPSZ128mr:
628 case X86::VMOVAPSZ128mr:
629 case X86::VMOVUPSZ128mr_NOVLX:
630 case X86::VMOVAPSZ128mr_NOVLX:
631 case X86::VMOVUPDZ128mr:
632 case X86::VMOVAPDZ128mr:
633 case X86::VMOVDQA32Z128mr:
634 case X86::VMOVDQU32Z128mr:
635 case X86::VMOVDQA64Z128mr:
636 case X86::VMOVDQU64Z128mr:
637 case X86::VMOVDQU8Z128mr:
638 case X86::VMOVDQU16Z128mr:
639 MemBytes = TypeSize::getFixed(16);
640 return true;
641 case X86::VMOVUPSYmr:
642 case X86::VMOVAPSYmr:
643 case X86::VMOVUPDYmr:
644 case X86::VMOVAPDYmr:
645 case X86::VMOVDQUYmr:
646 case X86::VMOVDQAYmr:
647 case X86::VMOVUPSZ256mr:
648 case X86::VMOVAPSZ256mr:
649 case X86::VMOVUPSZ256mr_NOVLX:
650 case X86::VMOVAPSZ256mr_NOVLX:
651 case X86::VMOVUPDZ256mr:
652 case X86::VMOVAPDZ256mr:
653 case X86::VMOVDQU8Z256mr:
654 case X86::VMOVDQU16Z256mr:
655 case X86::VMOVDQA32Z256mr:
656 case X86::VMOVDQU32Z256mr:
657 case X86::VMOVDQA64Z256mr:
658 case X86::VMOVDQU64Z256mr:
659 MemBytes = TypeSize::getFixed(32);
660 return true;
661 case X86::VMOVUPSZmr:
662 case X86::VMOVAPSZmr:
663 case X86::VMOVUPDZmr:
664 case X86::VMOVAPDZmr:
665 case X86::VMOVDQU8Zmr:
666 case X86::VMOVDQU16Zmr:
667 case X86::VMOVDQA32Zmr:
668 case X86::VMOVDQU32Zmr:
669 case X86::VMOVDQA64Zmr:
670 case X86::VMOVDQU64Zmr:
671 MemBytes = TypeSize::getFixed(64);
672 return true;
673 }
674 return false;
675}
676
678 int &FrameIndex) const {
679 TypeSize Dummy = TypeSize::getZero();
680 return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
681}
682
684 int &FrameIndex,
685 TypeSize &MemBytes) const {
686 if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
687 if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
688 return MI.getOperand(0).getReg();
689 return Register();
690}
691
693 int &FrameIndex) const {
694 TypeSize Dummy = TypeSize::getZero();
695 if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
696 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
697 return Reg;
698 // Check for post-frame index elimination operations
699 SmallVector<const MachineMemOperand *, 1> Accesses;
700 if (hasLoadFromStackSlot(MI, Accesses)) {
701 FrameIndex =
702 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
703 ->getFrameIndex();
704 return MI.getOperand(0).getReg();
705 }
706 }
707 return Register();
708}
709
711 int &FrameIndex) const {
712 TypeSize Dummy = TypeSize::getZero();
713 return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
714}
715
717 int &FrameIndex,
718 TypeSize &MemBytes) const {
719 if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
720 if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
721 isFrameOperand(MI, 0, FrameIndex))
722 return MI.getOperand(X86::AddrNumOperands).getReg();
723 return Register();
724}
725
727 int &FrameIndex) const {
728 TypeSize Dummy = TypeSize::getZero();
729 if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
730 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
731 return Reg;
732 // Check for post-frame index elimination operations
733 SmallVector<const MachineMemOperand *, 1> Accesses;
734 if (hasStoreToStackSlot(MI, Accesses)) {
735 FrameIndex =
736 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
737 ->getFrameIndex();
738 return MI.getOperand(X86::AddrNumOperands).getReg();
739 }
740 }
741 return Register();
742}
743
744/// Return true if register is PIC base, i.e. defined by X86::MOVPC32r.
745static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
746 // Don't waste compile time scanning use-def chains of physregs.
747 if (!BaseReg.isVirtual())
748 return false;
749 bool isPICBase = false;
750 for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) {
751 if (DefMI.getOpcode() != X86::MOVPC32r)
752 return false;
753 assert(!isPICBase && "More than one PIC base?");
754 isPICBase = true;
755 }
756 return isPICBase;
757}
758
759bool X86InstrInfo::isReallyTriviallyReMaterializable(
760 const MachineInstr &MI) const {
761 switch (MI.getOpcode()) {
762 default:
763 // This function should only be called for opcodes with the ReMaterializable
764 // flag set.
765 llvm_unreachable("Unknown rematerializable operation!");
766 break;
767 case X86::IMPLICIT_DEF:
768 // Defer to generic logic.
769 break;
770 case X86::LOAD_STACK_GUARD:
771 case X86::LD_Fp032:
772 case X86::LD_Fp064:
773 case X86::LD_Fp080:
774 case X86::LD_Fp132:
775 case X86::LD_Fp164:
776 case X86::LD_Fp180:
777 case X86::AVX1_SETALLONES:
778 case X86::AVX2_SETALLONES:
779 case X86::AVX512_128_SET0:
780 case X86::AVX512_256_SET0:
781 case X86::AVX512_512_SET0:
782 case X86::AVX512_512_SETALLONES:
783 case X86::AVX512_FsFLD0SD:
784 case X86::AVX512_FsFLD0SH:
785 case X86::AVX512_FsFLD0SS:
786 case X86::AVX512_FsFLD0F128:
787 case X86::AVX_SET0:
788 case X86::FsFLD0SD:
789 case X86::FsFLD0SS:
790 case X86::FsFLD0SH:
791 case X86::FsFLD0F128:
792 case X86::KSET0D:
793 case X86::KSET0Q:
794 case X86::KSET0W:
795 case X86::KSET1D:
796 case X86::KSET1Q:
797 case X86::KSET1W:
798 case X86::MMX_SET0:
799 case X86::MOV32ImmSExti8:
800 case X86::MOV32r0:
801 case X86::MOV32r1:
802 case X86::MOV32r_1:
803 case X86::MOV32ri64:
804 case X86::MOV64ImmSExti8:
805 case X86::V_SET0:
806 case X86::V_SETALLONES:
807 case X86::MOV16ri:
808 case X86::MOV32ri:
809 case X86::MOV64ri:
810 case X86::MOV64ri32:
811 case X86::MOV8ri:
812 case X86::PTILEZEROV:
813 return true;
814
815 case X86::MOV8rm:
816 case X86::MOV8rm_NOREX:
817 case X86::MOV16rm:
818 case X86::MOV32rm:
819 case X86::MOV64rm:
820 case X86::MOVSSrm:
821 case X86::MOVSSrm_alt:
822 case X86::MOVSDrm:
823 case X86::MOVSDrm_alt:
824 case X86::MOVAPSrm:
825 case X86::MOVUPSrm:
826 case X86::MOVAPDrm:
827 case X86::MOVUPDrm:
828 case X86::MOVDQArm:
829 case X86::MOVDQUrm:
830 case X86::VMOVSSrm:
831 case X86::VMOVSSrm_alt:
832 case X86::VMOVSDrm:
833 case X86::VMOVSDrm_alt:
834 case X86::VMOVAPSrm:
835 case X86::VMOVUPSrm:
836 case X86::VMOVAPDrm:
837 case X86::VMOVUPDrm:
838 case X86::VMOVDQArm:
839 case X86::VMOVDQUrm:
840 case X86::VMOVAPSYrm:
841 case X86::VMOVUPSYrm:
842 case X86::VMOVAPDYrm:
843 case X86::VMOVUPDYrm:
844 case X86::VMOVDQAYrm:
845 case X86::VMOVDQUYrm:
846 case X86::MMX_MOVD64rm:
847 case X86::MMX_MOVQ64rm:
848 case X86::VBROADCASTSSrm:
849 case X86::VBROADCASTSSYrm:
850 case X86::VBROADCASTSDYrm:
851 // AVX-512
852 case X86::VPBROADCASTBZ128rm:
853 case X86::VPBROADCASTBZ256rm:
854 case X86::VPBROADCASTBZrm:
855 case X86::VBROADCASTF32X2Z256rm:
856 case X86::VBROADCASTF32X2Zrm:
857 case X86::VBROADCASTI32X2Z128rm:
858 case X86::VBROADCASTI32X2Z256rm:
859 case X86::VBROADCASTI32X2Zrm:
860 case X86::VPBROADCASTWZ128rm:
861 case X86::VPBROADCASTWZ256rm:
862 case X86::VPBROADCASTWZrm:
863 case X86::VPBROADCASTDZ128rm:
864 case X86::VPBROADCASTDZ256rm:
865 case X86::VPBROADCASTDZrm:
866 case X86::VBROADCASTSSZ128rm:
867 case X86::VBROADCASTSSZ256rm:
868 case X86::VBROADCASTSSZrm:
869 case X86::VPBROADCASTQZ128rm:
870 case X86::VPBROADCASTQZ256rm:
871 case X86::VPBROADCASTQZrm:
872 case X86::VBROADCASTSDZ256rm:
873 case X86::VBROADCASTSDZrm:
874 case X86::VMOVSSZrm:
875 case X86::VMOVSSZrm_alt:
876 case X86::VMOVSDZrm:
877 case X86::VMOVSDZrm_alt:
878 case X86::VMOVSHZrm:
879 case X86::VMOVSHZrm_alt:
880 case X86::VMOVAPDZ128rm:
881 case X86::VMOVAPDZ256rm:
882 case X86::VMOVAPDZrm:
883 case X86::VMOVAPSZ128rm:
884 case X86::VMOVAPSZ256rm:
885 case X86::VMOVAPSZ128rm_NOVLX:
886 case X86::VMOVAPSZ256rm_NOVLX:
887 case X86::VMOVAPSZrm:
888 case X86::VMOVDQA32Z128rm:
889 case X86::VMOVDQA32Z256rm:
890 case X86::VMOVDQA32Zrm:
891 case X86::VMOVDQA64Z128rm:
892 case X86::VMOVDQA64Z256rm:
893 case X86::VMOVDQA64Zrm:
894 case X86::VMOVDQU16Z128rm:
895 case X86::VMOVDQU16Z256rm:
896 case X86::VMOVDQU16Zrm:
897 case X86::VMOVDQU32Z128rm:
898 case X86::VMOVDQU32Z256rm:
899 case X86::VMOVDQU32Zrm:
900 case X86::VMOVDQU64Z128rm:
901 case X86::VMOVDQU64Z256rm:
902 case X86::VMOVDQU64Zrm:
903 case X86::VMOVDQU8Z128rm:
904 case X86::VMOVDQU8Z256rm:
905 case X86::VMOVDQU8Zrm:
906 case X86::VMOVUPDZ128rm:
907 case X86::VMOVUPDZ256rm:
908 case X86::VMOVUPDZrm:
909 case X86::VMOVUPSZ128rm:
910 case X86::VMOVUPSZ256rm:
911 case X86::VMOVUPSZ128rm_NOVLX:
912 case X86::VMOVUPSZ256rm_NOVLX:
913 case X86::VMOVUPSZrm: {
914 // Loads from constant pools are trivially rematerializable.
915 if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
916 MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
917 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
918 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
919 MI.isDereferenceableInvariantLoad()) {
920 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
921 if (BaseReg == 0 || BaseReg == X86::RIP)
922 return true;
923 // Allow re-materialization of PIC load.
924 if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) {
925 const MachineFunction &MF = *MI.getParent()->getParent();
926 const MachineRegisterInfo &MRI = MF.getRegInfo();
927 if (regIsPICBase(BaseReg, MRI))
928 return true;
929 }
930 }
931 break;
932 }
933
934 case X86::LEA32r:
935 case X86::LEA64r: {
936 if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
937 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
938 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
939 !MI.getOperand(1 + X86::AddrDisp).isReg()) {
940 // lea fi#, lea GV, etc. are all rematerializable.
941 if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
942 return true;
943 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
944 if (BaseReg == 0)
945 return true;
946 // Allow re-materialization of lea PICBase + x.
947 const MachineFunction &MF = *MI.getParent()->getParent();
948 const MachineRegisterInfo &MRI = MF.getRegInfo();
949 if (regIsPICBase(BaseReg, MRI))
950 return true;
951 }
952 break;
953 }
954 }
955 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
956}
957
958void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
959 MachineBasicBlock::iterator I,
960 Register DestReg, unsigned SubIdx,
961 const MachineInstr &Orig,
962 const TargetRegisterInfo &TRI) const {
963 bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
964 if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
965 MachineBasicBlock::LQR_Dead) {
966 // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
967 // effects.
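    // (MOV32r0, for instance, is normally expanded to "xor reg, reg", which
    // would clobber the live flags; MOV32ri encodes the constant directly and
    // leaves EFLAGS untouched.)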
968 int Value;
969 switch (Orig.getOpcode()) {
970 case X86::MOV32r0:
971 Value = 0;
972 break;
973 case X86::MOV32r1:
974 Value = 1;
975 break;
976 case X86::MOV32r_1:
977 Value = -1;
978 break;
979 default:
980 llvm_unreachable("Unexpected instruction!");
981 }
982
983 const DebugLoc &DL = Orig.getDebugLoc();
984 BuildMI(MBB, I, DL, get(X86::MOV32ri))
985 .add(Orig.getOperand(0))
986 .addImm(Value);
987 } else {
988 MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
989 MBB.insert(I, MI);
990 }
991
992 MachineInstr &NewMI = *std::prev(I);
993 NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
994}
995
996/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
997static bool hasLiveCondCodeDef(MachineInstr &MI) {
998 for (const MachineOperand &MO : MI.operands()) {
999 if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
1000 !MO.isDead()) {
1001 return true;
1002 }
1003 }
1004 return false;
1005}
1006
1007/// Return the shift count for a machine operand, truncated as the hardware does.
1008inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1009 unsigned ShiftAmtOperandIdx) {
1010 // The shift count is six bits with the REX.W prefix and five bits without.
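  // For example, an immediate of 65 on a 32-bit shift is truncated to 1,
  // matching what the hardware does with the count.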
1011 unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1012 unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1013 return Imm & ShiftCountMask;
1014}
1015
1016/// Check whether the given shift count can be represented
1017/// by a LEA instruction.
1018inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
1019 // Left shift instructions can be transformed into load-effective-address
1020 // instructions if we can encode them appropriately.
1021 // A LEA instruction utilizes a SIB byte to encode its scale factor.
1022 // The SIB.scale field is two bits wide which means that we can encode any
1023 // shift amount less than 4.
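  // For example, "shl $3" maps to an LEA with scale 8 (1 << 3); a shift of 4
  // would need scale 16, which the SIB byte cannot encode.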
1024 return ShAmt < 4 && ShAmt > 0;
1025}
1026
1027static bool
1028findRedundantFlagInstr(MachineInstr &CmpInstr, MachineInstr &CmpValDefInstr,
1029 const MachineRegisterInfo *MRI, MachineInstr **AndInstr,
1030 const TargetRegisterInfo *TRI, const X86Subtarget &ST,
1031 bool &NoSignFlag, bool &ClearsOverflowFlag) {
1032 if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG &&
1033 CmpInstr.getOpcode() == X86::TEST64rr) &&
1034 !(CmpValDefInstr.getOpcode() == X86::COPY &&
1035 CmpInstr.getOpcode() == X86::TEST16rr))
1036 return false;
1037
1038 // CmpInstr is a TEST16rr/TEST64rr instruction, and
1039 // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if two
1040 // registers are identical.
1041 assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
1042 "CmpInstr is an analyzable TEST16rr/TEST64rr, and "
1043 "`X86InstrInfo::analyzeCompare` requires two reg operands are the"
1044 "same.");
1045
1046 // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
1047 // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
1048 // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
1049 // redundant.
1050 assert(
1051 (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
1052 "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr "
1053 "is a user of COPY sub16bit.");
1054 MachineInstr *VregDefInstr = nullptr;
1055 if (CmpInstr.getOpcode() == X86::TEST16rr) {
1056 if (!CmpValDefInstr.getOperand(1).getReg().isVirtual())
1057 return false;
1058 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
1059 if (!VregDefInstr)
1060 return false;
1061 // We can only remove the TEST when the AND is an AND32ri or AND64ri32 whose
1062 // immediate fits in 16 bits; other 32/64-bit ops would test higher bits that
1063 // TEST16rr does not look at.
1064 if (!((VregDefInstr->getOpcode() == X86::AND32ri ||
1065 VregDefInstr->getOpcode() == X86::AND64ri32) &&
1066 isUInt<16>(VregDefInstr->getOperand(2).getImm())))
1067 return false;
1068 }
1069
1070 if (CmpInstr.getOpcode() == X86::TEST64rr) {
1071 // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is
1072 // typically 0.
1073 if (CmpValDefInstr.getOperand(1).getImm() != 0)
1074 return false;
1075
1076 // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
1077 // sub_32bit or sub_xmm.
1078 if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
1079 return false;
1080
1081 VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
1082 }
1083
1084 assert(VregDefInstr && "Must have a definition (SSA)");
1085
1086 // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
1087 // to simplify the subsequent analysis.
1088 //
1089 // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
1090 // `CmpValDefInstr.getParent()`, this could be handled.
1091 if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
1092 return false;
1093
1094 if (X86::isAND(VregDefInstr->getOpcode()) &&
1095 (!ST.hasNF() || VregDefInstr->modifiesRegister(X86::EFLAGS, TRI))) {
1096 // Get a sequence of instructions like
1097 // %reg = and* ... // Set EFLAGS
1098 // ... // EFLAGS not changed
1099 // %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
1100 // test64rr %extended_reg, %extended_reg, implicit-def $eflags
1101 // or
1102 // %reg = and32* ...
1103 // ... // EFLAGS not changed.
1104 // %src_reg = copy %reg.sub_16bit:gr32
1105 // test16rr %src_reg, %src_reg, implicit-def $eflags
1106 //
1107 // If subsequent readers use a subset of bits that don't change
1108 // after `and*` instructions, it's likely that the test64rr could
1109 // be optimized away.
1110 for (const MachineInstr &Instr :
1111 make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
1112 MachineBasicBlock::iterator(CmpValDefInstr))) {
1113 // There are instructions between 'VregDefInstr' and
1114 // 'CmpValDefInstr' that modifies EFLAGS.
1115 if (Instr.modifiesRegister(X86::EFLAGS, TRI))
1116 return false;
1117 }
1118
1119 *AndInstr = VregDefInstr;
1120
1121 // AND instruction will essentially update SF and clear OF, so
1122 // NoSignFlag should be false in the sense that SF is modified by `AND`.
1123 //
1124 // However, the implementation artificially sets `NoSignFlag` to true
1125 // to poison the SF bit; that is to say, if SF is looked at later, the
1126 // optimization (to erase TEST64rr) will be disabled.
1127 //
1128 // The reason to poison SF bit is that SF bit value could be different
1129 // in the `AND` and `TEST` operation; signed bit is not known for `AND`,
1130 // and is known to be 0 as a result of `TEST64rr`.
1131 //
1132 // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
1133 // the AND instruction and using the static information to guide peephole
1134 // optimization if possible. For example, it's possible to fold a
1135 // conditional move into a copy if the relevant EFLAG bits could be deduced
1136 // from an immediate operand of the AND operation.
1137 //
1138 NoSignFlag = true;
1139 // ClearsOverflowFlag is true for AND operation (no surprise).
1140 ClearsOverflowFlag = true;
1141 return true;
1142 }
1143 return false;
1144}
1145
1146bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
1147 unsigned Opc, bool AllowSP, Register &NewSrc,
1148 unsigned &NewSrcSubReg, bool &isKill,
1149 MachineOperand &ImplicitOp, LiveVariables *LV,
1150 LiveIntervals *LIS) const {
1151 MachineFunction &MF = *MI.getParent()->getParent();
1152 const TargetRegisterClass *RC;
1153 if (AllowSP) {
1154 RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1155 } else {
1156 RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1157 }
1158 Register SrcReg = Src.getReg();
1159 unsigned SubReg = Src.getSubReg();
1160 isKill = MI.killsRegister(SrcReg, /*TRI=*/nullptr);
1161
1162 NewSrcSubReg = X86::NoSubRegister;
1163
1164 // For both LEA64 and LEA32 the register already has essentially the right
1165 // type (32-bit or 64-bit); we may just need to forbid SP.
1166 if (Opc != X86::LEA64_32r) {
1167 NewSrc = SrcReg;
1168 NewSrcSubReg = SubReg;
1169 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1170
1171 if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1172 return false;
1173
1174 return true;
1175 }
1176
1177 // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1178 // another we need to add 64-bit registers to the final MI.
1179 if (SrcReg.isPhysical()) {
1180 ImplicitOp = Src;
1181 ImplicitOp.setImplicit();
1182
1183 NewSrc = getX86SubSuperRegister(SrcReg, 64);
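    // For example, an incoming ESI becomes RSI so it can legally appear in the
    // 64-bit address expression of LEA64_32r.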
1184 assert(!SubReg && "no superregister for source");
1185 assert(NewSrc.isValid() && "Invalid Operand");
1186 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1187 } else {
1188 // This is a virtual register of the wrong class; we have to create a
1189 // temporary 64-bit vreg to feed into the LEA.
1190 NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1191 NewSrcSubReg = X86::NoSubRegister;
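    // Copy the 32-bit source into the low half of the fresh 64-bit vreg. The
    // upper 32 bits stay undef; carries only propagate upward, so they cannot
    // affect the low 32 bits that LEA64_32r actually produces.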
1192 MachineInstr *Copy =
1193 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1194 .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1195 .addReg(SrcReg, getKillRegState(isKill), SubReg);
1196
1197 // Which is obviously going to be dead after we're done with it.
1198 isKill = true;
1199
1200 if (LV)
1201 LV->replaceKillInstruction(SrcReg, MI, *Copy);
1202
1203 if (LIS) {
1204 SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1205 SlotIndex Idx = LIS->getInstructionIndex(MI);
1206 LiveInterval &LI = LIS->getInterval(SrcReg);
1208 if (S->end.getBaseIndex() == Idx)
1209 S->end = CopyIdx.getRegSlot();
1210 }
1211 }
1212
1213 // We've set all the parameters without issue.
1214 return true;
1215}
1216
1217MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
1218 MachineInstr &MI,
1219 LiveVariables *LV,
1220 LiveIntervals *LIS,
1221 bool Is8BitOp) const {
1222 // We handle 8-bit adds and various 16-bit opcodes in the switch below.
1223 MachineBasicBlock &MBB = *MI.getParent();
1224 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
1225 assert((Is8BitOp ||
1226 RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
1227 *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
1228 "Unexpected type for LEA transform");
1229
1230 // TODO: For a 32-bit target, we need to adjust the LEA variables with
1231 // something like this:
1232 // Opcode = X86::LEA32r;
1233 // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1234 // OutRegLEA =
1235 // Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
1236 // : RegInfo.createVirtualRegister(&X86::GR32RegClass);
1237 if (!Subtarget.is64Bit())
1238 return nullptr;
1239
1240 unsigned Opcode = X86::LEA64_32r;
1241 Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1242 Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
1243 Register InRegLEA2;
1244
1245 // Build and insert into an implicit UNDEF value. This is OK because
1246 // we will be shifting and then extracting the lower 8/16-bits.
1247 // This has the potential to cause a partial register stall, e.g.
1248 // movw (%rbp,%rcx,2), %dx
1249 // leal -65(%rdx), %esi
1250 // But testing has shown this *does* help performance in 64-bit mode (at
1251 // least on modern x86 machines).
1252 MachineBasicBlock::iterator MBBI = MI.getIterator();
1253 Register Dest = MI.getOperand(0).getReg();
1254 Register Src = MI.getOperand(1).getReg();
1255 unsigned SrcSubReg = MI.getOperand(1).getSubReg();
1256 Register Src2;
1257 unsigned Src2SubReg;
1258 bool IsDead = MI.getOperand(0).isDead();
1259 bool IsKill = MI.getOperand(1).isKill();
1260 unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
1261 assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
1262 MachineInstr *ImpDef =
1263 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
1264 MachineInstr *InsMI =
1265 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1266 .addReg(InRegLEA, RegState::Define, SubReg)
1267 .addReg(Src, getKillRegState(IsKill), SrcSubReg);
1268 MachineInstr *ImpDef2 = nullptr;
1269 MachineInstr *InsMI2 = nullptr;
1270
1271 MachineInstrBuilder MIB =
1272 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
1273#define CASE_NF(OP) \
1274 case X86::OP: \
1275 case X86::OP##_NF:
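// CASE_NF covers both the flag-setting opcode and its APX "no flags" (_NF)
// twin; the LEA rewrite below treats them identically.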
1276 switch (MIOpc) {
1277 default:
1278 llvm_unreachable("Unreachable!");
1279 CASE_NF(SHL8ri)
1280 CASE_NF(SHL16ri) {
1281 unsigned ShAmt = MI.getOperand(2).getImm();
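    // LEA memory operands are base, scale, index, displacement, segment; the
    // shift becomes a scaled index: base 0, scale 1 << ShAmt, index InRegLEA.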
1282 MIB.addReg(0)
1283 .addImm(1LL << ShAmt)
1284 .addReg(InRegLEA, RegState::Kill)
1285 .addImm(0)
1286 .addReg(0);
1287 break;
1288 }
1289 CASE_NF(INC8r)
1290 CASE_NF(INC16r)
1291 addRegOffset(MIB, InRegLEA, true, 1);
1292 break;
1293 CASE_NF(DEC8r)
1294 CASE_NF(DEC16r)
1295 addRegOffset(MIB, InRegLEA, true, -1);
1296 break;
1297 CASE_NF(ADD8ri)
1298 CASE_NF(ADD16ri)
1299 case X86::ADD8ri_DB:
1300 case X86::ADD16ri_DB:
1301 addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
1302 break;
1303 CASE_NF(ADD8rr)
1304 CASE_NF(ADD16rr)
1305 case X86::ADD8rr_DB:
1306 case X86::ADD16rr_DB: {
1307 Src2 = MI.getOperand(2).getReg();
1308 Src2SubReg = MI.getOperand(2).getSubReg();
1309 bool IsKill2 = MI.getOperand(2).isKill();
1310 assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
1311 if (Src == Src2) {
1312 // ADD8rr/ADD16rr killed %reg1028, %reg1028
1313 // just a single insert_subreg.
1314 addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA, false,
1315 X86::NoSubRegister);
1316 } else {
1317 if (Subtarget.is64Bit())
1318 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
1319 else
1320 InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
1321 // Build and insert into an implicit UNDEF value. This is OK because
1322 // we will be shifting and then extracting the lower 8/16-bits.
1323 ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
1324 InRegLEA2);
1325 InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
1326 .addReg(InRegLEA2, RegState::Define, SubReg)
1327 .addReg(Src2, getKillRegState(IsKill2), Src2SubReg);
1328 addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA2, true,
1329 X86::NoSubRegister);
1330 }
1331 if (LV && IsKill2 && InsMI2)
1332 LV->replaceKillInstruction(Src2, MI, *InsMI2);
1333 break;
1334 }
1335 }
1336
1337 MachineInstr *NewMI = MIB;
1338 MachineInstr *ExtMI =
1339 BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1340 .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
1341 .addReg(OutRegLEA, RegState::Kill, SubReg);
1342
1343 if (LV) {
1344 // Update live variables.
1345 LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
1346 if (InRegLEA2)
1347 LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
1348 LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
1349 if (IsKill)
1350 LV->replaceKillInstruction(Src, MI, *InsMI);
1351 if (IsDead)
1352 LV->replaceKillInstruction(Dest, MI, *ExtMI);
1353 }
1354
1355 if (LIS) {
1356 LIS->InsertMachineInstrInMaps(*ImpDef);
1357 SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
1358 if (ImpDef2)
1359 LIS->InsertMachineInstrInMaps(*ImpDef2);
1360 SlotIndex Ins2Idx;
1361 if (InsMI2)
1362 Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
1363 SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
1364 SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
1365 LIS->getInterval(InRegLEA);
1366 LIS->getInterval(OutRegLEA);
1367 if (InRegLEA2)
1368 LIS->getInterval(InRegLEA2);
1369
1370 // Move the use of Src up to InsMI.
1371 LiveInterval &SrcLI = LIS->getInterval(Src);
1372 LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
1373 if (SrcSeg->end == NewIdx.getRegSlot())
1374 SrcSeg->end = InsIdx.getRegSlot();
1375
1376 if (InsMI2) {
1377 // Move the use of Src2 up to InsMI2.
1378 LiveInterval &Src2LI = LIS->getInterval(Src2);
1379 LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
1380 if (Src2Seg->end == NewIdx.getRegSlot())
1381 Src2Seg->end = Ins2Idx.getRegSlot();
1382 }
1383
1384 // Move the definition of Dest down to ExtMI.
1385 LiveInterval &DestLI = LIS->getInterval(Dest);
1386 LiveRange::Segment *DestSeg =
1387 DestLI.getSegmentContaining(NewIdx.getRegSlot());
1388 assert(DestSeg->start == NewIdx.getRegSlot() &&
1389 DestSeg->valno->def == NewIdx.getRegSlot());
1390 DestSeg->start = ExtIdx.getRegSlot();
1391 DestSeg->valno->def = ExtIdx.getRegSlot();
1392 }
1393
1394 return ExtMI;
1395}
1396
1397/// This method must be implemented by targets that
1398/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
1399/// may be able to convert a two-address instruction into a true
1400/// three-address instruction on demand. This allows the X86 target (for
1401/// example) to convert ADD and SHL instructions into LEA instructions if they
1402/// would require register copies due to two-addressness.
1403///
1404/// This method returns a null pointer if the transformation cannot be
1405/// performed, otherwise it returns the new instruction.
1406///
1407MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
1408 LiveVariables *LV,
1409 LiveIntervals *LIS) const {
1410 // The following opcodes also set the condition code register(s). Only
1411 // convert them to an equivalent LEA if the condition code register defs
1412 // are dead!
1413 if (hasLiveCondCodeDef(MI))
1414 return nullptr;
1415
1416 MachineFunction &MF = *MI.getParent()->getParent();
1417 // All input instructions are two-address instructions. Get the known operands.
1418 const MachineOperand &Dest = MI.getOperand(0);
1419 const MachineOperand &Src = MI.getOperand(1);
1420
1421 // Ideally, operations with undef should be folded before we get here, but we
1422 // can't guarantee it. Bail out because optimizing undefs is a waste of time.
1423 // Without this, we have to forward undef state to new register operands to
1424 // avoid machine verifier errors.
1425 if (Src.isUndef())
1426 return nullptr;
1427 if (MI.getNumOperands() > 2)
1428 if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
1429 return nullptr;
1430
1431 MachineInstr *NewMI = nullptr;
1432 Register SrcReg, SrcReg2;
1433 unsigned SrcSubReg, SrcSubReg2;
1434 bool Is64Bit = Subtarget.is64Bit();
1435
1436 bool Is8BitOp = false;
1437 unsigned NumRegOperands = 2;
1438 unsigned MIOpc = MI.getOpcode();
1439 switch (MIOpc) {
1440 default:
1441 llvm_unreachable("Unreachable!");
1442 CASE_NF(SHL64ri) {
1443 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1444 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1445 if (!isTruncatedShiftCountForLEA(ShAmt))
1446 return nullptr;
1447
1448 // LEA can't handle RSP.
1449 if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1450 Src.getReg(), &X86::GR64_NOSPRegClass))
1451 return nullptr;
1452
1453 NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1454 .add(Dest)
1455 .addReg(0)
1456 .addImm(1LL << ShAmt)
1457 .add(Src)
1458 .addImm(0)
1459 .addReg(0);
1460 break;
1461 }
1462 CASE_NF(SHL32ri) {
1463 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1464 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1465 if (!isTruncatedShiftCountForLEA(ShAmt))
1466 return nullptr;
1467
1468 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1469
1470 // LEA can't handle ESP.
1471 bool isKill;
1472 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1473 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1474 isKill, ImplicitOp, LV, LIS))
1475 return nullptr;
1476
1477 MachineInstrBuilder MIB =
1478 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1479 .add(Dest)
1480 .addReg(0)
1481 .addImm(1LL << ShAmt)
1482 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
1483 .addImm(0)
1484 .addReg(0);
1485 if (ImplicitOp.getReg() != 0)
1486 MIB.add(ImplicitOp);
1487 NewMI = MIB;
1488
1489 // Add kills if classifyLEAReg created a new register.
1490 if (LV && SrcReg != Src.getReg())
1491 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1492 break;
1493 }
1494 CASE_NF(SHL8ri)
1495 Is8BitOp = true;
1496 [[fallthrough]];
1497 CASE_NF(SHL16ri) {
1498 assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1499 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1500 if (!isTruncatedShiftCountForLEA(ShAmt))
1501 return nullptr;
1502 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1503 }
1504 CASE_NF(INC64r)
1505 CASE_NF(INC32r) {
1506 assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1507 unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
1508 ? X86::LEA64r
1509 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1510 bool isKill;
1511 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1512 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1513 isKill, ImplicitOp, LV, LIS))
1514 return nullptr;
1515
1516 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1517 .add(Dest)
1518 .addReg(SrcReg, getKillRegState(isKill));
1519 if (ImplicitOp.getReg() != 0)
1520 MIB.add(ImplicitOp);
1521
1522 NewMI = addOffset(MIB, 1);
1523
1524 // Add kills if classifyLEAReg created a new register.
1525 if (LV && SrcReg != Src.getReg())
1526 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1527 break;
1528 }
1529 CASE_NF(DEC64r)
1530 CASE_NF(DEC32r) {
1531 assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1532 unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
1533 ? X86::LEA64r
1534 : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1535
1536 bool isKill;
1537 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1538 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
1539 isKill, ImplicitOp, LV, LIS))
1540 return nullptr;
1541
1542 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1543 .add(Dest)
1544 .addReg(SrcReg, getKillRegState(isKill));
1545 if (ImplicitOp.getReg() != 0)
1546 MIB.add(ImplicitOp);
1547
1548 NewMI = addOffset(MIB, -1);
1549
1550 // Add kills if classifyLEAReg created a new register.
1551 if (LV && SrcReg != Src.getReg())
1552 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1553 break;
1554 }
1555 CASE_NF(DEC8r)
1556 CASE_NF(INC8r)
1557 Is8BitOp = true;
1558 [[fallthrough]];
1559 CASE_NF(DEC16r)
1560 CASE_NF(INC16r)
1561 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1562 CASE_NF(ADD64rr)
1563 CASE_NF(ADD32rr)
1564 case X86::ADD64rr_DB:
1565 case X86::ADD32rr_DB: {
1566 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1567 unsigned Opc;
1568 if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_NF ||
1569 MIOpc == X86::ADD64rr_DB)
1570 Opc = X86::LEA64r;
1571 else
1572 Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1573
1574 const MachineOperand &Src2 = MI.getOperand(2);
1575 bool isKill2;
1576 MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1577 if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, SrcSubReg2,
1578 isKill2, ImplicitOp2, LV, LIS))
1579 return nullptr;
1580
1581 bool isKill;
1582 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1583 if (Src.getReg() == Src2.getReg()) {
1584 // Don't call classifyLEAReg a second time on the same register, in case
1585 // the first call inserted a COPY from Src2 and marked it as killed.
1586 isKill = isKill2;
1587 SrcReg = SrcReg2;
1588 SrcSubReg = SrcSubReg2;
1589 } else {
1590 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1591 isKill, ImplicitOp, LV, LIS))
1592 return nullptr;
1593 }
1594
1595 MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1596 if (ImplicitOp.getReg() != 0)
1597 MIB.add(ImplicitOp);
1598 if (ImplicitOp2.getReg() != 0)
1599 MIB.add(ImplicitOp2);
1600
1601 NewMI =
1602 addRegReg(MIB, SrcReg, isKill, SrcSubReg, SrcReg2, isKill2, SrcSubReg2);
1603
1604 // Add kills if classifyLEAReg created a new register.
1605 if (LV) {
1606 if (SrcReg2 != Src2.getReg())
1607 LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
1608 if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
1609 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1610 }
1611 NumRegOperands = 3;
1612 break;
1613 }
1614 CASE_NF(ADD8rr)
1615 case X86::ADD8rr_DB:
1616 Is8BitOp = true;
1617 [[fallthrough]];
1618 CASE_NF(ADD16rr)
1619 case X86::ADD16rr_DB:
1620 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1621 CASE_NF(ADD64ri32)
1622 case X86::ADD64ri32_DB:
1623 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1624 NewMI = addOffset(
1625 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1626 MI.getOperand(2));
1627 break;
1628 CASE_NF(ADD32ri)
1629 case X86::ADD32ri_DB: {
1630 assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1631 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1632
1633 bool isKill;
1634 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1635 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1636 isKill, ImplicitOp, LV, LIS))
1637 return nullptr;
1638
1639 MachineInstrBuilder MIB =
1640 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1641 .add(Dest)
1642 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
1643 if (ImplicitOp.getReg() != 0)
1644 MIB.add(ImplicitOp);
1645
1646 NewMI = addOffset(MIB, MI.getOperand(2));
1647
1648 // Add kills if classifyLEAReg created a new register.
1649 if (LV && SrcReg != Src.getReg())
1650 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1651 break;
1652 }
1653 CASE_NF(ADD8ri)
1654 case X86::ADD8ri_DB:
1655 Is8BitOp = true;
1656 [[fallthrough]];
1657 CASE_NF(ADD16ri)
1658 case X86::ADD16ri_DB:
1659 return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1660 CASE_NF(SUB8ri)
1661 CASE_NF(SUB16ri)
1662 /// FIXME: Support these similarly to ADD8ri/ADD16ri*.
1663 return nullptr;
1664 CASE_NF(SUB32ri) {
1665 if (!MI.getOperand(2).isImm())
1666 return nullptr;
1667 int64_t Imm = MI.getOperand(2).getImm();
1668 if (!isInt<32>(-Imm))
1669 return nullptr;
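    // The subtract is rewritten as an LEA with displacement -Imm, so the
    // negated immediate must still fit the signed 32-bit displacement field.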
1670
1671 assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1672 unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1673
1674 bool isKill;
1675 MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1676 if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
1677 isKill, ImplicitOp, LV, LIS))
1678 return nullptr;
1679
1680 MachineInstrBuilder MIB =
1681 BuildMI(MF, MI.getDebugLoc(), get(Opc))
1682 .add(Dest)
1683 .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
1684 if (ImplicitOp.getReg() != 0)
1685 MIB.add(ImplicitOp);
1686
1687 NewMI = addOffset(MIB, -Imm);
1688
1689 // Add kills if classifyLEAReg created a new register.
1690 if (LV && SrcReg != Src.getReg())
1691 LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1692 break;
1693 }
1694
1695 CASE_NF(SUB64ri32) {
1696 if (!MI.getOperand(2).isImm())
1697 return nullptr;
1698 int64_t Imm = MI.getOperand(2).getImm();
1699 if (!isInt<32>(-Imm))
1700 return nullptr;
1701
1702 assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1703
1704 MachineInstrBuilder MIB =
1705 BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
1706 NewMI = addOffset(MIB, -Imm);
1707 break;
1708 }
1709
1710 case X86::VMOVDQU8Z128rmk:
1711 case X86::VMOVDQU8Z256rmk:
1712 case X86::VMOVDQU8Zrmk:
1713 case X86::VMOVDQU16Z128rmk:
1714 case X86::VMOVDQU16Z256rmk:
1715 case X86::VMOVDQU16Zrmk:
1716 case X86::VMOVDQU32Z128rmk:
1717 case X86::VMOVDQA32Z128rmk:
1718 case X86::VMOVDQU32Z256rmk:
1719 case X86::VMOVDQA32Z256rmk:
1720 case X86::VMOVDQU32Zrmk:
1721 case X86::VMOVDQA32Zrmk:
1722 case X86::VMOVDQU64Z128rmk:
1723 case X86::VMOVDQA64Z128rmk:
1724 case X86::VMOVDQU64Z256rmk:
1725 case X86::VMOVDQA64Z256rmk:
1726 case X86::VMOVDQU64Zrmk:
1727 case X86::VMOVDQA64Zrmk:
1728 case X86::VMOVUPDZ128rmk:
1729 case X86::VMOVAPDZ128rmk:
1730 case X86::VMOVUPDZ256rmk:
1731 case X86::VMOVAPDZ256rmk:
1732 case X86::VMOVUPDZrmk:
1733 case X86::VMOVAPDZrmk:
1734 case X86::VMOVUPSZ128rmk:
1735 case X86::VMOVAPSZ128rmk:
1736 case X86::VMOVUPSZ256rmk:
1737 case X86::VMOVAPSZ256rmk:
1738 case X86::VMOVUPSZrmk:
1739 case X86::VMOVAPSZrmk:
1740 case X86::VBROADCASTSDZ256rmk:
1741 case X86::VBROADCASTSDZrmk:
1742 case X86::VBROADCASTSSZ128rmk:
1743 case X86::VBROADCASTSSZ256rmk:
1744 case X86::VBROADCASTSSZrmk:
1745 case X86::VPBROADCASTDZ128rmk:
1746 case X86::VPBROADCASTDZ256rmk:
1747 case X86::VPBROADCASTDZrmk:
1748 case X86::VPBROADCASTQZ128rmk:
1749 case X86::VPBROADCASTQZ256rmk:
1750 case X86::VPBROADCASTQZrmk: {
1751 unsigned Opc;
1752 switch (MIOpc) {
1753 default:
1754 llvm_unreachable("Unreachable!");
1755 case X86::VMOVDQU8Z128rmk:
1756 Opc = X86::VPBLENDMBZ128rmk;
1757 break;
1758 case X86::VMOVDQU8Z256rmk:
1759 Opc = X86::VPBLENDMBZ256rmk;
1760 break;
1761 case X86::VMOVDQU8Zrmk:
1762 Opc = X86::VPBLENDMBZrmk;
1763 break;
1764 case X86::VMOVDQU16Z128rmk:
1765 Opc = X86::VPBLENDMWZ128rmk;
1766 break;
1767 case X86::VMOVDQU16Z256rmk:
1768 Opc = X86::VPBLENDMWZ256rmk;
1769 break;
1770 case X86::VMOVDQU16Zrmk:
1771 Opc = X86::VPBLENDMWZrmk;
1772 break;
1773 case X86::VMOVDQU32Z128rmk:
1774 Opc = X86::VPBLENDMDZ128rmk;
1775 break;
1776 case X86::VMOVDQU32Z256rmk:
1777 Opc = X86::VPBLENDMDZ256rmk;
1778 break;
1779 case X86::VMOVDQU32Zrmk:
1780 Opc = X86::VPBLENDMDZrmk;
1781 break;
1782 case X86::VMOVDQU64Z128rmk:
1783 Opc = X86::VPBLENDMQZ128rmk;
1784 break;
1785 case X86::VMOVDQU64Z256rmk:
1786 Opc = X86::VPBLENDMQZ256rmk;
1787 break;
1788 case X86::VMOVDQU64Zrmk:
1789 Opc = X86::VPBLENDMQZrmk;
1790 break;
1791 case X86::VMOVUPDZ128rmk:
1792 Opc = X86::VBLENDMPDZ128rmk;
1793 break;
1794 case X86::VMOVUPDZ256rmk:
1795 Opc = X86::VBLENDMPDZ256rmk;
1796 break;
1797 case X86::VMOVUPDZrmk:
1798 Opc = X86::VBLENDMPDZrmk;
1799 break;
1800 case X86::VMOVUPSZ128rmk:
1801 Opc = X86::VBLENDMPSZ128rmk;
1802 break;
1803 case X86::VMOVUPSZ256rmk:
1804 Opc = X86::VBLENDMPSZ256rmk;
1805 break;
1806 case X86::VMOVUPSZrmk:
1807 Opc = X86::VBLENDMPSZrmk;
1808 break;
1809 case X86::VMOVDQA32Z128rmk:
1810 Opc = X86::VPBLENDMDZ128rmk;
1811 break;
1812 case X86::VMOVDQA32Z256rmk:
1813 Opc = X86::VPBLENDMDZ256rmk;
1814 break;
1815 case X86::VMOVDQA32Zrmk:
1816 Opc = X86::VPBLENDMDZrmk;
1817 break;
1818 case X86::VMOVDQA64Z128rmk:
1819 Opc = X86::VPBLENDMQZ128rmk;
1820 break;
1821 case X86::VMOVDQA64Z256rmk:
1822 Opc = X86::VPBLENDMQZ256rmk;
1823 break;
1824 case X86::VMOVDQA64Zrmk:
1825 Opc = X86::VPBLENDMQZrmk;
1826 break;
1827 case X86::VMOVAPDZ128rmk:
1828 Opc = X86::VBLENDMPDZ128rmk;
1829 break;
1830 case X86::VMOVAPDZ256rmk:
1831 Opc = X86::VBLENDMPDZ256rmk;
1832 break;
1833 case X86::VMOVAPDZrmk:
1834 Opc = X86::VBLENDMPDZrmk;
1835 break;
1836 case X86::VMOVAPSZ128rmk:
1837 Opc = X86::VBLENDMPSZ128rmk;
1838 break;
1839 case X86::VMOVAPSZ256rmk:
1840 Opc = X86::VBLENDMPSZ256rmk;
1841 break;
1842 case X86::VMOVAPSZrmk:
1843 Opc = X86::VBLENDMPSZrmk;
1844 break;
1845 case X86::VBROADCASTSDZ256rmk:
1846 Opc = X86::VBLENDMPDZ256rmbk;
1847 break;
1848 case X86::VBROADCASTSDZrmk:
1849 Opc = X86::VBLENDMPDZrmbk;
1850 break;
1851 case X86::VBROADCASTSSZ128rmk:
1852 Opc = X86::VBLENDMPSZ128rmbk;
1853 break;
1854 case X86::VBROADCASTSSZ256rmk:
1855 Opc = X86::VBLENDMPSZ256rmbk;
1856 break;
1857 case X86::VBROADCASTSSZrmk:
1858 Opc = X86::VBLENDMPSZrmbk;
1859 break;
1860 case X86::VPBROADCASTDZ128rmk:
1861 Opc = X86::VPBLENDMDZ128rmbk;
1862 break;
1863 case X86::VPBROADCASTDZ256rmk:
1864 Opc = X86::VPBLENDMDZ256rmbk;
1865 break;
1866 case X86::VPBROADCASTDZrmk:
1867 Opc = X86::VPBLENDMDZrmbk;
1868 break;
1869 case X86::VPBROADCASTQZ128rmk:
1870 Opc = X86::VPBLENDMQZ128rmbk;
1871 break;
1872 case X86::VPBROADCASTQZ256rmk:
1873 Opc = X86::VPBLENDMQZ256rmbk;
1874 break;
1875 case X86::VPBROADCASTQZrmk:
1876 Opc = X86::VPBLENDMQZrmbk;
1877 break;
1878 }
1879
1880 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1881 .add(Dest)
1882 .add(MI.getOperand(2))
1883 .add(Src)
1884 .add(MI.getOperand(3))
1885 .add(MI.getOperand(4))
1886 .add(MI.getOperand(5))
1887 .add(MI.getOperand(6))
1888 .add(MI.getOperand(7));
1889 NumRegOperands = 4;
1890 break;
1891 }
1892
1893 case X86::VMOVDQU8Z128rrk:
1894 case X86::VMOVDQU8Z256rrk:
1895 case X86::VMOVDQU8Zrrk:
1896 case X86::VMOVDQU16Z128rrk:
1897 case X86::VMOVDQU16Z256rrk:
1898 case X86::VMOVDQU16Zrrk:
1899 case X86::VMOVDQU32Z128rrk:
1900 case X86::VMOVDQA32Z128rrk:
1901 case X86::VMOVDQU32Z256rrk:
1902 case X86::VMOVDQA32Z256rrk:
1903 case X86::VMOVDQU32Zrrk:
1904 case X86::VMOVDQA32Zrrk:
1905 case X86::VMOVDQU64Z128rrk:
1906 case X86::VMOVDQA64Z128rrk:
1907 case X86::VMOVDQU64Z256rrk:
1908 case X86::VMOVDQA64Z256rrk:
1909 case X86::VMOVDQU64Zrrk:
1910 case X86::VMOVDQA64Zrrk:
1911 case X86::VMOVUPDZ128rrk:
1912 case X86::VMOVAPDZ128rrk:
1913 case X86::VMOVUPDZ256rrk:
1914 case X86::VMOVAPDZ256rrk:
1915 case X86::VMOVUPDZrrk:
1916 case X86::VMOVAPDZrrk:
1917 case X86::VMOVUPSZ128rrk:
1918 case X86::VMOVAPSZ128rrk:
1919 case X86::VMOVUPSZ256rrk:
1920 case X86::VMOVAPSZ256rrk:
1921 case X86::VMOVUPSZrrk:
1922 case X86::VMOVAPSZrrk: {
1923 unsigned Opc;
1924 switch (MIOpc) {
1925 default:
1926 llvm_unreachable("Unreachable!");
1927 case X86::VMOVDQU8Z128rrk:
1928 Opc = X86::VPBLENDMBZ128rrk;
1929 break;
1930 case X86::VMOVDQU8Z256rrk:
1931 Opc = X86::VPBLENDMBZ256rrk;
1932 break;
1933 case X86::VMOVDQU8Zrrk:
1934 Opc = X86::VPBLENDMBZrrk;
1935 break;
1936 case X86::VMOVDQU16Z128rrk:
1937 Opc = X86::VPBLENDMWZ128rrk;
1938 break;
1939 case X86::VMOVDQU16Z256rrk:
1940 Opc = X86::VPBLENDMWZ256rrk;
1941 break;
1942 case X86::VMOVDQU16Zrrk:
1943 Opc = X86::VPBLENDMWZrrk;
1944 break;
1945 case X86::VMOVDQU32Z128rrk:
1946 Opc = X86::VPBLENDMDZ128rrk;
1947 break;
1948 case X86::VMOVDQU32Z256rrk:
1949 Opc = X86::VPBLENDMDZ256rrk;
1950 break;
1951 case X86::VMOVDQU32Zrrk:
1952 Opc = X86::VPBLENDMDZrrk;
1953 break;
1954 case X86::VMOVDQU64Z128rrk:
1955 Opc = X86::VPBLENDMQZ128rrk;
1956 break;
1957 case X86::VMOVDQU64Z256rrk:
1958 Opc = X86::VPBLENDMQZ256rrk;
1959 break;
1960 case X86::VMOVDQU64Zrrk:
1961 Opc = X86::VPBLENDMQZrrk;
1962 break;
1963 case X86::VMOVUPDZ128rrk:
1964 Opc = X86::VBLENDMPDZ128rrk;
1965 break;
1966 case X86::VMOVUPDZ256rrk:
1967 Opc = X86::VBLENDMPDZ256rrk;
1968 break;
1969 case X86::VMOVUPDZrrk:
1970 Opc = X86::VBLENDMPDZrrk;
1971 break;
1972 case X86::VMOVUPSZ128rrk:
1973 Opc = X86::VBLENDMPSZ128rrk;
1974 break;
1975 case X86::VMOVUPSZ256rrk:
1976 Opc = X86::VBLENDMPSZ256rrk;
1977 break;
1978 case X86::VMOVUPSZrrk:
1979 Opc = X86::VBLENDMPSZrrk;
1980 break;
1981 case X86::VMOVDQA32Z128rrk:
1982 Opc = X86::VPBLENDMDZ128rrk;
1983 break;
1984 case X86::VMOVDQA32Z256rrk:
1985 Opc = X86::VPBLENDMDZ256rrk;
1986 break;
1987 case X86::VMOVDQA32Zrrk:
1988 Opc = X86::VPBLENDMDZrrk;
1989 break;
1990 case X86::VMOVDQA64Z128rrk:
1991 Opc = X86::VPBLENDMQZ128rrk;
1992 break;
1993 case X86::VMOVDQA64Z256rrk:
1994 Opc = X86::VPBLENDMQZ256rrk;
1995 break;
1996 case X86::VMOVDQA64Zrrk:
1997 Opc = X86::VPBLENDMQZrrk;
1998 break;
1999 case X86::VMOVAPDZ128rrk:
2000 Opc = X86::VBLENDMPDZ128rrk;
2001 break;
2002 case X86::VMOVAPDZ256rrk:
2003 Opc = X86::VBLENDMPDZ256rrk;
2004 break;
2005 case X86::VMOVAPDZrrk:
2006 Opc = X86::VBLENDMPDZrrk;
2007 break;
2008 case X86::VMOVAPSZ128rrk:
2009 Opc = X86::VBLENDMPSZ128rrk;
2010 break;
2011 case X86::VMOVAPSZ256rrk:
2012 Opc = X86::VBLENDMPSZ256rrk;
2013 break;
2014 case X86::VMOVAPSZrrk:
2015 Opc = X86::VBLENDMPSZrrk;
2016 break;
2017 }
2018
2019 NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
2020 .add(Dest)
2021 .add(MI.getOperand(2))
2022 .add(Src)
2023 .add(MI.getOperand(3));
2024 NumRegOperands = 4;
2025 break;
2026 }
2027 }
2028#undef CASE_NF
2029
2030 if (!NewMI)
2031 return nullptr;
2032
2033 if (LV) { // Update live variables
2034 for (unsigned I = 0; I < NumRegOperands; ++I) {
2035 MachineOperand &Op = MI.getOperand(I);
2036 if (Op.isReg() && (Op.isDead() || Op.isKill()))
2037 LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
2038 }
2039 }
2040
2041 MachineBasicBlock &MBB = *MI.getParent();
2042 MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
2043
2044 if (LIS) {
2045 LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
2046 if (SrcReg)
2047 LIS->getInterval(SrcReg);
2048 if (SrcReg2)
2049 LIS->getInterval(SrcReg2);
2050 }
2051
2052 return NewMI;
2053}
2054
2055/// This determines which of the three possible cases of a three-source commute
2056/// the source indexes correspond to, taking into account any mask operands.
2057/// None of the cases allows commuting with the passthru operand; any other
2058/// combination of source indexes is unreachable.
2059/// Case 0 - Possible to commute the first and second operands.
2060/// Case 1 - Possible to commute the first and third operands.
2061/// Case 2 - Possible to commute the second and third operands.
2062static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
2063 unsigned SrcOpIdx2) {
2064 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
2065 if (SrcOpIdx1 > SrcOpIdx2)
2066 std::swap(SrcOpIdx1, SrcOpIdx2);
2067
2068 unsigned Op1 = 1, Op2 = 2, Op3 = 3;
2069 if (X86II::isKMasked(TSFlags)) {
2070 Op2++;
2071 Op3++;
2072 }
2073
2074 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
2075 return 0;
2076 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
2077 return 1;
2078 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
2079 return 2;
2080 llvm_unreachable("Unknown three src commute case.");
2081}
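// Illustrative example (added for exposition): for a merge-masked FMA such as
// VFMADD213PSZrk the operand order is (dst, src1, mask, src2, src3), so the
// k-masked adjustment above maps Op2/Op3 to operand indices 3 and 4; commuting
// operands 3 and 4 is therefore Case 2.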
2082
2083unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
2084 const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
2085 const X86InstrFMA3Group &FMA3Group) const {
2086
2087 unsigned Opc = MI.getOpcode();
2088
2089 // TODO: Commuting the 1st operand of FMA*_Int requires some additional
2090 // analysis. The commute optimization is legal only if all users of FMA*_Int
2091 // use only the lowest element of the FMA*_Int instruction. Such analysis is
2092 // not implemented yet. So, just return 0 in that case.
2093 // When such analysis becomes available, this will be the right place to
2094 // call it.
2095 assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
2096 "Intrinsic instructions can't commute operand 1");
2097
2098 // Determine which case this commute is or if it can't be done.
2099 unsigned Case =
2100 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2101 assert(Case < 3 && "Unexpected case number!");
2102
2103 // Define the FMA forms mapping array that helps to map input FMA form
2104 // to output FMA form to preserve the operation semantics after
2105 // commuting the operands.
2106 const unsigned Form132Index = 0;
2107 const unsigned Form213Index = 1;
2108 const unsigned Form231Index = 2;
2109 static const unsigned FormMapping[][3] = {
2110 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
2111 // FMA132 A, C, b; ==> FMA231 C, A, b;
2112 // FMA213 B, A, c; ==> FMA213 A, B, c;
2113 // FMA231 C, A, b; ==> FMA132 A, C, b;
2114 {Form231Index, Form213Index, Form132Index},
2115 // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
2116 // FMA132 A, c, B; ==> FMA132 B, c, A;
2117 // FMA213 B, a, C; ==> FMA231 C, a, B;
2118 // FMA231 C, a, B; ==> FMA213 B, a, C;
2119 {Form132Index, Form231Index, Form213Index},
2120 // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
2121 // FMA132 a, C, B; ==> FMA213 a, B, C;
2122 // FMA213 b, A, C; ==> FMA132 b, C, A;
2123 // FMA231 c, A, B; ==> FMA231 c, B, A;
2124 {Form213Index, Form132Index, Form231Index}};
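 // Illustrative example (added for exposition): commuting operands 2 and 3
 // (Case 2) of VFMADD132PSr %a, %c, %b, which computes a*b + c, selects the
 // 213 form; VFMADD213PSr %a, %b, %c computes b*a + c, the same value.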
2125
2126 unsigned FMAForms[3];
2127 FMAForms[0] = FMA3Group.get132Opcode();
2128 FMAForms[1] = FMA3Group.get213Opcode();
2129 FMAForms[2] = FMA3Group.get231Opcode();
2130
2131 // Everything is ready, just adjust the FMA opcode and return it.
2132 for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
2133 if (Opc == FMAForms[FormIndex])
2134 return FMAForms[FormMapping[Case][FormIndex]];
2135
2136 llvm_unreachable("Illegal FMA3 format");
2137}
2138
2139static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
2140 unsigned SrcOpIdx2) {
2141 // Determine which case this commute is or if it can't be done.
2142 unsigned Case =
2143 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2144 assert(Case < 3 && "Unexpected case value!");
2145
2146 // For each case we need to swap two pairs of bits in the final immediate.
2147 static const uint8_t SwapMasks[3][4] = {
2148 {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
2149 {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
2150 {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
2151 };
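 // Illustrative example (added for exposition): imm 0xCA encodes A ? B : C.
 // For Case 0 (commuting operands 1 and 2) the masks swap bits 2<->4 and
 // 3<->5, producing 0xE2, which encodes B ? A : C.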
2152
2153 uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2154 // Clear out the bits we are swapping.
2155 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
2156 SwapMasks[Case][2] | SwapMasks[Case][3]);
2157 // If the immediate had a bit of the pair set, then set the opposite bit.
2158 if (Imm & SwapMasks[Case][0])
2159 NewImm |= SwapMasks[Case][1];
2160 if (Imm & SwapMasks[Case][1])
2161 NewImm |= SwapMasks[Case][0];
2162 if (Imm & SwapMasks[Case][2])
2163 NewImm |= SwapMasks[Case][3];
2164 if (Imm & SwapMasks[Case][3])
2165 NewImm |= SwapMasks[Case][2];
2166 MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
2167}
2168
2169// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
2170// commuted.
2171static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
2172#define VPERM_CASES(Suffix) \
2173 case X86::VPERMI2##Suffix##Z128rr: \
2174 case X86::VPERMT2##Suffix##Z128rr: \
2175 case X86::VPERMI2##Suffix##Z256rr: \
2176 case X86::VPERMT2##Suffix##Z256rr: \
2177 case X86::VPERMI2##Suffix##Zrr: \
2178 case X86::VPERMT2##Suffix##Zrr: \
2179 case X86::VPERMI2##Suffix##Z128rm: \
2180 case X86::VPERMT2##Suffix##Z128rm: \
2181 case X86::VPERMI2##Suffix##Z256rm: \
2182 case X86::VPERMT2##Suffix##Z256rm: \
2183 case X86::VPERMI2##Suffix##Zrm: \
2184 case X86::VPERMT2##Suffix##Zrm: \
2185 case X86::VPERMI2##Suffix##Z128rrkz: \
2186 case X86::VPERMT2##Suffix##Z128rrkz: \
2187 case X86::VPERMI2##Suffix##Z256rrkz: \
2188 case X86::VPERMT2##Suffix##Z256rrkz: \
2189 case X86::VPERMI2##Suffix##Zrrkz: \
2190 case X86::VPERMT2##Suffix##Zrrkz: \
2191 case X86::VPERMI2##Suffix##Z128rmkz: \
2192 case X86::VPERMT2##Suffix##Z128rmkz: \
2193 case X86::VPERMI2##Suffix##Z256rmkz: \
2194 case X86::VPERMT2##Suffix##Z256rmkz: \
2195 case X86::VPERMI2##Suffix##Zrmkz: \
2196 case X86::VPERMT2##Suffix##Zrmkz:
2197
2198#define VPERM_CASES_BROADCAST(Suffix) \
2199 VPERM_CASES(Suffix) \
2200 case X86::VPERMI2##Suffix##Z128rmb: \
2201 case X86::VPERMT2##Suffix##Z128rmb: \
2202 case X86::VPERMI2##Suffix##Z256rmb: \
2203 case X86::VPERMT2##Suffix##Z256rmb: \
2204 case X86::VPERMI2##Suffix##Zrmb: \
2205 case X86::VPERMT2##Suffix##Zrmb: \
2206 case X86::VPERMI2##Suffix##Z128rmbkz: \
2207 case X86::VPERMT2##Suffix##Z128rmbkz: \
2208 case X86::VPERMI2##Suffix##Z256rmbkz: \
2209 case X86::VPERMT2##Suffix##Z256rmbkz: \
2210 case X86::VPERMI2##Suffix##Zrmbkz: \
2211 case X86::VPERMT2##Suffix##Zrmbkz:
2212
2213 switch (Opcode) {
2214 default:
2215 return false;
2216 VPERM_CASES(B)
2217 VPERM_CASES_BROADCAST(D)
2218 VPERM_CASES_BROADCAST(PD)
2219 VPERM_CASES_BROADCAST(PS)
2220 VPERM_CASES_BROADCAST(Q)
2221 VPERM_CASES(W)
2222 return true;
2223 }
2224#undef VPERM_CASES_BROADCAST
2225#undef VPERM_CASES
2226}
2227
2228// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
2229// from the I opcode to the T opcode and vice versa.
2230static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
2231#define VPERM_CASES(Orig, New) \
2232 case X86::Orig##Z128rr: \
2233 return X86::New##Z128rr; \
2234 case X86::Orig##Z128rrkz: \
2235 return X86::New##Z128rrkz; \
2236 case X86::Orig##Z128rm: \
2237 return X86::New##Z128rm; \
2238 case X86::Orig##Z128rmkz: \
2239 return X86::New##Z128rmkz; \
2240 case X86::Orig##Z256rr: \
2241 return X86::New##Z256rr; \
2242 case X86::Orig##Z256rrkz: \
2243 return X86::New##Z256rrkz; \
2244 case X86::Orig##Z256rm: \
2245 return X86::New##Z256rm; \
2246 case X86::Orig##Z256rmkz: \
2247 return X86::New##Z256rmkz; \
2248 case X86::Orig##Zrr: \
2249 return X86::New##Zrr; \
2250 case X86::Orig##Zrrkz: \
2251 return X86::New##Zrrkz; \
2252 case X86::Orig##Zrm: \
2253 return X86::New##Zrm; \
2254 case X86::Orig##Zrmkz: \
2255 return X86::New##Zrmkz;
2256
2257#define VPERM_CASES_BROADCAST(Orig, New) \
2258 VPERM_CASES(Orig, New) \
2259 case X86::Orig##Z128rmb: \
2260 return X86::New##Z128rmb; \
2261 case X86::Orig##Z128rmbkz: \
2262 return X86::New##Z128rmbkz; \
2263 case X86::Orig##Z256rmb: \
2264 return X86::New##Z256rmb; \
2265 case X86::Orig##Z256rmbkz: \
2266 return X86::New##Z256rmbkz; \
2267 case X86::Orig##Zrmb: \
2268 return X86::New##Zrmb; \
2269 case X86::Orig##Zrmbkz: \
2270 return X86::New##Zrmbkz;
2271
2272 switch (Opcode) {
2273 VPERM_CASES(VPERMI2B, VPERMT2B)
2274 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2275 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2276 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2277 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2278 VPERM_CASES(VPERMI2W, VPERMT2W)
2279 VPERM_CASES(VPERMT2B, VPERMI2B)
2280 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2281 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2282 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2283 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2284 VPERM_CASES(VPERMT2W, VPERMI2W)
2285 }
2286
2287 llvm_unreachable("Unreachable!");
2288#undef VPERM_CASES_BROADCAST
2289#undef VPERM_CASES
2290}
2291
2292MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2293 unsigned OpIdx1,
2294 unsigned OpIdx2) const {
2295 auto CloneIfNew = [&](MachineInstr &MI) {
2296 return std::exchange(NewMI, false)
2297 ? MI.getParent()->getParent()->CloneMachineInstr(&MI)
2298 : &MI;
2299 };
2300 MachineInstr *WorkingMI = nullptr;
2301 unsigned Opc = MI.getOpcode();
2302
2303#define CASE_ND(OP) \
2304 case X86::OP: \
2305 case X86::OP##_ND:
2306
2307 switch (Opc) {
2308 // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
2309 CASE_ND(SHRD16rri8)
2310 CASE_ND(SHLD16rri8)
2311 CASE_ND(SHRD32rri8)
2312 CASE_ND(SHLD32rri8)
2313 CASE_ND(SHRD64rri8)
2314 CASE_ND(SHLD64rri8) {
2315 unsigned Size;
2316 switch (Opc) {
2317 default:
2318 llvm_unreachable("Unreachable!");
2319#define FROM_TO_SIZE(A, B, S) \
2320 case X86::A: \
2321 Opc = X86::B; \
2322 Size = S; \
2323 break; \
2324 case X86::A##_ND: \
2325 Opc = X86::B##_ND; \
2326 Size = S; \
2327 break; \
2328 case X86::B: \
2329 Opc = X86::A; \
2330 Size = S; \
2331 break; \
2332 case X86::B##_ND: \
2333 Opc = X86::A##_ND; \
2334 Size = S; \
2335 break;
2336
2337 FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
2338 FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
2339 FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
2340#undef FROM_TO_SIZE
2341 }
2342 WorkingMI = CloneIfNew(MI);
2343 WorkingMI->setDesc(get(Opc));
2344 WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
2345 break;
2346 }
2347 case X86::PFSUBrr:
2348 case X86::PFSUBRrr:
2349 // PFSUB x, y: x = x - y
2350 // PFSUBR x, y: x = y - x
2351 WorkingMI = CloneIfNew(MI);
2352 WorkingMI->setDesc(
2353 get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr));
2354 break;
2355 case X86::BLENDPDrri:
2356 case X86::BLENDPSrri:
2357 case X86::PBLENDWrri:
2358 case X86::VBLENDPDrri:
2359 case X86::VBLENDPSrri:
2360 case X86::VBLENDPDYrri:
2361 case X86::VBLENDPSYrri:
2362 case X86::VPBLENDDrri:
2363 case X86::VPBLENDWrri:
2364 case X86::VPBLENDDYrri:
2365 case X86::VPBLENDWYrri: {
2366 int8_t Mask;
2367 switch (Opc) {
2368 default:
2369 llvm_unreachable("Unreachable!");
2370 case X86::BLENDPDrri:
2371 Mask = (int8_t)0x03;
2372 break;
2373 case X86::BLENDPSrri:
2374 Mask = (int8_t)0x0F;
2375 break;
2376 case X86::PBLENDWrri:
2377 Mask = (int8_t)0xFF;
2378 break;
2379 case X86::VBLENDPDrri:
2380 Mask = (int8_t)0x03;
2381 break;
2382 case X86::VBLENDPSrri:
2383 Mask = (int8_t)0x0F;
2384 break;
2385 case X86::VBLENDPDYrri:
2386 Mask = (int8_t)0x0F;
2387 break;
2388 case X86::VBLENDPSYrri:
2389 Mask = (int8_t)0xFF;
2390 break;
2391 case X86::VPBLENDDrri:
2392 Mask = (int8_t)0x0F;
2393 break;
2394 case X86::VPBLENDWrri:
2395 Mask = (int8_t)0xFF;
2396 break;
2397 case X86::VPBLENDDYrri:
2398 Mask = (int8_t)0xFF;
2399 break;
2400 case X86::VPBLENDWYrri:
2401 Mask = (int8_t)0xFF;
2402 break;
2403 }
2404 // Only the least significant bits of Imm are used.
2405 // Using int8_t to ensure it will be sign extended to the int64_t that
2406 // setImm takes in order to match isel behavior.
2407 int8_t Imm = MI.getOperand(3).getImm() & Mask;
2408 WorkingMI = CloneIfNew(MI);
2409 WorkingMI->getOperand(3).setImm(Mask ^ Imm);
2410 break;
2411 }
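 // Illustrative example (added for exposition): for BLENDPSrri the mask is
 // 0x0F, so an immediate of 0x05 (take elements 0 and 2 from src2) becomes
 // 0x0A after commuting, i.e. take elements 1 and 3 from the new src2.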
2412 case X86::INSERTPSrri:
2413 case X86::VINSERTPSrri:
2414 case X86::VINSERTPSZrri: {
2415 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2416 unsigned ZMask = Imm & 15;
2417 unsigned DstIdx = (Imm >> 4) & 3;
2418 unsigned SrcIdx = (Imm >> 6) & 3;
2419
2420 // We can commute insertps if we zero 2 of the elements, the insertion is
2421 // "inline" and we don't override the insertion with a zero.
2422 if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2423 llvm::popcount(ZMask) == 2) {
2424 unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
2425 assert(AltIdx < 4 && "Illegal insertion index");
2426 unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2427 WorkingMI = CloneIfNew(MI);
2428 WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2429 break;
2430 }
2431 return nullptr;
2432 }
2433 case X86::MOVSDrr:
2434 case X86::MOVSSrr:
2435 case X86::VMOVSDrr:
2436 case X86::VMOVSSrr: {
2437 // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2438 if (Subtarget.hasSSE41()) {
2439 unsigned Mask;
2440 switch (Opc) {
2441 default:
2442 llvm_unreachable("Unreachable!");
2443 case X86::MOVSDrr:
2444 Opc = X86::BLENDPDrri;
2445 Mask = 0x02;
2446 break;
2447 case X86::MOVSSrr:
2448 Opc = X86::BLENDPSrri;
2449 Mask = 0x0E;
2450 break;
2451 case X86::VMOVSDrr:
2452 Opc = X86::VBLENDPDrri;
2453 Mask = 0x02;
2454 break;
2455 case X86::VMOVSSrr:
2456 Opc = X86::VBLENDPSrri;
2457 Mask = 0x0E;
2458 break;
2459 }
2460
2461 WorkingMI = CloneIfNew(MI);
2462 WorkingMI->setDesc(get(Opc));
2463 WorkingMI->addOperand(MachineOperand::CreateImm(Mask));
2464 break;
2465 }
2466
2467 assert(Opc == X86::MOVSDrr && "Only MOVSD can commute to SHUFPD");
2468 WorkingMI = CloneIfNew(MI);
2469 WorkingMI->setDesc(get(X86::SHUFPDrri));
2470 WorkingMI->addOperand(MachineOperand::CreateImm(0x02));
2471 break;
2472 }
2473 case X86::SHUFPDrri: {
2474 // Commute to MOVSD.
2475 assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2476 WorkingMI = CloneIfNew(MI);
2477 WorkingMI->setDesc(get(X86::MOVSDrr));
2478 WorkingMI->removeOperand(3);
2479 break;
2480 }
2481 case X86::PCLMULQDQrri:
2482 case X86::VPCLMULQDQrri:
2483 case X86::VPCLMULQDQYrri:
2484 case X86::VPCLMULQDQZrri:
2485 case X86::VPCLMULQDQZ128rri:
2486 case X86::VPCLMULQDQZ256rri: {
2487 // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2488 // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2489 unsigned Imm = MI.getOperand(3).getImm();
2490 unsigned Src1Hi = Imm & 0x01;
2491 unsigned Src2Hi = Imm & 0x10;
2492 WorkingMI = CloneIfNew(MI);
2493 WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2494 break;
2495 }
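 // Illustrative example (added for exposition): imm 0x01 selects SRC1[127:64]
 // and SRC2[63:0]; with the sources commuted, the equivalent immediate is
 // 0x10, which selects the high half of the new SRC2 (the old SRC1) instead.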
2496 case X86::VPCMPBZ128rri:
2497 case X86::VPCMPUBZ128rri:
2498 case X86::VPCMPBZ256rri:
2499 case X86::VPCMPUBZ256rri:
2500 case X86::VPCMPBZrri:
2501 case X86::VPCMPUBZrri:
2502 case X86::VPCMPDZ128rri:
2503 case X86::VPCMPUDZ128rri:
2504 case X86::VPCMPDZ256rri:
2505 case X86::VPCMPUDZ256rri:
2506 case X86::VPCMPDZrri:
2507 case X86::VPCMPUDZrri:
2508 case X86::VPCMPQZ128rri:
2509 case X86::VPCMPUQZ128rri:
2510 case X86::VPCMPQZ256rri:
2511 case X86::VPCMPUQZ256rri:
2512 case X86::VPCMPQZrri:
2513 case X86::VPCMPUQZrri:
2514 case X86::VPCMPWZ128rri:
2515 case X86::VPCMPUWZ128rri:
2516 case X86::VPCMPWZ256rri:
2517 case X86::VPCMPUWZ256rri:
2518 case X86::VPCMPWZrri:
2519 case X86::VPCMPUWZrri:
2520 case X86::VPCMPBZ128rrik:
2521 case X86::VPCMPUBZ128rrik:
2522 case X86::VPCMPBZ256rrik:
2523 case X86::VPCMPUBZ256rrik:
2524 case X86::VPCMPBZrrik:
2525 case X86::VPCMPUBZrrik:
2526 case X86::VPCMPDZ128rrik:
2527 case X86::VPCMPUDZ128rrik:
2528 case X86::VPCMPDZ256rrik:
2529 case X86::VPCMPUDZ256rrik:
2530 case X86::VPCMPDZrrik:
2531 case X86::VPCMPUDZrrik:
2532 case X86::VPCMPQZ128rrik:
2533 case X86::VPCMPUQZ128rrik:
2534 case X86::VPCMPQZ256rrik:
2535 case X86::VPCMPUQZ256rrik:
2536 case X86::VPCMPQZrrik:
2537 case X86::VPCMPUQZrrik:
2538 case X86::VPCMPWZ128rrik:
2539 case X86::VPCMPUWZ128rrik:
2540 case X86::VPCMPWZ256rrik:
2541 case X86::VPCMPUWZ256rrik:
2542 case X86::VPCMPWZrrik:
2543 case X86::VPCMPUWZrrik:
2544 WorkingMI = CloneIfNew(MI);
2545 // Flip comparison mode immediate (if necessary).
2546 WorkingMI->getOperand(MI.getNumOperands() - 1)
2547 .setImm(X86::getSwappedVPCMPImm(
2548 MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7));
2549 break;
2550 case X86::VPCOMBri:
2551 case X86::VPCOMUBri:
2552 case X86::VPCOMDri:
2553 case X86::VPCOMUDri:
2554 case X86::VPCOMQri:
2555 case X86::VPCOMUQri:
2556 case X86::VPCOMWri:
2557 case X86::VPCOMUWri:
2558 WorkingMI = CloneIfNew(MI);
2559 // Flip comparison mode immediate (if necessary).
2560 WorkingMI->getOperand(3).setImm(
2561 X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7));
2562 break;
2563 case X86::VCMPSDZrri:
2564 case X86::VCMPSSZrri:
2565 case X86::VCMPPDZrri:
2566 case X86::VCMPPSZrri:
2567 case X86::VCMPSHZrri:
2568 case X86::VCMPPHZrri:
2569 case X86::VCMPPHZ128rri:
2570 case X86::VCMPPHZ256rri:
2571 case X86::VCMPPDZ128rri:
2572 case X86::VCMPPSZ128rri:
2573 case X86::VCMPPDZ256rri:
2574 case X86::VCMPPSZ256rri:
2575 case X86::VCMPPDZrrik:
2576 case X86::VCMPPSZrrik:
2577 case X86::VCMPPHZrrik:
2578 case X86::VCMPPDZ128rrik:
2579 case X86::VCMPPSZ128rrik:
2580 case X86::VCMPPHZ128rrik:
2581 case X86::VCMPPDZ256rrik:
2582 case X86::VCMPPSZ256rrik:
2583 case X86::VCMPPHZ256rrik:
2584 WorkingMI = CloneIfNew(MI);
2585 WorkingMI->getOperand(MI.getNumExplicitOperands() - 1)
2586 .setImm(X86::getSwappedVCMPImm(
2587 MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f));
2588 break;
2589 case X86::VPERM2F128rri:
2590 case X86::VPERM2I128rri:
2591 // Flip permute source immediate.
2592 // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2593 // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2594 WorkingMI = CloneIfNew(MI);
2595 WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22);
2596 break;
2597 case X86::MOVHLPSrr:
2598 case X86::UNPCKHPDrr:
2599 case X86::VMOVHLPSrr:
2600 case X86::VUNPCKHPDrr:
2601 case X86::VMOVHLPSZrr:
2602 case X86::VUNPCKHPDZ128rr:
2603 assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2604
2605 switch (Opc) {
2606 default:
2607 llvm_unreachable("Unreachable!");
2608 case X86::MOVHLPSrr:
2609 Opc = X86::UNPCKHPDrr;
2610 break;
2611 case X86::UNPCKHPDrr:
2612 Opc = X86::MOVHLPSrr;
2613 break;
2614 case X86::VMOVHLPSrr:
2615 Opc = X86::VUNPCKHPDrr;
2616 break;
2617 case X86::VUNPCKHPDrr:
2618 Opc = X86::VMOVHLPSrr;
2619 break;
2620 case X86::VMOVHLPSZrr:
2621 Opc = X86::VUNPCKHPDZ128rr;
2622 break;
2623 case X86::VUNPCKHPDZ128rr:
2624 Opc = X86::VMOVHLPSZrr;
2625 break;
2626 }
2627 WorkingMI = CloneIfNew(MI);
2628 WorkingMI->setDesc(get(Opc));
2629 break;
2630 CASE_ND(CMOV16rr)
2631 CASE_ND(CMOV32rr)
2632 CASE_ND(CMOV64rr) {
2633 WorkingMI = CloneIfNew(MI);
2634 unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2635 X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2636 WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2637 break;
2638 }
2639 case X86::VPTERNLOGDZrri:
2640 case X86::VPTERNLOGDZrmi:
2641 case X86::VPTERNLOGDZ128rri:
2642 case X86::VPTERNLOGDZ128rmi:
2643 case X86::VPTERNLOGDZ256rri:
2644 case X86::VPTERNLOGDZ256rmi:
2645 case X86::VPTERNLOGQZrri:
2646 case X86::VPTERNLOGQZrmi:
2647 case X86::VPTERNLOGQZ128rri:
2648 case X86::VPTERNLOGQZ128rmi:
2649 case X86::VPTERNLOGQZ256rri:
2650 case X86::VPTERNLOGQZ256rmi:
2651 case X86::VPTERNLOGDZrrik:
2652 case X86::VPTERNLOGDZ128rrik:
2653 case X86::VPTERNLOGDZ256rrik:
2654 case X86::VPTERNLOGQZrrik:
2655 case X86::VPTERNLOGQZ128rrik:
2656 case X86::VPTERNLOGQZ256rrik:
2657 case X86::VPTERNLOGDZrrikz:
2658 case X86::VPTERNLOGDZrmikz:
2659 case X86::VPTERNLOGDZ128rrikz:
2660 case X86::VPTERNLOGDZ128rmikz:
2661 case X86::VPTERNLOGDZ256rrikz:
2662 case X86::VPTERNLOGDZ256rmikz:
2663 case X86::VPTERNLOGQZrrikz:
2664 case X86::VPTERNLOGQZrmikz:
2665 case X86::VPTERNLOGQZ128rrikz:
2666 case X86::VPTERNLOGQZ128rmikz:
2667 case X86::VPTERNLOGQZ256rrikz:
2668 case X86::VPTERNLOGQZ256rmikz:
2669 case X86::VPTERNLOGDZ128rmbi:
2670 case X86::VPTERNLOGDZ256rmbi:
2671 case X86::VPTERNLOGDZrmbi:
2672 case X86::VPTERNLOGQZ128rmbi:
2673 case X86::VPTERNLOGQZ256rmbi:
2674 case X86::VPTERNLOGQZrmbi:
2675 case X86::VPTERNLOGDZ128rmbikz:
2676 case X86::VPTERNLOGDZ256rmbikz:
2677 case X86::VPTERNLOGDZrmbikz:
2678 case X86::VPTERNLOGQZ128rmbikz:
2679 case X86::VPTERNLOGQZ256rmbikz:
2680 case X86::VPTERNLOGQZrmbikz: {
2681 WorkingMI = CloneIfNew(MI);
2682 commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2);
2683 break;
2684 }
2685 default:
2686 if (isCommutableVPERMV3Instruction(Opc)) {
2687 WorkingMI = CloneIfNew(MI);
2688 WorkingMI->setDesc(get(getCommutedVPERMV3Opcode(Opc)));
2689 break;
2690 }
2691
2692 if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) {
2693 WorkingMI = CloneIfNew(MI);
2694 WorkingMI->setDesc(
2695 get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group)));
2696 break;
2697 }
2698 }
2699 return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2700}
2701
2702bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2703 unsigned &SrcOpIdx1,
2704 unsigned &SrcOpIdx2,
2705 bool IsIntrinsic) const {
2706 uint64_t TSFlags = MI.getDesc().TSFlags;
2707
2708 unsigned FirstCommutableVecOp = 1;
2709 unsigned LastCommutableVecOp = 3;
2710 unsigned KMaskOp = -1U;
2711 if (X86II::isKMasked(TSFlags)) {
2712 // For k-zero-masked operations it is OK to commute the first vector
2713 // operand, unless this is an intrinsic instruction.
2714 // For regular k-masked operations a conservative choice is done as the
2715 // elements of the first vector operand, for which the corresponding bit
2716 // in the k-mask operand is set to 0, are copied to the result of the
2717 // instruction.
2718 // TODO/FIXME: The commute still may be legal if it is known that the
2719 // k-mask operand is set to either all ones or all zeroes.
2720 // It is also Ok to commute the 1st operand if all users of MI use only
2721 // the elements enabled by the k-mask operand. For example,
2722 // v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2723 // : v1[i];
2724 // VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2725 // // Ok, to commute v1 in FMADD213PSZrk.
2726
2727 // The k-mask operand has index = 2 for masked and zero-masked operations.
2728 KMaskOp = 2;
2729
2730 // The operand with index = 1 is used as a source for those elements for
2731 // which the corresponding bit in the k-mask is set to 0.
2732 if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2733 FirstCommutableVecOp = 3;
2734
2735 LastCommutableVecOp++;
2736 } else if (IsIntrinsic) {
2737 // Commuting the first operand of an intrinsic instruction isn't possible
2738 // unless we can prove that only the lowest element of the result is used.
2739 FirstCommutableVecOp = 2;
2740 }
2741
2742 if (isMem(MI, LastCommutableVecOp))
2743 LastCommutableVecOp--;
2744
2745 // Only the first RegOpsNum operands are commutable.
2746 // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
2747 // that the operand is not specified/fixed.
2748 if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2749 (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2750 SrcOpIdx1 == KMaskOp))
2751 return false;
2752 if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2753 (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2754 SrcOpIdx2 == KMaskOp))
2755 return false;
2756
2757 // Look for two different register operands assumed to be commutable
2758 // regardless of the FMA opcode. The FMA opcode is adjusted later.
2759 if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2760 SrcOpIdx2 == CommuteAnyOperandIndex) {
2761 unsigned CommutableOpIdx2 = SrcOpIdx2;
2762
2763 // At least one of the operands to be commuted is not specified and
2764 // this method is free to choose appropriate commutable operands.
2765 if (SrcOpIdx1 == SrcOpIdx2)
2766 // Neither operand is fixed. By default set one of the commutable
2767 // operands to the last register operand of the instruction.
2768 CommutableOpIdx2 = LastCommutableVecOp;
2769 else if (SrcOpIdx2 == CommuteAnyOperandIndex)
2770 // Only one of the operands is not fixed.
2771 CommutableOpIdx2 = SrcOpIdx1;
2772
2773 // CommutableOpIdx2 is well defined now. Let's choose another commutable
2774 // operand and assign its index to CommutableOpIdx1.
2775 Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2776
2777 unsigned CommutableOpIdx1;
2778 for (CommutableOpIdx1 = LastCommutableVecOp;
2779 CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2780 // Just ignore and skip the k-mask operand.
2781 if (CommutableOpIdx1 == KMaskOp)
2782 continue;
2783
2784 // The commuted operands must have different registers.
2785 // Otherwise, the commute transformation does not change anything and
2786 // is useless.
2787 if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2788 break;
2789 }
2790
2791 // No appropriate commutable operands were found.
2792 if (CommutableOpIdx1 < FirstCommutableVecOp)
2793 return false;
2794
2795 // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
2796 // to return those values.
2797 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
2798 CommutableOpIdx2))
2799 return false;
2800 }
2801
2802 return true;
2803}
2804
2805bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2806 unsigned &SrcOpIdx1,
2807 unsigned &SrcOpIdx2) const {
2808 const MCInstrDesc &Desc = MI.getDesc();
2809 if (!Desc.isCommutable())
2810 return false;
2811
2812 switch (MI.getOpcode()) {
2813 case X86::CMPSDrri:
2814 case X86::CMPSSrri:
2815 case X86::CMPPDrri:
2816 case X86::CMPPSrri:
2817 case X86::VCMPSDrri:
2818 case X86::VCMPSSrri:
2819 case X86::VCMPPDrri:
2820 case X86::VCMPPSrri:
2821 case X86::VCMPPDYrri:
2822 case X86::VCMPPSYrri:
2823 case X86::VCMPSDZrri:
2824 case X86::VCMPSSZrri:
2825 case X86::VCMPPDZrri:
2826 case X86::VCMPPSZrri:
2827 case X86::VCMPSHZrri:
2828 case X86::VCMPPHZrri:
2829 case X86::VCMPPHZ128rri:
2830 case X86::VCMPPHZ256rri:
2831 case X86::VCMPPDZ128rri:
2832 case X86::VCMPPSZ128rri:
2833 case X86::VCMPPDZ256rri:
2834 case X86::VCMPPSZ256rri:
2835 case X86::VCMPPDZrrik:
2836 case X86::VCMPPSZrrik:
2837 case X86::VCMPPHZrrik:
2838 case X86::VCMPPDZ128rrik:
2839 case X86::VCMPPSZ128rrik:
2840 case X86::VCMPPHZ128rrik:
2841 case X86::VCMPPDZ256rrik:
2842 case X86::VCMPPSZ256rrik:
2843 case X86::VCMPPHZ256rrik: {
2844 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2845
2846 // Float comparison can be safely commuted for
2847 // Ordered/Unordered/Equal/NotEqual tests
2848 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2849 switch (Imm) {
2850 default:
2851 // EVEX versions can be commuted.
2852 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2853 break;
2854 return false;
2855 case 0x00: // EQUAL
2856 case 0x03: // UNORDERED
2857 case 0x04: // NOT EQUAL
2858 case 0x07: // ORDERED
2859 break;
2860 }
2861
2862 // The indices of the commutable operands are 1 and 2 (or 2 and 3
2863 // when masked).
2864 // Assign them to the returned operand indices here.
2865 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2866 2 + OpOffset);
2867 }
2868 case X86::MOVSSrr:
2869 // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2870 // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2871 // AVX implies sse4.1.
2872 if (Subtarget.hasSSE41())
2873 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2874 return false;
2875 case X86::SHUFPDrri:
2876 // We can commute this to MOVSD.
2877 if (MI.getOperand(3).getImm() == 0x02)
2878 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2879 return false;
2880 case X86::MOVHLPSrr:
2881 case X86::UNPCKHPDrr:
2882 case X86::VMOVHLPSrr:
2883 case X86::VUNPCKHPDrr:
2884 case X86::VMOVHLPSZrr:
2885 case X86::VUNPCKHPDZ128rr:
2886 if (Subtarget.hasSSE2())
2887 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2888 return false;
2889 case X86::VPTERNLOGDZrri:
2890 case X86::VPTERNLOGDZrmi:
2891 case X86::VPTERNLOGDZ128rri:
2892 case X86::VPTERNLOGDZ128rmi:
2893 case X86::VPTERNLOGDZ256rri:
2894 case X86::VPTERNLOGDZ256rmi:
2895 case X86::VPTERNLOGQZrri:
2896 case X86::VPTERNLOGQZrmi:
2897 case X86::VPTERNLOGQZ128rri:
2898 case X86::VPTERNLOGQZ128rmi:
2899 case X86::VPTERNLOGQZ256rri:
2900 case X86::VPTERNLOGQZ256rmi:
2901 case X86::VPTERNLOGDZrrik:
2902 case X86::VPTERNLOGDZ128rrik:
2903 case X86::VPTERNLOGDZ256rrik:
2904 case X86::VPTERNLOGQZrrik:
2905 case X86::VPTERNLOGQZ128rrik:
2906 case X86::VPTERNLOGQZ256rrik:
2907 case X86::VPTERNLOGDZrrikz:
2908 case X86::VPTERNLOGDZrmikz:
2909 case X86::VPTERNLOGDZ128rrikz:
2910 case X86::VPTERNLOGDZ128rmikz:
2911 case X86::VPTERNLOGDZ256rrikz:
2912 case X86::VPTERNLOGDZ256rmikz:
2913 case X86::VPTERNLOGQZrrikz:
2914 case X86::VPTERNLOGQZrmikz:
2915 case X86::VPTERNLOGQZ128rrikz:
2916 case X86::VPTERNLOGQZ128rmikz:
2917 case X86::VPTERNLOGQZ256rrikz:
2918 case X86::VPTERNLOGQZ256rmikz:
2919 case X86::VPTERNLOGDZ128rmbi:
2920 case X86::VPTERNLOGDZ256rmbi:
2921 case X86::VPTERNLOGDZrmbi:
2922 case X86::VPTERNLOGQZ128rmbi:
2923 case X86::VPTERNLOGQZ256rmbi:
2924 case X86::VPTERNLOGQZrmbi:
2925 case X86::VPTERNLOGDZ128rmbikz:
2926 case X86::VPTERNLOGDZ256rmbikz:
2927 case X86::VPTERNLOGDZrmbikz:
2928 case X86::VPTERNLOGQZ128rmbikz:
2929 case X86::VPTERNLOGQZ256rmbikz:
2930 case X86::VPTERNLOGQZrmbikz:
2931 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2932 case X86::VPDPWSSDYrr:
2933 case X86::VPDPWSSDrr:
2934 case X86::VPDPWSSDSYrr:
2935 case X86::VPDPWSSDSrr:
2936 case X86::VPDPWUUDrr:
2937 case X86::VPDPWUUDYrr:
2938 case X86::VPDPWUUDSrr:
2939 case X86::VPDPWUUDSYrr:
2940 case X86::VPDPBSSDSrr:
2941 case X86::VPDPBSSDSYrr:
2942 case X86::VPDPBSSDrr:
2943 case X86::VPDPBSSDYrr:
2944 case X86::VPDPBUUDSrr:
2945 case X86::VPDPBUUDSYrr:
2946 case X86::VPDPBUUDrr:
2947 case X86::VPDPBUUDYrr:
2948 case X86::VPDPBSSDSZ128rr:
2949 case X86::VPDPBSSDSZ128rrk:
2950 case X86::VPDPBSSDSZ128rrkz:
2951 case X86::VPDPBSSDSZ256rr:
2952 case X86::VPDPBSSDSZ256rrk:
2953 case X86::VPDPBSSDSZ256rrkz:
2954 case X86::VPDPBSSDSZrr:
2955 case X86::VPDPBSSDSZrrk:
2956 case X86::VPDPBSSDSZrrkz:
2957 case X86::VPDPBSSDZ128rr:
2958 case X86::VPDPBSSDZ128rrk:
2959 case X86::VPDPBSSDZ128rrkz:
2960 case X86::VPDPBSSDZ256rr:
2961 case X86::VPDPBSSDZ256rrk:
2962 case X86::VPDPBSSDZ256rrkz:
2963 case X86::VPDPBSSDZrr:
2964 case X86::VPDPBSSDZrrk:
2965 case X86::VPDPBSSDZrrkz:
2966 case X86::VPDPBUUDSZ128rr:
2967 case X86::VPDPBUUDSZ128rrk:
2968 case X86::VPDPBUUDSZ128rrkz:
2969 case X86::VPDPBUUDSZ256rr:
2970 case X86::VPDPBUUDSZ256rrk:
2971 case X86::VPDPBUUDSZ256rrkz:
2972 case X86::VPDPBUUDSZrr:
2973 case X86::VPDPBUUDSZrrk:
2974 case X86::VPDPBUUDSZrrkz:
2975 case X86::VPDPBUUDZ128rr:
2976 case X86::VPDPBUUDZ128rrk:
2977 case X86::VPDPBUUDZ128rrkz:
2978 case X86::VPDPBUUDZ256rr:
2979 case X86::VPDPBUUDZ256rrk:
2980 case X86::VPDPBUUDZ256rrkz:
2981 case X86::VPDPBUUDZrr:
2982 case X86::VPDPBUUDZrrk:
2983 case X86::VPDPBUUDZrrkz:
2984 case X86::VPDPWSSDZ128rr:
2985 case X86::VPDPWSSDZ128rrk:
2986 case X86::VPDPWSSDZ128rrkz:
2987 case X86::VPDPWSSDZ256rr:
2988 case X86::VPDPWSSDZ256rrk:
2989 case X86::VPDPWSSDZ256rrkz:
2990 case X86::VPDPWSSDZrr:
2991 case X86::VPDPWSSDZrrk:
2992 case X86::VPDPWSSDZrrkz:
2993 case X86::VPDPWSSDSZ128rr:
2994 case X86::VPDPWSSDSZ128rrk:
2995 case X86::VPDPWSSDSZ128rrkz:
2996 case X86::VPDPWSSDSZ256rr:
2997 case X86::VPDPWSSDSZ256rrk:
2998 case X86::VPDPWSSDSZ256rrkz:
2999 case X86::VPDPWSSDSZrr:
3000 case X86::VPDPWSSDSZrrk:
3001 case X86::VPDPWSSDSZrrkz:
3002 case X86::VPDPWUUDZ128rr:
3003 case X86::VPDPWUUDZ128rrk:
3004 case X86::VPDPWUUDZ128rrkz:
3005 case X86::VPDPWUUDZ256rr:
3006 case X86::VPDPWUUDZ256rrk:
3007 case X86::VPDPWUUDZ256rrkz:
3008 case X86::VPDPWUUDZrr:
3009 case X86::VPDPWUUDZrrk:
3010 case X86::VPDPWUUDZrrkz:
3011 case X86::VPDPWUUDSZ128rr:
3012 case X86::VPDPWUUDSZ128rrk:
3013 case X86::VPDPWUUDSZ128rrkz:
3014 case X86::VPDPWUUDSZ256rr:
3015 case X86::VPDPWUUDSZ256rrk:
3016 case X86::VPDPWUUDSZ256rrkz:
3017 case X86::VPDPWUUDSZrr:
3018 case X86::VPDPWUUDSZrrk:
3019 case X86::VPDPWUUDSZrrkz:
3020 case X86::VPMADD52HUQrr:
3021 case X86::VPMADD52HUQYrr:
3022 case X86::VPMADD52HUQZ128r:
3023 case X86::VPMADD52HUQZ128rk:
3024 case X86::VPMADD52HUQZ128rkz:
3025 case X86::VPMADD52HUQZ256r:
3026 case X86::VPMADD52HUQZ256rk:
3027 case X86::VPMADD52HUQZ256rkz:
3028 case X86::VPMADD52HUQZr:
3029 case X86::VPMADD52HUQZrk:
3030 case X86::VPMADD52HUQZrkz:
3031 case X86::VPMADD52LUQrr:
3032 case X86::VPMADD52LUQYrr:
3033 case X86::VPMADD52LUQZ128r:
3034 case X86::VPMADD52LUQZ128rk:
3035 case X86::VPMADD52LUQZ128rkz:
3036 case X86::VPMADD52LUQZ256r:
3037 case X86::VPMADD52LUQZ256rk:
3038 case X86::VPMADD52LUQZ256rkz:
3039 case X86::VPMADD52LUQZr:
3040 case X86::VPMADD52LUQZrk:
3041 case X86::VPMADD52LUQZrkz:
3042 case X86::VFMADDCPHZr:
3043 case X86::VFMADDCPHZrk:
3044 case X86::VFMADDCPHZrkz:
3045 case X86::VFMADDCPHZ128r:
3046 case X86::VFMADDCPHZ128rk:
3047 case X86::VFMADDCPHZ128rkz:
3048 case X86::VFMADDCPHZ256r:
3049 case X86::VFMADDCPHZ256rk:
3050 case X86::VFMADDCPHZ256rkz:
3051 case X86::VFMADDCSHZr:
3052 case X86::VFMADDCSHZrk:
3053 case X86::VFMADDCSHZrkz: {
3054 unsigned CommutableOpIdx1 = 2;
3055 unsigned CommutableOpIdx2 = 3;
3056 if (X86II::isKMasked(Desc.TSFlags)) {
3057 // Skip the mask register.
3058 ++CommutableOpIdx1;
3059 ++CommutableOpIdx2;
3060 }
3061 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3062 CommutableOpIdx2))
3063 return false;
3064 if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
3065 // No idea.
3066 return false;
3067 return true;
3068 }
3069
3070 default:
3071 const X86InstrFMA3Group *FMA3Group =
3072 getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
3073 if (FMA3Group)
3074 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
3075 FMA3Group->isIntrinsic());
3076
3077 // Handle masked instructions since we need to skip over the mask input
3078 // and the preserved input.
3079 if (X86II::isKMasked(Desc.TSFlags)) {
3080 // First assume that the first input is the mask operand and skip past it.
3081 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
3082 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
3083 // Check if the first input is tied. If there isn't one then we only
3084 // need to skip the mask operand which we did above.
3085 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
3086 MCOI::TIED_TO) != -1)) {
3087 // If this is zero masking instruction with a tied operand, we need to
3088 // move the first index back to the first input since this must
3089 // be a 3 input instruction and we want the first two non-mask inputs.
3090 // Otherwise this is a 2 input instruction with a preserved input and
3091 // mask, so we need to move the indices to skip one more input.
3092 if (X86II::isKMergeMasked(Desc.TSFlags)) {
3093 ++CommutableOpIdx1;
3094 ++CommutableOpIdx2;
3095 } else {
3096 --CommutableOpIdx1;
3097 }
3098 }
3099
3100 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3101 CommutableOpIdx2))
3102 return false;
3103
3104 if (!MI.getOperand(SrcOpIdx1).isReg() ||
3105 !MI.getOperand(SrcOpIdx2).isReg())
3106 // No idea.
3107 return false;
3108 return true;
3109 }
3110
3111 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
3112 }
3113 return false;
3114}
3115
3116static bool isConvertibleLEA(MachineInstr *MI) {
3117 unsigned Opcode = MI->getOpcode();
3118 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
3119 Opcode != X86::LEA64_32r)
3120 return false;
3121
3122 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
3123 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
3124 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
3125
3126 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
3127 Scale.getImm() > 1)
3128 return false;
3129
3130 return true;
3131}
3132
3133bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
3134 // Currently we're only interested in the following sequence:
3135 // r3 = lea r1, r2
3136 // r5 = add r3, r4
3137 // Both r3 and r4 are killed in the add; we prefer the add to have the
3138 // operand order
3139 // r5 = add r4, r3
3140 // so that later in X86FixupLEAs the lea instruction can be rewritten as add.
3141 unsigned Opcode = MI.getOpcode();
3142 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
3143 return false;
3144
3145 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3146 Register Reg1 = MI.getOperand(1).getReg();
3147 Register Reg2 = MI.getOperand(2).getReg();
3148
3149 // Check if Reg1 comes from LEA in the same MBB.
3150 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
3151 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3152 Commute = true;
3153 return true;
3154 }
3155 }
3156
3157 // Check if Reg2 comes from LEA in the same MBB.
3158 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
3159 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3160 Commute = false;
3161 return true;
3162 }
3163 }
3164
3165 return false;
3166}
3167
3168int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
3169 unsigned Opcode = MCID.getOpcode();
3170 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isSETZUCC(Opcode) ||
3171 X86::isCMOVCC(Opcode) || X86::isCFCMOVCC(Opcode) ||
3172 X86::isCCMPCC(Opcode) || X86::isCTESTCC(Opcode)))
3173 return -1;
3174 // Assume that condition code is always the last use operand.
3175 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
3176 return NumUses - 1;
3177}
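// Illustrative example (added for exposition): for CMOV32rr (one def and three
// uses: src1, src2, cond) this returns 2; getCondFromMI() below then adds the
// def count, reaching the condition-code operand at index 3.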
3178
3179X86::CondCode X86::getCondFromMI(const MachineInstr &MI) {
3180 const MCInstrDesc &MCID = MI.getDesc();
3181 int CondNo = getCondSrcNoFromDesc(MCID);
3182 if (CondNo < 0)
3183 return X86::COND_INVALID;
3184 CondNo += MCID.getNumDefs();
3185 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
3186}
3187
3188X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
3189 return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3190 : X86::COND_INVALID;
3191}
3192
3193X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
3194 return X86::isSETCC(MI.getOpcode()) || X86::isSETZUCC(MI.getOpcode())
3195 ? X86::getCondFromMI(MI)
3196 : X86::COND_INVALID;
3197}
3198
3199X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
3200 return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3201 : X86::COND_INVALID;
3202}
3203
3204X86::CondCode X86::getCondFromCFCMov(const MachineInstr &MI) {
3205 return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3206 : X86::COND_INVALID;
3207}
3208
3209X86::CondCode X86::getCondFromCCMP(const MachineInstr &MI) {
3210 return X86::isCCMPCC(MI.getOpcode()) || X86::isCTESTCC(MI.getOpcode())
3211 ? X86::getCondFromMI(MI)
3212 : X86::COND_INVALID;
3213}
3214
3215int X86::getCCMPCondFlagsFromCondCode(X86::CondCode CC) {
3216 // CCMP/CTEST has two conditional operands:
3217 // - SCC: source conditional code (same as CMOV)
3218 // - DCF: destination conditional flags, which have 4 valid bits
3219 //
3220 // +----+----+----+----+
3221 // | OF | SF | ZF | CF |
3222 // +----+----+----+----+
3223 //
3224 // If SCC (the source conditional code) evaluates to false, CCMP/CTEST
3225 // updates the conditional flags as follows:
3226 //
3227 // OF = DCF.OF
3228 // SF = DCF.SF
3229 // ZF = DCF.ZF
3230 // CF = DCF.CF
3231 // PF = DCF.CF
3232 // AF = 0 (Auxiliary Carry Flag)
3233 //
3234 // Otherwise, the CMP or TEST is executed and it updates the
3235 // CSPAZO flags normally.
3236 //
3237 // NOTE:
3238 // If SCC = P, then SCC evaluates to true regardless of the CSPAZO value.
3239 // If SCC = NP, then SCC evaluates to false regardless of the CSPAZO value.
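 // Illustrative example (added for exposition): for SCC = COND_E this helper
 // returns ZF, i.e. a DCF pattern under which a later consumer testing COND_E
 // would see the skipped compare as "equal".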
3240
3241 enum { CF = 1, ZF = 2, SF = 4, OF = 8, PF = CF };
3242
3243 switch (CC) {
3244 default:
3245 llvm_unreachable("Illegal condition code!");
3246 case X86::COND_NO:
3247 case X86::COND_NE:
3248 case X86::COND_GE:
3249 case X86::COND_G:
3250 case X86::COND_AE:
3251 case X86::COND_A:
3252 case X86::COND_NS:
3253 case X86::COND_NP:
3254 return 0;
3255 case X86::COND_O:
3256 return OF;
3257 case X86::COND_B:
3258 case X86::COND_BE:
3259 return CF;
3260 break;
3261 case X86::COND_E:
3262 case X86::COND_LE:
3263 return ZF;
3264 case X86::COND_S:
3265 case X86::COND_L:
3266 return SF;
3267 case X86::COND_P:
3268 return PF;
3269 }
3270}
3271
3272#define GET_X86_NF_TRANSFORM_TABLE
3273#define GET_X86_ND2NONND_TABLE
3274#include "X86GenInstrMapping.inc"
3275
3276static unsigned getNewOpcFromTable(ArrayRef<X86TableEntry> Table,
3277 unsigned Opc) {
3278 const auto I = llvm::lower_bound(Table, Opc);
3279 return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
3280}
3281unsigned X86::getNFVariant(unsigned Opc) {
3282#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3283 // Make sure the tables are sorted.
3284 static std::atomic<bool> NFTableChecked(false);
3285 if (!NFTableChecked.load(std::memory_order_relaxed)) {
3286 assert(llvm::is_sorted(X86NFTransformTable) &&
3287 "X86NFTransformTable is not sorted!");
3288 NFTableChecked.store(true, std::memory_order_relaxed);
3289 }
3290#endif
3291 return getNewOpcFromTable(X86NFTransformTable, Opc);
3292}
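// Illustrative example (added for exposition): if the NF transform table maps
// X86::ADD32rr to its flag-suppressing X86::ADD32rr_NF twin, getNFVariant
// returns that opcode; for opcodes with no NF form it returns 0.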
3293
3294unsigned X86::getNonNDVariant(unsigned Opc) {
3295#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3296 // Make sure the tables are sorted.
3297 static std::atomic<bool> NDTableChecked(false);
3298 if (!NDTableChecked.load(std::memory_order_relaxed)) {
3299 assert(llvm::is_sorted(X86ND2NonNDTable) &&
3300 "X86ND2NonNDTableis not sorted!");
3301 NDTableChecked.store(true, std::memory_order_relaxed);
3302 }
3303#endif
3304 return getNewOpcFromTable(X86ND2NonNDTable, Opc);
3305}
3306
3307/// Return the inverse of the specified condition,
3308/// e.g. turning COND_E to COND_NE.
3309X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
3310 switch (CC) {
3311 default:
3312 llvm_unreachable("Illegal condition code!");
3313 case X86::COND_E:
3314 return X86::COND_NE;
3315 case X86::COND_NE:
3316 return X86::COND_E;
3317 case X86::COND_L:
3318 return X86::COND_GE;
3319 case X86::COND_LE:
3320 return X86::COND_G;
3321 case X86::COND_G:
3322 return X86::COND_LE;
3323 case X86::COND_GE:
3324 return X86::COND_L;
3325 case X86::COND_B:
3326 return X86::COND_AE;
3327 case X86::COND_BE:
3328 return X86::COND_A;
3329 case X86::COND_A:
3330 return X86::COND_BE;
3331 case X86::COND_AE:
3332 return X86::COND_B;
3333 case X86::COND_S:
3334 return X86::COND_NS;
3335 case X86::COND_NS:
3336 return X86::COND_S;
3337 case X86::COND_P:
3338 return X86::COND_NP;
3339 case X86::COND_NP:
3340 return X86::COND_P;
3341 case X86::COND_O:
3342 return X86::COND_NO;
3343 case X86::COND_NO:
3344 return X86::COND_O;
3345 case X86::COND_NE_OR_P:
3346 return X86::COND_E_AND_NP;
3347 case X86::COND_E_AND_NP:
3348 return X86::COND_NE_OR_P;
3349 }
3350}
3351
3352/// Assuming the flags are set by MI(a,b), return the condition code if we
3353/// modify the instructions such that flags are set by MI(b,a).
3354X86::CondCode X86::getSwappedCondition(X86::CondCode CC) {
3355 switch (CC) {
3356 default:
3357 return X86::COND_INVALID;
3358 case X86::COND_E:
3359 return X86::COND_E;
3360 case X86::COND_NE:
3361 return X86::COND_NE;
3362 case X86::COND_L:
3363 return X86::COND_G;
3364 case X86::COND_LE:
3365 return X86::COND_GE;
3366 case X86::COND_G:
3367 return X86::COND_L;
3368 case X86::COND_GE:
3369 return X86::COND_LE;
3370 case X86::COND_B:
3371 return X86::COND_A;
3372 case X86::COND_BE:
3373 return X86::COND_AE;
3374 case X86::COND_A:
3375 return X86::COND_B;
3376 case X86::COND_AE:
3377 return X86::COND_BE;
3378 }
3379}
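// Illustrative example (added for exposition): getSwappedCondition(COND_L)
// yields COND_G, since "cmp a, b; jl" and "cmp b, a; jg" test the same
// relation.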
3380
3381std::pair<X86::CondCode, bool>
3382X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
3383 X86::CondCode CC = X86::COND_INVALID;
3384 bool NeedSwap = false;
3385 switch (Predicate) {
3386 default:
3387 break;
3388 // Floating-point Predicates
3389 case CmpInst::FCMP_UEQ:
3390 CC = X86::COND_E;
3391 break;
3392 case CmpInst::FCMP_OLT:
3393 NeedSwap = true;
3394 [[fallthrough]];
3395 case CmpInst::FCMP_OGT:
3396 CC = X86::COND_A;
3397 break;
3398 case CmpInst::FCMP_OLE:
3399 NeedSwap = true;
3400 [[fallthrough]];
3401 case CmpInst::FCMP_OGE:
3402 CC = X86::COND_AE;
3403 break;
3404 case CmpInst::FCMP_UGT:
3405 NeedSwap = true;
3406 [[fallthrough]];
3407 case CmpInst::FCMP_ULT:
3408 CC = X86::COND_B;
3409 break;
3410 case CmpInst::FCMP_UGE:
3411 NeedSwap = true;
3412 [[fallthrough]];
3413 case CmpInst::FCMP_ULE:
3414 CC = X86::COND_BE;
3415 break;
3416 case CmpInst::FCMP_ONE:
3417 CC = X86::COND_NE;
3418 break;
3419 case CmpInst::FCMP_UNO:
3420 CC = X86::COND_P;
3421 break;
3422 case CmpInst::FCMP_ORD:
3423 CC = X86::COND_NP;
3424 break;
3425 case CmpInst::FCMP_OEQ:
3426 [[fallthrough]];
3427 case CmpInst::FCMP_UNE:
3428 CC = X86::COND_INVALID;
3429 break;
3430
3431 // Integer Predicates
3432 case CmpInst::ICMP_EQ:
3433 CC = X86::COND_E;
3434 break;
3435 case CmpInst::ICMP_NE:
3436 CC = X86::COND_NE;
3437 break;
3438 case CmpInst::ICMP_UGT:
3439 CC = X86::COND_A;
3440 break;
3441 case CmpInst::ICMP_UGE:
3442 CC = X86::COND_AE;
3443 break;
3444 case CmpInst::ICMP_ULT:
3445 CC = X86::COND_B;
3446 break;
3447 case CmpInst::ICMP_ULE:
3448 CC = X86::COND_BE;
3449 break;
3450 case CmpInst::ICMP_SGT:
3451 CC = X86::COND_G;
3452 break;
3453 case CmpInst::ICMP_SGE:
3454 CC = X86::COND_GE;
3455 break;
3456 case CmpInst::ICMP_SLT:
3457 CC = X86::COND_L;
3458 break;
3459 case CmpInst::ICMP_SLE:
3460 CC = X86::COND_LE;
3461 break;
3462 }
3463
3464 return std::make_pair(CC, NeedSwap);
3465}
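// Illustrative example (added for exposition): FCMP_OLT maps to (COND_A,
// NeedSwap = true): the caller swaps the comparison operands and then tests
// "above", which matches an ordered less-than on the original operand order.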
3466
3467/// Return a cmov opcode for the given register size in bytes, and operand type.
3468unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
3469 bool HasNDD) {
3470 switch (RegBytes) {
3471 default:
3472 llvm_unreachable("Illegal register size!");
3473#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
3474 case 2:
3475 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
3476 : GET_ND_IF_ENABLED(X86::CMOV16rr);
3477 case 4:
3478 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
3479 : GET_ND_IF_ENABLED(X86::CMOV32rr);
3480 case 8:
3481 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
3482 : GET_ND_IF_ENABLED(X86::CMOV64rr);
3483 }
3484}
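// Illustrative usage (added for exposition):
//   unsigned Opc = X86::getCMovOpcode(/*RegBytes=*/4, /*HasMemoryOperand=*/false,
//                                     /*HasNDD=*/false); // X86::CMOV32rr per the switch above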
3485
3486/// Get the VPCMP immediate for the given condition.
3487unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
3488 switch (CC) {
3489 default:
3490 llvm_unreachable("Unexpected SETCC condition");
3491 case ISD::SETNE:
3492 return 4;
3493 case ISD::SETEQ:
3494 return 0;
3495 case ISD::SETULT:
3496 case ISD::SETLT:
3497 return 1;
3498 case ISD::SETUGT:
3499 case ISD::SETGT:
3500 return 6;
3501 case ISD::SETUGE:
3502 case ISD::SETGE:
3503 return 5;
3504 case ISD::SETULE:
3505 case ISD::SETLE:
3506 return 2;
3507 }
3508}
3509
3510/// Get the VPCMP immediate if the operands are swapped.
3511unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
3512 switch (Imm) {
3513 default:
3514 llvm_unreachable("Unreachable!");
3515 case 0x01:
3516 Imm = 0x06;
3517 break; // LT -> NLE
3518 case 0x02:
3519 Imm = 0x05;
3520 break; // LE -> NLT
3521 case 0x05:
3522 Imm = 0x02;
3523 break; // NLT -> LE
3524 case 0x06:
3525 Imm = 0x01;
3526 break; // NLE -> LT
3527 case 0x00: // EQ
3528 case 0x03: // FALSE
3529 case 0x04: // NE
3530 case 0x07: // TRUE
3531 break;
3532 }
3533
3534 return Imm;
3535}
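// Illustrative example (added for exposition): swapping the source operands of
// a VPCMPD with immediate 0x01 (LT) requires immediate 0x06 (NLE) to preserve
// the result, so getSwappedVPCMPImm(0x01) returns 0x06.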
3536
3537/// Get the VPCOM immediate if the operands are swapped.
3538unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
3539 switch (Imm) {
3540 default:
3541 llvm_unreachable("Unreachable!");
3542 case 0x00:
3543 Imm = 0x02;
3544 break; // LT -> GT
3545 case 0x01:
3546 Imm = 0x03;
3547 break; // LE -> GE
3548 case 0x02:
3549 Imm = 0x00;
3550 break; // GT -> LT
3551 case 0x03:
3552 Imm = 0x01;
3553 break; // GE -> LE
3554 case 0x04: // EQ
3555 case 0x05: // NE
3556 case 0x06: // FALSE
3557 case 0x07: // TRUE
3558 break;
3559 }
3560
3561 return Imm;
3562}
3563
3564/// Get the VCMP immediate if the operands are swapped.
3565unsigned X86::getSwappedVCMPImm(unsigned Imm) {
3566 // Only need the lower 2 bits to distinguish.
3567 switch (Imm & 0x3) {
3568 default:
3569 llvm_unreachable("Unreachable!");
3570 case 0x00:
3571 case 0x03:
3572 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
3573 break;
3574 case 0x01:
3575 case 0x02:
3576 // Need to toggle bits 3:0. Bit 4 stays the same.
3577 Imm ^= 0xf;
3578 break;
3579 }
3580
3581 return Imm;
3582}
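// Illustrative example (added for exposition): getSwappedVCMPImm(0x01) returns
// 0x0E, i.e. predicate LT_OS on (a, b) becomes GT_OS on (b, a); EQ/NE,
// ORD/UNORD and TRUE/FALSE predicates are unchanged when commuted.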
3583
3584unsigned X86::getVectorRegisterWidth(const MCOperandInfo &Info) {
3585 if (Info.RegClass == X86::VR128RegClassID ||
3586 Info.RegClass == X86::VR128XRegClassID)
3587 return 128;
3588 if (Info.RegClass == X86::VR256RegClassID ||
3589 Info.RegClass == X86::VR256XRegClassID)
3590 return 256;
3591 if (Info.RegClass == X86::VR512RegClassID)
3592 return 512;
3593 llvm_unreachable("Unknown register class!");
3594}
3595
3596 /// Return true if Reg is an X87 register.
3597static bool isX87Reg(Register Reg) {
3598 return (Reg == X86::FPCW || Reg == X86::FPSW ||
3599 (Reg >= X86::ST0 && Reg <= X86::ST7));
3600}
3601
3602/// Check if the instruction is an X87 instruction.
3603bool X86::isX87Instruction(MachineInstr &MI) {
3604 // Calls and inline asm may define X87 registers, so we special-case them
3605 // here; otherwise they would be incorrectly flagged as X87 instructions.
3607 if (MI.isCall() || MI.isInlineAsm())
3608 return false;
3609 for (const MachineOperand &MO : MI.operands()) {
3610 if (!MO.isReg())
3611 continue;
3612 if (isX87Reg(MO.getReg()))
3613 return true;
3614 }
3615 return false;
3616}
3617
3618int X86::getFirstAddrOperandIdx(const MachineInstr &MI) {
3619 auto IsMemOp = [](const MCOperandInfo &OpInfo) {
3620 return OpInfo.OperandType == MCOI::OPERAND_MEMORY;
3621 };
3622
3623 const MCInstrDesc &Desc = MI.getDesc();
3624
3625 // Directly invoke the MC-layer routine for real (i.e., non-pseudo)
3626 // instructions (fast case).
3627 if (!X86II::isPseudo(Desc.TSFlags)) {
3628 int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
3629 if (MemRefIdx >= 0)
3630 return MemRefIdx + X86II::getOperandBias(Desc);
3631#ifdef EXPENSIVE_CHECKS
3632 assert(none_of(Desc.operands(), IsMemOp) &&
3633 "Got false negative from X86II::getMemoryOperandNo()!");
3634#endif
3635 return -1;
3636 }
3637
3638 // Otherwise, handle pseudo instructions by examining the type of their
3639 // operands (slow case). An instruction cannot have a memory reference if it
3640 // has fewer than AddrNumOperands (= 5) explicit operands.
3641 unsigned NumOps = Desc.getNumOperands();
3642 if (NumOps < X86::AddrNumOperands) {
3643#ifdef EXPENSIVE_CHECKS
3644 assert(none_of(Desc.operands(), IsMemOp) &&
3645 "Expected no operands to have OPERAND_MEMORY type!");
3646#endif
3647 return -1;
3648 }
3649
3650 // The first operand with type OPERAND_MEMORY indicates the start of a memory
3651 // reference. We expect the following AddrNumOperands-1 operands to also have
3652 // OPERAND_MEMORY type.
3653 for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) {
3654 if (IsMemOp(Desc.operands()[I])) {
3655#ifdef EXPENSIVE_CHECKS
3656 assert(std::all_of(Desc.operands().begin() + I,
3657 Desc.operands().begin() + I + X86::AddrNumOperands,
3658 IsMemOp) &&
3659 "Expected all five operands in the memory reference to have "
3660 "OPERAND_MEMORY type!");
3661#endif
3662 return I;
3663 }
3664 }
3665
3666 return -1;
3667}
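
// A minimal standalone sketch of the pseudo-instruction scan above, assuming
// only that an x86 memory reference is X86::AddrNumOperands (5) consecutive
// operands: base, scale, index, displacement and segment. The OpKind enum and
// findMemRefSketch below are illustrative names, not LLVM types.
namespace memref_sketch {
enum OpKind { RegOp, ImmOp, MemOp };
constexpr int SketchAddrNumOperands = 5;
template <unsigned N>
constexpr int findMemRefSketch(const OpKind (&Ops)[N]) {
  // Return the index of the first operand of a memory reference, or -1.
  for (int I = 0; I + SketchAddrNumOperands <= int(N); ++I)
    if (Ops[I] == MemOp)
      return I;
  return -1;
}
// e.g. a store "MOV64mr base, scale, index, disp, segment, src".
constexpr OpKind StoreLike[] = {MemOp, MemOp, MemOp, MemOp, MemOp, RegOp};
constexpr OpKind RegRegLike[] = {RegOp, RegOp};
static_assert(findMemRefSketch(StoreLike) == 0, "memory reference at operand 0");
static_assert(findMemRefSketch(RegRegLike) == -1, "no memory reference");
} // namespace memref_sketch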
3668
3670 unsigned OpNo) {
3671 assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) &&
3672 "Unexpected number of operands!");
3673
3674 const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg);
3675 if (!Index.isReg() || Index.getReg() != X86::NoRegister)
3676 return nullptr;
3677
3678 const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp);
3679 if (!Disp.isCPI() || Disp.getOffset() != 0)
3680 return nullptr;
3681
3683 MI.getParent()->getParent()->getConstantPool()->getConstants();
3684 const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()];
3685
3686 // Bail if this is a machine constant pool entry, we won't be able to dig out
3687 // anything useful.
3688 if (ConstantEntry.isMachineConstantPoolEntry())
3689 return nullptr;
3690
3691 return ConstantEntry.Val.ConstVal;
3692}
3693
3695 switch (MI.getOpcode()) {
3696 case X86::TCRETURNdi:
3697 case X86::TCRETURNri:
3698 case X86::TCRETURNmi:
3699 case X86::TCRETURNdi64:
3700 case X86::TCRETURNri64:
3701 case X86::TCRETURNri64_ImpCall:
3702 case X86::TCRETURNmi64:
3703 return true;
3704 default:
3705 return false;
3706 }
3707}
3708
3711 const MachineInstr &TailCall) const {
3712
3713 const MachineFunction *MF = TailCall.getMF();
3714
3715 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3716 // The kernel patches thunk calls at runtime; these should never be conditional.
3717 const MachineOperand &Target = TailCall.getOperand(0);
3718 if (Target.isSymbol()) {
3719 StringRef Symbol(Target.getSymbolName());
3720 // This is currently only relevant to the r11/kernel indirect thunk.
3721 if (Symbol == "__x86_indirect_thunk_r11")
3722 return false;
3723 }
3724 }
3725
3726 if (TailCall.getOpcode() != X86::TCRETURNdi &&
3727 TailCall.getOpcode() != X86::TCRETURNdi64) {
3728 // Only direct calls can be done with a conditional branch.
3729 return false;
3730 }
3731
3732 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3733 // Conditional tail calls confuse the Win64 unwinder.
3734 return false;
3735 }
3736
3737 assert(BranchCond.size() == 1);
3738 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3739 // Can't make a conditional tail call with this condition.
3740 return false;
3741 }
3742
3744 if (X86FI->getTCReturnAddrDelta() != 0 ||
3745 TailCall.getOperand(1).getImm() != 0) {
3746 // A conditional tail call cannot do any stack adjustment.
3747 return false;
3748 }
3749
3750 return true;
3751}
3752
3755 const MachineInstr &TailCall) const {
3756 assert(canMakeTailCallConditional(BranchCond, TailCall));
3757
3759 while (I != MBB.begin()) {
3760 --I;
3761 if (I->isDebugInstr())
3762 continue;
3763 if (!I->isBranch())
3764 assert(0 && "Can't find the branch to replace!");
3765
3767 assert(BranchCond.size() == 1);
3768 if (CC != BranchCond[0].getImm())
3769 continue;
3770
3771 break;
3772 }
3773
3774 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3775 : X86::TCRETURNdi64cc;
3776
3777 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3778 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3779 MIB.addImm(0); // Stack offset (not used).
3780 MIB->addOperand(BranchCond[0]); // Condition.
3781 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3782
3783 // Add implicit uses and defs of all live regs potentially clobbered by the
3784 // call. This way they still appear live across the call.
3786 LiveRegs.addLiveOuts(MBB);
3788 LiveRegs.stepForward(*MIB, Clobbers);
3789 for (const auto &C : Clobbers) {
3790 MIB.addReg(C.first, RegState::Implicit);
3792 }
3793
3794 I->eraseFromParent();
3795}
3796
3797// Given an MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3798// not be a fallthrough MBB now due to layout changes). Return nullptr if the
3799// fallthrough MBB cannot be identified.
3802 // Look for non-EHPad successors other than TBB. If we find exactly one, it
3803 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3804 // and fallthrough MBB. If we find more than one, we cannot identify the
3805 // fallthrough MBB and should return nullptr.
3806 MachineBasicBlock *FallthroughBB = nullptr;
3807 for (MachineBasicBlock *Succ : MBB->successors()) {
3808 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3809 continue;
3810 // Return a nullptr if we found more than one fallthrough successor.
3811 if (FallthroughBB && FallthroughBB != TBB)
3812 return nullptr;
3813 FallthroughBB = Succ;
3814 }
3815 return FallthroughBB;
3816}
3817
3818bool X86InstrInfo::analyzeBranchImpl(
3821 SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3822
3823 // Start from the bottom of the block and work up, examining the
3824 // terminator instructions.
3826 MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3827 while (I != MBB.begin()) {
3828 --I;
3829 if (I->isDebugInstr())
3830 continue;
3831
3832 // Working from the bottom, when we see a non-terminator instruction, we're
3833 // done.
3834 if (!isUnpredicatedTerminator(*I))
3835 break;
3836
3837 // A terminator that isn't a branch can't easily be handled by this
3838 // analysis.
3839 if (!I->isBranch())
3840 return true;
3841
3842 // Handle unconditional branches.
3843 if (I->getOpcode() == X86::JMP_1) {
3844 UnCondBrIter = I;
3845
3846 if (!AllowModify) {
3847 TBB = I->getOperand(0).getMBB();
3848 continue;
3849 }
3850
3851 // If the block has any instructions after a JMP, delete them.
3852 MBB.erase(std::next(I), MBB.end());
3853
3854 Cond.clear();
3855 FBB = nullptr;
3856
3857 // Delete the JMP if it's equivalent to a fall-through.
3858 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3859 TBB = nullptr;
3860 I->eraseFromParent();
3861 I = MBB.end();
3862 UnCondBrIter = MBB.end();
3863 continue;
3864 }
3865
3866 // TBB is used to indicate the unconditional destination.
3867 TBB = I->getOperand(0).getMBB();
3868 continue;
3869 }
3870
3871 // Handle conditional branches.
3872 X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3873 if (BranchCode == X86::COND_INVALID)
3874 return true; // Can't handle indirect branch.
3875
3876 // In practice we should never have an undef EFLAGS operand; if we do,
3877 // abort here as we are not prepared to preserve the flag.
3878 if (I->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->isUndef())
3879 return true;
3880
3881 // Working from the bottom, handle the first conditional branch.
3882 if (Cond.empty()) {
3883 FBB = TBB;
3884 TBB = I->getOperand(0).getMBB();
3886 CondBranches.push_back(&*I);
3887 continue;
3888 }
3889
3890 // Handle subsequent conditional branches. Only handle the case where all
3891 // conditional branches branch to the same destination and their condition
3892 // opcodes fit one of the special multi-branch idioms.
3893 assert(Cond.size() == 1);
3894 assert(TBB);
3895
3896 // If the conditions are the same, we can leave them alone.
3897 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3898 auto NewTBB = I->getOperand(0).getMBB();
3899 if (OldBranchCode == BranchCode && TBB == NewTBB)
3900 continue;
3901
3902 // If they differ, see if they fit one of the known patterns. Theoretically,
3903 // we could handle more patterns here, but we shouldn't expect to see them
3904 // if instruction selection has done a reasonable job.
3905 if (TBB == NewTBB &&
3906 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3907 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3908 BranchCode = X86::COND_NE_OR_P;
3909 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3910 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3911 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3912 return true;
3913
3914 // X86::COND_E_AND_NP usually has two different branch destinations.
3915 //
3916 // JP B1
3917 // JE B2
3918 // JMP B1
3919 // B1:
3920 // B2:
3921 //
3922 // Here this condition branches to B2 only if NP && E. It has another
3923 // equivalent form:
3924 //
3925 // JNE B1
3926 // JNP B2
3927 // JMP B1
3928 // B1:
3929 // B2:
3930 //
3931 // Similarly, it branches to B2 only if E && NP. That is why this condition
3932 // is named COND_E_AND_NP.
3933 BranchCode = X86::COND_E_AND_NP;
3934 } else
3935 return true;
3936
3937 // Update the MachineOperand.
3938 Cond[0].setImm(BranchCode);
3939 CondBranches.push_back(&*I);
3940 }
3941
3942 return false;
3943}
3944
3947 MachineBasicBlock *&FBB,
3949 bool AllowModify) const {
3950 SmallVector<MachineInstr *, 4> CondBranches;
3951 return analyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3952}
3953
3955 const MCInstrDesc &Desc = MI.getDesc();
3956 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3957 assert(MemRefBegin >= 0 && "instr should have memory operand");
3958 MemRefBegin += X86II::getOperandBias(Desc);
3959
3960 const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp);
3961 if (!MO.isJTI())
3962 return -1;
3963
3964 return MO.getIndex();
3965}
3966
3968 Register Reg) {
3969 if (!Reg.isVirtual())
3970 return -1;
3971 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
3972 if (MI == nullptr)
3973 return -1;
3974 unsigned Opcode = MI->getOpcode();
3975 if (Opcode != X86::LEA64r && Opcode != X86::LEA32r)
3976 return -1;
3978}
3979
3981 unsigned Opcode = MI.getOpcode();
3982 // Switch-jump pattern for non-PIC code looks like:
3983 // JMP64m $noreg, 8, %X, %jump-table.X, $noreg
3984 if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) {
3986 }
3987 // The pattern for PIC code looks like:
3988 // %0 = LEA64r $rip, 1, $noreg, %jump-table.X
3989 // %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg
3990 // %2 = ADD64rr %1, %0
3991 // JMP64r %2
3992 if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) {
3993 Register Reg = MI.getOperand(0).getReg();
3994 if (!Reg.isVirtual())
3995 return -1;
3996 const MachineFunction &MF = *MI.getParent()->getParent();
3997 const MachineRegisterInfo &MRI = MF.getRegInfo();
3998 MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
3999 if (Add == nullptr)
4000 return -1;
4001 if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr)
4002 return -1;
4003 int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg());
4004 if (JTI1 >= 0)
4005 return JTI1;
4006 int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg());
4007 if (JTI2 >= 0)
4008 return JTI2;
4009 }
4010 return -1;
4011}
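
// A minimal standalone sketch of what the PIC pattern above computes, assuming
// the usual x86-64 PIC layout where jump-table entries are 32-bit offsets
// relative to the table itself; the names below are illustrative, not LLVM APIs.
namespace picjt_sketch {
inline const void *dispatchTarget(const int *Table, unsigned Index) {
  const char *Base = reinterpret_cast<const char *>(Table); // LEA64r $rip, ..., %jump-table.X
  long long Entry = Table[Index];                           // MOVSX64rm32 (sign-extended entry)
  return Base + Entry;                                      // ADD64rr, then JMP64r
}
} // namespace picjt_sketch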
4012
4014 MachineBranchPredicate &MBP,
4015 bool AllowModify) const {
4016 using namespace std::placeholders;
4017
4019 SmallVector<MachineInstr *, 4> CondBranches;
4020 if (analyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
4021 AllowModify))
4022 return true;
4023
4024 if (Cond.size() != 1)
4025 return true;
4026
4027 assert(MBP.TrueDest && "expected!");
4028
4029 if (!MBP.FalseDest)
4030 MBP.FalseDest = MBB.getNextNode();
4031
4033
4034 MachineInstr *ConditionDef = nullptr;
4035 bool SingleUseCondition = true;
4036
4038 if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
4039 ConditionDef = &MI;
4040 break;
4041 }
4042
4043 if (MI.readsRegister(X86::EFLAGS, TRI))
4044 SingleUseCondition = false;
4045 }
4046
4047 if (!ConditionDef)
4048 return true;
4049
4050 if (SingleUseCondition) {
4051 for (auto *Succ : MBB.successors())
4052 if (Succ->isLiveIn(X86::EFLAGS))
4053 SingleUseCondition = false;
4054 }
4055
4056 MBP.ConditionDef = ConditionDef;
4057 MBP.SingleUseCondition = SingleUseCondition;
4058
4059 // Currently we only recognize the simple pattern:
4060 //
4061 // test %reg, %reg
4062 // je %label
4063 //
4064 const unsigned TestOpcode =
4065 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
4066
4067 if (ConditionDef->getOpcode() == TestOpcode &&
4068 ConditionDef->getNumOperands() == 3 &&
4069 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
4070 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
4071 MBP.LHS = ConditionDef->getOperand(0);
4072 MBP.RHS = MachineOperand::CreateImm(0);
4073 MBP.Predicate = Cond[0].getImm() == X86::COND_NE
4074 ? MachineBranchPredicate::PRED_NE
4075 : MachineBranchPredicate::PRED_EQ;
4076 return false;
4077 }
4078
4079 return true;
4080}
4081
4083 int *BytesRemoved) const {
4084 assert(!BytesRemoved && "code size not handled");
4085
4087 unsigned Count = 0;
4088
4089 while (I != MBB.begin()) {
4090 --I;
4091 if (I->isDebugInstr())
4092 continue;
4093 if (I->getOpcode() != X86::JMP_1 &&
4095 break;
4096 // Remove the branch.
4097 I->eraseFromParent();
4098 I = MBB.end();
4099 ++Count;
4100 }
4101
4102 return Count;
4103}
4104
4107 MachineBasicBlock *FBB,
4109 const DebugLoc &DL, int *BytesAdded) const {
4110 // Shouldn't be a fall through.
4111 assert(TBB && "insertBranch must not be told to insert a fallthrough");
4112 assert((Cond.size() == 1 || Cond.size() == 0) &&
4113 "X86 branch conditions have one component!");
4114 assert(!BytesAdded && "code size not handled");
4115
4116 if (Cond.empty()) {
4117 // Unconditional branch?
4118 assert(!FBB && "Unconditional branch with multiple successors!");
4119 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
4120 return 1;
4121 }
4122
4123 // If FBB is null, it is implied to be a fall-through block.
4124 bool FallThru = FBB == nullptr;
4125
4126 // Conditional branch.
4127 unsigned Count = 0;
4129 switch (CC) {
4130 case X86::COND_NE_OR_P:
4131 // Synthesize NE_OR_P with two branches.
4132 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
4133 ++Count;
4134 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
4135 ++Count;
4136 break;
4137 case X86::COND_E_AND_NP:
4138 // Use the next block of MBB as FBB if it is null.
4139 if (FBB == nullptr) {
4140 FBB = getFallThroughMBB(&MBB, TBB);
4141 assert(FBB && "MBB cannot be the last block in function when the false "
4142 "body is a fall-through.");
4143 }
4144 // Synthesize COND_E_AND_NP with two branches.
4145 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
4146 ++Count;
4147 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
4148 ++Count;
4149 break;
4150 default: {
4151 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
4152 ++Count;
4153 }
4154 }
4155 if (!FallThru) {
4156 // Two-way Conditional branch. Insert the second branch.
4157 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
4158 ++Count;
4159 }
4160 return Count;
4161}
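
// A minimal standalone sketch of why the two-branch expansions above exist:
// after a UCOMISS/UCOMISD, "unordered" (NaN) lands in PF, so IEEE equality is
// ZF==1 && PF==0 (COND_E_AND_NP) and inequality is ZF==0 || PF==1
// (COND_NE_OR_P). The helpers below are illustrative only.
namespace fpbranch_sketch {
inline bool orderedEqual(float A, float B) { return A == B; }         // needs COND_E_AND_NP
inline bool unorderedNotEqual(float A, float B) { return !(A == B); } // needs COND_NE_OR_P
} // namespace fpbranch_sketch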
4162
4165 Register DstReg, Register TrueReg,
4166 Register FalseReg, int &CondCycles,
4167 int &TrueCycles, int &FalseCycles) const {
4168 // Not all subtargets have cmov instructions.
4169 if (!Subtarget.canUseCMOV())
4170 return false;
4171 if (Cond.size() != 1)
4172 return false;
4173 // We cannot do the composite conditions, at least not in SSA form.
4175 return false;
4176
4177 // Check register classes.
4178 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4179 const TargetRegisterClass *RC =
4180 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
4181 if (!RC)
4182 return false;
4183
4184 // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
4185 if (X86::GR16RegClass.hasSubClassEq(RC) ||
4186 X86::GR32RegClass.hasSubClassEq(RC) ||
4187 X86::GR64RegClass.hasSubClassEq(RC)) {
4188 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
4189 // Bridge. Probably Ivy Bridge as well.
4190 CondCycles = 2;
4191 TrueCycles = 2;
4192 FalseCycles = 2;
4193 return true;
4194 }
4195
4196 // Can't do vectors.
4197 return false;
4198}
4199
4202 const DebugLoc &DL, Register DstReg,
4204 Register FalseReg) const {
4205 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4206 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
4207 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
4208 assert(Cond.size() == 1 && "Invalid Cond array");
4209 unsigned Opc =
4210 X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
4211 false /*HasMemoryOperand*/, Subtarget.hasNDD());
4212 BuildMI(MBB, I, DL, get(Opc), DstReg)
4213 .addReg(FalseReg)
4214 .addReg(TrueReg)
4215 .addImm(Cond[0].getImm());
4216}
4217
4218/// Test if the given register is a physical h register.
4219static bool isHReg(Register Reg) {
4220 return X86::GR8_ABCD_HRegClass.contains(Reg);
4221}
4222
4223// Try and copy between VR128/VR64 and GR64 registers.
4224static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg,
4225 const X86Subtarget &Subtarget) {
4226 bool HasAVX = Subtarget.hasAVX();
4227 bool HasAVX512 = Subtarget.hasAVX512();
4228 bool HasEGPR = Subtarget.hasEGPR();
4229
4230 // SrcReg(MaskReg) -> DestReg(GR64)
4231 // SrcReg(MaskReg) -> DestReg(GR32)
4232
4233 // All KMASK register classes hold the same k registers, so any one of
4234 // them can be tested against.
4235 if (X86::VK16RegClass.contains(SrcReg)) {
4236 if (X86::GR64RegClass.contains(DestReg)) {
4237 assert(Subtarget.hasBWI());
4238 return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk;
4239 }
4240 if (X86::GR32RegClass.contains(DestReg))
4241 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk)
4242 : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk);
4243 }
4244
4245 // SrcReg(GR64) -> DestReg(MaskReg)
4246 // SrcReg(GR32) -> DestReg(MaskReg)
4247
4248 // All KMASK register classes hold the same k registers, so any one of
4249 // them can be tested against.
4250 if (X86::VK16RegClass.contains(DestReg)) {
4251 if (X86::GR64RegClass.contains(SrcReg)) {
4252 assert(Subtarget.hasBWI());
4253 return HasEGPR ? X86::KMOVQkr_EVEX : X86::KMOVQkr;
4254 }
4255 if (X86::GR32RegClass.contains(SrcReg))
4256 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4257 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4258 }
4259
4260 // SrcReg(VR128) -> DestReg(GR64)
4261 // SrcReg(VR64) -> DestReg(GR64)
4262 // SrcReg(GR64) -> DestReg(VR128)
4263 // SrcReg(GR64) -> DestReg(VR64)
4264
4265 if (X86::GR64RegClass.contains(DestReg)) {
4266 if (X86::VR128XRegClass.contains(SrcReg))
4267 // Copy from a VR128 register to a GR64 register.
4268 return HasAVX512 ? X86::VMOVPQIto64Zrr
4269 : HasAVX ? X86::VMOVPQIto64rr
4270 : X86::MOVPQIto64rr;
4271 if (X86::VR64RegClass.contains(SrcReg))
4272 // Copy from a VR64 register to a GR64 register.
4273 return X86::MMX_MOVD64from64rr;
4274 } else if (X86::GR64RegClass.contains(SrcReg)) {
4275 // Copy from a GR64 register to a VR128 register.
4276 if (X86::VR128XRegClass.contains(DestReg))
4277 return HasAVX512 ? X86::VMOV64toPQIZrr
4278 : HasAVX ? X86::VMOV64toPQIrr
4279 : X86::MOV64toPQIrr;
4280 // Copy from a GR64 register to a VR64 register.
4281 if (X86::VR64RegClass.contains(DestReg))
4282 return X86::MMX_MOVD64to64rr;
4283 }
4284
4285 // SrcReg(VR128) -> DestReg(GR32)
4286 // SrcReg(GR32) -> DestReg(VR128)
4287
4288 if (X86::GR32RegClass.contains(DestReg) &&
4289 X86::VR128XRegClass.contains(SrcReg))
4290 // Copy from a VR128 register to a GR32 register.
4291 return HasAVX512 ? X86::VMOVPDI2DIZrr
4292 : HasAVX ? X86::VMOVPDI2DIrr
4293 : X86::MOVPDI2DIrr;
4294
4295 if (X86::VR128XRegClass.contains(DestReg) &&
4296 X86::GR32RegClass.contains(SrcReg))
4297 // Copy from a GR32 register to a VR128 register.
4298 return HasAVX512 ? X86::VMOVDI2PDIZrr
4299 : HasAVX ? X86::VMOVDI2PDIrr
4300 : X86::MOVDI2PDIrr;
4301 return 0;
4302}
4303
4306 const DebugLoc &DL, Register DestReg,
4307 Register SrcReg, bool KillSrc,
4308 bool RenamableDest, bool RenamableSrc) const {
4309 // First deal with the normal symmetric copies.
4310 bool HasAVX = Subtarget.hasAVX();
4311 bool HasVLX = Subtarget.hasVLX();
4312 bool HasEGPR = Subtarget.hasEGPR();
4313 unsigned Opc = 0;
4314 if (X86::GR64RegClass.contains(DestReg, SrcReg))
4315 Opc = X86::MOV64rr;
4316 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4317 Opc = X86::MOV32rr;
4318 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4319 Opc = X86::MOV16rr;
4320 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4321 // Copying to or from a physical H register on x86-64 requires a NOREX
4322 // move. Otherwise use a normal move.
4323 if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4324 Opc = X86::MOV8rr_NOREX;
4325 // Both operands must be encodable without a REX prefix.
4326 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4327 "8-bit H register can not be copied outside GR8_NOREX");
4328 } else
4329 Opc = X86::MOV8rr;
4330 } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4331 Opc = X86::MMX_MOVQ64rr;
4332 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4333 if (HasVLX)
4334 Opc = X86::VMOVAPSZ128rr;
4335 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4336 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4337 else {
4338 // If this is an extended register and we don't have VLX we need to use a
4339 // 512-bit move.
4340 Opc = X86::VMOVAPSZrr;
4342 DestReg =
4343 TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
4344 SrcReg =
4345 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4346 }
4347 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
4348 if (HasVLX)
4349 Opc = X86::VMOVAPSZ256rr;
4350 else if (X86::VR256RegClass.contains(DestReg, SrcReg))
4351 Opc = X86::VMOVAPSYrr;
4352 else {
4353 // If this is an extended register and we don't have VLX we need to use a
4354 // 512-bit move.
4355 Opc = X86::VMOVAPSZrr;
4357 DestReg =
4358 TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
4359 SrcReg =
4360 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4361 }
4362 } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
4363 Opc = X86::VMOVAPSZrr;
4364 // All KMASK register classes hold the same k registers, so any one of
4365 // them can be tested against.
4366 else if (X86::VK16RegClass.contains(DestReg, SrcReg))
4367 Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
4368 : (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVWkk);
4369 if (!Opc)
4370 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
4371
4372 if (Opc) {
4373 BuildMI(MBB, MI, DL, get(Opc), DestReg)
4374 .addReg(SrcReg, getKillRegState(KillSrc));
4375 return;
4376 }
4377
4378 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
4379 // FIXME: We use a fatal error here because historically LLVM has tried to
4380 // lower some of these physreg copies and we want to ensure we get
4381 // reasonable bug reports if someone encounters a case no other testing
4382 // found. This path should be removed after the LLVM 7 release.
4383 report_fatal_error("Unable to copy EFLAGS physical register!");
4384 }
4385
4386 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
4387 << RI.getName(DestReg) << '\n');
4388 report_fatal_error("Cannot emit physreg copy instruction");
4389}
4390
4391std::optional<DestSourcePair>
4393 if (MI.isMoveReg()) {
4394 // FIXME: Dirty hack for apparent invariant that doesn't hold when
4395 // subreg_to_reg is coalesced with ordinary copies, such that the bits that
4396 // were asserted as 0 are now undef.
4397 if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg())
4398 return std::nullopt;
4399
4400 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
4401 }
4402 return std::nullopt;
4403}
4404
4405static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
4406 if (STI.hasFP16())
4407 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
4408 if (Load)
4409 return X86::MOVSHPrm;
4410 return X86::MOVSHPmr;
4411}
4412
4414 const TargetRegisterClass *RC,
4415 bool IsStackAligned,
4416 const X86Subtarget &STI, bool Load) {
4417 bool HasAVX = STI.hasAVX();
4418 bool HasAVX512 = STI.hasAVX512();
4419 bool HasVLX = STI.hasVLX();
4420 bool HasEGPR = STI.hasEGPR();
4421
4422 assert(RC != nullptr && "Invalid target register class");
4423 switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
4424 default:
4425 llvm_unreachable("Unknown spill size");
4426 case 1:
4427 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
4428 if (STI.is64Bit())
4429 // Copying to or from a physical H register on x86-64 requires a NOREX
4430 // move. Otherwise use a normal move.
4431 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
4432 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
4433 return Load ? X86::MOV8rm : X86::MOV8mr;
4434 case 2:
4435 if (X86::VK16RegClass.hasSubClassEq(RC))
4436 return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm)
4437 : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
4438 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
4439 return Load ? X86::MOV16rm : X86::MOV16mr;
4440 case 4:
4441 if (X86::GR32RegClass.hasSubClassEq(RC))
4442 return Load ? X86::MOV32rm : X86::MOV32mr;
4443 if (X86::FR32XRegClass.hasSubClassEq(RC))
4444 return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
4445 : HasAVX ? X86::VMOVSSrm_alt
4446 : X86::MOVSSrm_alt)
4447 : (HasAVX512 ? X86::VMOVSSZmr
4448 : HasAVX ? X86::VMOVSSmr
4449 : X86::MOVSSmr);
4450 if (X86::RFP32RegClass.hasSubClassEq(RC))
4451 return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
4452 if (X86::VK32RegClass.hasSubClassEq(RC)) {
4453 assert(STI.hasBWI() && "KMOVD requires BWI");
4454 return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm)
4455 : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
4456 }
4457 // All of these mask pair classes have the same spill size, so the same kind
4458 // of kmov instructions can be used with all of them.
4459 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
4460 X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
4461 X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
4462 X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
4463 X86::VK16PAIRRegClass.hasSubClassEq(RC))
4464 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
4465 if (X86::FR16RegClass.hasSubClassEq(RC) ||
4466 X86::FR16XRegClass.hasSubClassEq(RC))
4467 return getLoadStoreOpcodeForFP16(Load, STI);
4468 llvm_unreachable("Unknown 4-byte regclass");
4469 case 8:
4470 if (X86::GR64RegClass.hasSubClassEq(RC))
4471 return Load ? X86::MOV64rm : X86::MOV64mr;
4472 if (X86::FR64XRegClass.hasSubClassEq(RC))
4473 return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
4474 : HasAVX ? X86::VMOVSDrm_alt
4475 : X86::MOVSDrm_alt)
4476 : (HasAVX512 ? X86::VMOVSDZmr
4477 : HasAVX ? X86::VMOVSDmr
4478 : X86::MOVSDmr);
4479 if (X86::VR64RegClass.hasSubClassEq(RC))
4480 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
4481 if (X86::RFP64RegClass.hasSubClassEq(RC))
4482 return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
4483 if (X86::VK64RegClass.hasSubClassEq(RC)) {
4484 assert(STI.hasBWI() && "KMOVQ requires BWI");
4485 return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm)
4486 : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
4487 }
4488 llvm_unreachable("Unknown 8-byte regclass");
4489 case 10:
4490 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
4491 return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
4492 case 16: {
4493 if (X86::VR128XRegClass.hasSubClassEq(RC)) {
4494 // If the stack is realigned we can use aligned stores.
4495 if (IsStackAligned)
4496 return Load ? (HasVLX ? X86::VMOVAPSZ128rm
4497 : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
4498 : HasAVX ? X86::VMOVAPSrm
4499 : X86::MOVAPSrm)
4500 : (HasVLX ? X86::VMOVAPSZ128mr
4501 : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
4502 : HasAVX ? X86::VMOVAPSmr
4503 : X86::MOVAPSmr);
4504 else
4505 return Load ? (HasVLX ? X86::VMOVUPSZ128rm
4506 : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
4507 : HasAVX ? X86::VMOVUPSrm
4508 : X86::MOVUPSrm)
4509 : (HasVLX ? X86::VMOVUPSZ128mr
4510 : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
4511 : HasAVX ? X86::VMOVUPSmr
4512 : X86::MOVUPSmr);
4513 }
4514 llvm_unreachable("Unknown 16-byte regclass");
4515 }
4516 case 32:
4517 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
4518 // If the stack is realigned we can use aligned stores.
4519 if (IsStackAligned)
4520 return Load ? (HasVLX ? X86::VMOVAPSZ256rm
4521 : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
4522 : X86::VMOVAPSYrm)
4523 : (HasVLX ? X86::VMOVAPSZ256mr
4524 : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
4525 : X86::VMOVAPSYmr);
4526 else
4527 return Load ? (HasVLX ? X86::VMOVUPSZ256rm
4528 : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
4529 : X86::VMOVUPSYrm)
4530 : (HasVLX ? X86::VMOVUPSZ256mr
4531 : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
4532 : X86::VMOVUPSYmr);
4533 case 64:
4534 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
4535 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
4536 if (IsStackAligned)
4537 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
4538 else
4539 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
4540 case 1024:
4541 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
4542 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
4543#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
4544 return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
4545 : GET_EGPR_IF_ENABLED(X86::TILESTORED);
4546#undef GET_EGPR_IF_ENABLED
4547 case 2048:
4548 assert(X86::TILEPAIRRegClass.hasSubClassEq(RC) &&
4549 "Unknown 2048-byte regclass");
4550 assert(STI.hasAMXTILE() && "Using 2048-bit register requires AMX-TILE");
4551 return Load ? X86::PTILEPAIRLOAD : X86::PTILEPAIRSTORE;
4552 }
4553}
4554
4555std::optional<ExtAddrMode>
4557 const TargetRegisterInfo *TRI) const {
4558 const MCInstrDesc &Desc = MemI.getDesc();
4559 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4560 if (MemRefBegin < 0)
4561 return std::nullopt;
4562
4563 MemRefBegin += X86II::getOperandBias(Desc);
4564
4565 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
4566 if (!BaseOp.isReg()) // Can be an MO_FrameIndex
4567 return std::nullopt;
4568
4569 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
4570 // Displacement can be symbolic
4571 if (!DispMO.isImm())
4572 return std::nullopt;
4573
4574 ExtAddrMode AM;
4575 AM.BaseReg = BaseOp.getReg();
4576 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
4577 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
4578 AM.Displacement = DispMO.getImm();
4579 return AM;
4580}
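
// A minimal standalone sketch of the address form an ExtAddrMode decoded above
// describes: BaseReg + Scale * ScaledReg + Displacement. The toy register file
// and struct below are illustrative, not LLVM types.
namespace addrmode_sketch {
struct ToyAddrMode {
  int BaseReg, ScaledReg;
  long long Scale, Displacement;
};
constexpr long long effectiveAddress(ToyAddrMode AM, const long long *Regs) {
  return Regs[AM.BaseReg] + AM.Scale * Regs[AM.ScaledReg] + AM.Displacement;
}
// Register 0 stands in for X86::NoRegister and always reads as zero here.
constexpr long long ToyRegs[] = {0, 0x1000, 8};
static_assert(effectiveAddress({/*Base=*/1, /*Index=*/2, /*Scale=*/4, /*Disp=*/32},
                               ToyRegs) == 0x1000 + 4 * 8 + 32,
              "base + scale*index + disp");
} // namespace addrmode_sketch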
4581
4583 StringRef &ErrInfo) const {
4584 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
4585 if (!AMOrNone)
4586 return true;
4587
4588 ExtAddrMode AM = *AMOrNone;
4590 if (AM.ScaledReg != X86::NoRegister) {
4591 switch (AM.Scale) {
4592 case 1:
4593 case 2:
4594 case 4:
4595 case 8:
4596 break;
4597 default:
4598 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
4599 return false;
4600 }
4601 }
4602 if (!isInt<32>(AM.Displacement)) {
4603 ErrInfo = "Displacement in address must fit into 32-bit signed "
4604 "integer";
4605 return false;
4606 }
4607
4608 return true;
4609}
4610
4612 const Register Reg,
4613 int64_t &ImmVal) const {
4614 Register MovReg = Reg;
4615 const MachineInstr *MovMI = &MI;
4616
4617 // Follow use-def for SUBREG_TO_REG to find the real move immediate
4618 // instruction. This is quite common on x86-64.
4619 if (MI.isSubregToReg()) {
4620 // We use the following pattern to set up a 64-bit immediate.
4621 // %8:gr32 = MOV32r0 implicit-def dead $eflags
4622 // %6:gr64 = SUBREG_TO_REG 0, killed %8:gr32, %subreg.sub_32bit
4623 if (!MI.getOperand(1).isImm())
4624 return false;
4625 unsigned FillBits = MI.getOperand(1).getImm();
4626 unsigned SubIdx = MI.getOperand(3).getImm();
4627 MovReg = MI.getOperand(2).getReg();
4628 if (SubIdx != X86::sub_32bit || FillBits != 0)
4629 return false;
4630 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4631 MovMI = MRI.getUniqueVRegDef(MovReg);
4632 if (!MovMI)
4633 return false;
4634 }
4635
4636 if (MovMI->getOpcode() == X86::MOV32r0 &&
4637 MovMI->getOperand(0).getReg() == MovReg) {
4638 ImmVal = 0;
4639 return true;
4640 }
4641
4642 if (MovMI->getOpcode() != X86::MOV32ri &&
4643 MovMI->getOpcode() != X86::MOV64ri &&
4644 MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri)
4645 return false;
4646 // The MOV source operand can be a global address.
4647 if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
4648 return false;
4649 ImmVal = MovMI->getOperand(1).getImm();
4650 return true;
4651}
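
// A minimal standalone sketch of the value the SUBREG_TO_REG pattern above
// materializes: writing a 32-bit immediate to the low subregister zero-extends
// it to 64 bits, so the recovered constant is just zext(imm). Illustrative
// only; the real check also verifies FillBits == 0 and SubIdx == sub_32bit.
namespace subreg2reg_sketch {
constexpr unsigned long long materialized64(unsigned Imm32) {
  return Imm32; // implicit zero extension, as on hardware
}
static_assert(materialized64(0u) == 0u, "MOV32r0 + SUBREG_TO_REG yields 0");
static_assert(materialized64(0x80000000u) == 0x80000000ull,
              "the 32-bit immediate is not sign-extended");
} // namespace subreg2reg_sketch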
4652
4654 const MachineInstr *MI, const Register NullValueReg,
4655 const TargetRegisterInfo *TRI) const {
4656 if (!MI->modifiesRegister(NullValueReg, TRI))
4657 return true;
4658 switch (MI->getOpcode()) {
4659 // A right/left shift of a null value onto itself is still null, i.e.
4660 // rax = shl rax, X.
4661 case X86::SHR64ri:
4662 case X86::SHR32ri:
4663 case X86::SHL64ri:
4664 case X86::SHL32ri:
4665 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
4666 "expected for shift opcode!");
4667 return MI->getOperand(0).getReg() == NullValueReg &&
4668 MI->getOperand(1).getReg() == NullValueReg;
4669 // Zero extend of a sub-reg of NullValueReg into itself does not change the
4670 // null value.
4671 case X86::MOV32rr:
4672 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
4673 return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
4674 });
4675 default:
4676 return false;
4677 }
4678 llvm_unreachable("Should be handled above!");
4679}
4680
4683 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
4684 const TargetRegisterInfo *TRI) const {
4685 const MCInstrDesc &Desc = MemOp.getDesc();
4686 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4687 if (MemRefBegin < 0)
4688 return false;
4689
4690 MemRefBegin += X86II::getOperandBias(Desc);
4691
4692 const MachineOperand *BaseOp =
4693 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
4694 if (!BaseOp->isReg()) // Can be an MO_FrameIndex
4695 return false;
4696
4697 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
4698 return false;
4699
4700 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
4701 X86::NoRegister)
4702 return false;
4703
4704 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
4705
4706 // Displacement can be symbolic
4707 if (!DispMO.isImm())
4708 return false;
4709
4710 Offset = DispMO.getImm();
4711
4712 if (!BaseOp->isReg())
4713 return false;
4714
4715 OffsetIsScalable = false;
4716 // FIXME: Relying on memoperands() may not be the right thing to do here.
4717 // Check with the X86 maintainers and fix it accordingly. For now it is OK,
4718 // since there is no use of `Width` in the X86 back-end at the moment.
4719 Width = !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize()
4721 BaseOps.push_back(BaseOp);
4722 return true;
4723}
4724
4725static unsigned getStoreRegOpcode(Register SrcReg,
4726 const TargetRegisterClass *RC,
4727 bool IsStackAligned,
4728 const X86Subtarget &STI) {
4729 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
4730}
4731
4732static unsigned getLoadRegOpcode(Register DestReg,
4733 const TargetRegisterClass *RC,
4734 bool IsStackAligned, const X86Subtarget &STI) {
4735 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
4736}
4737
4738static bool isAMXOpcode(unsigned Opc) {
4739 switch (Opc) {
4740 default:
4741 return false;
4742 case X86::TILELOADD:
4743 case X86::TILESTORED:
4744 case X86::TILELOADD_EVEX:
4745 case X86::TILESTORED_EVEX:
4746 case X86::PTILEPAIRLOAD:
4747 case X86::PTILEPAIRSTORE:
4748 return true;
4749 }
4750}
4751
4754 unsigned Opc, Register Reg, int FrameIdx,
4755 bool isKill) const {
4756 switch (Opc) {
4757 default:
4758 llvm_unreachable("Unexpected special opcode!");
4759 case X86::TILESTORED:
4760 case X86::TILESTORED_EVEX:
4761 case X86::PTILEPAIRSTORE: {
4762 // tilestored %tmm, (%sp, %idx)
4763 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4764 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4765 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4766 MachineInstr *NewMI =
4767 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4768 .addReg(Reg, getKillRegState(isKill));
4770 MO.setReg(VirtReg);
4771 MO.setIsKill(true);
4772 break;
4773 }
4774 case X86::TILELOADD:
4775 case X86::TILELOADD_EVEX:
4776 case X86::PTILEPAIRLOAD: {
4777 // tileloadd (%sp, %idx), %tmm
4778 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4779 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4780 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4782 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
4784 MO.setReg(VirtReg);
4785 MO.setIsKill(true);
4786 break;
4787 }
4788 }
4789}
4790
4793 bool isKill, int FrameIdx, const TargetRegisterClass *RC,
4794 const TargetRegisterInfo *TRI, Register VReg,
4795 MachineInstr::MIFlag Flags) const {
4796 const MachineFunction &MF = *MBB.getParent();
4797 const MachineFrameInfo &MFI = MF.getFrameInfo();
4798 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4799 "Stack slot too small for store");
4800
4801 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
4802 bool isAligned =
4803 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4804 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4805
4806 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
4807 if (isAMXOpcode(Opc))
4808 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
4809 else
4810 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4811 .addReg(SrcReg, getKillRegState(isKill))
4812 .setMIFlag(Flags);
4813}
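
// A minimal standalone sketch of the alignment decision above: a spill may use
// aligned moves when the incoming stack alignment already covers
// max(spill size, 16), or when the frame can be realigned and the slot is not
// a fixed object. Illustrative only; the real code queries the frame lowering
// and register info.
namespace spillalign_sketch {
constexpr bool useAlignedSpill(unsigned StackAlign, unsigned SpillSize,
                               bool CanRealign, bool IsFixedObject) {
  unsigned Needed = SpillSize > 16 ? SpillSize : 16;
  return StackAlign >= Needed || (CanRealign && !IsFixedObject);
}
static_assert(useAlignedSpill(16, 16, false, false),
              "XMM spill on a 16-byte aligned stack can use MOVAPS");
static_assert(!useAlignedSpill(16, 32, false, false),
              "YMM spill needs realignment or a 32-byte aligned stack");
} // namespace spillalign_sketch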
4814
4817 int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
4818 Register VReg, MachineInstr::MIFlag Flags) const {
4819 const MachineFunction &MF = *MBB.getParent();
4820 const MachineFrameInfo &MFI = MF.getFrameInfo();
4821 assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4822 "Load size exceeds stack slot");
4823 unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
4824 bool isAligned =
4825 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4826 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4827
4828 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
4829 if (isAMXOpcode(Opc))
4830 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
4831 else
4832 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx)
4833 .setMIFlag(Flags);
4834}
4835
4837 Register &SrcReg2, int64_t &CmpMask,
4838 int64_t &CmpValue) const {
4839 switch (MI.getOpcode()) {
4840 default:
4841 break;
4842 case X86::CMP64ri32:
4843 case X86::CMP32ri:
4844 case X86::CMP16ri:
4845 case X86::CMP8ri:
4846 SrcReg = MI.getOperand(0).getReg();
4847 SrcReg2 = 0;
4848 if (MI.getOperand(1).isImm()) {
4849 CmpMask = ~0;
4850 CmpValue = MI.getOperand(1).getImm();
4851 } else {
4852 CmpMask = CmpValue = 0;
4853 }
4854 return true;
4855 // A SUB can be used to perform a comparison.
4856 CASE_ND(SUB64rm)
4857 CASE_ND(SUB32rm)
4858 CASE_ND(SUB16rm)
4859 CASE_ND(SUB8rm)
4860 SrcReg = MI.getOperand(1).getReg();
4861 SrcReg2 = 0;
4862 CmpMask = 0;
4863 CmpValue = 0;
4864 return true;
4865 CASE_ND(SUB64rr)
4866 CASE_ND(SUB32rr)
4867 CASE_ND(SUB16rr)
4868 CASE_ND(SUB8rr)
4869 SrcReg = MI.getOperand(1).getReg();
4870 SrcReg2 = MI.getOperand(2).getReg();
4871 CmpMask = 0;
4872 CmpValue = 0;
4873 return true;
4874 CASE_ND(SUB64ri32)
4875 CASE_ND(SUB32ri)
4876 CASE_ND(SUB16ri)
4877 CASE_ND(SUB8ri)
4878 SrcReg = MI.getOperand(1).getReg();
4879 SrcReg2 = 0;
4880 if (MI.getOperand(2).isImm()) {
4881 CmpMask = ~0;
4882 CmpValue = MI.getOperand(2).getImm();
4883 } else {
4884 CmpMask = CmpValue = 0;
4885 }
4886 return true;
4887 case X86::CMP64rr:
4888 case X86::CMP32rr:
4889 case X86::CMP16rr:
4890 case X86::CMP8rr:
4891 SrcReg = MI.getOperand(0).getReg();
4892 SrcReg2 = MI.getOperand(1).getReg();
4893 CmpMask = 0;
4894 CmpValue = 0;
4895 return true;
4896 case X86::TEST8rr:
4897 case X86::TEST16rr:
4898 case X86::TEST32rr:
4899 case X86::TEST64rr:
4900 SrcReg = MI.getOperand(0).getReg();
4901 if (MI.getOperand(1).getReg() != SrcReg)
4902 return false;
4903 // Compare against zero.
4904 SrcReg2 = 0;
4905 CmpMask = ~0;
4906 CmpValue = 0;
4907 return true;
4908 case X86::TEST64ri32:
4909 case X86::TEST32ri:
4910 case X86::TEST16ri:
4911 case X86::TEST8ri:
4912 SrcReg = MI.getOperand(0).getReg();
4913 SrcReg2 = 0;
4914 // Force identical compare.
4915 CmpMask = 0;
4916 CmpValue = 0;
4917 return true;
4918 }
4919 return false;
4920}
4921
4922bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4923 Register SrcReg, Register SrcReg2,
4924 int64_t ImmMask, int64_t ImmValue,
4925 const MachineInstr &OI, bool *IsSwapped,
4926 int64_t *ImmDelta) const {
4927 switch (OI.getOpcode()) {
4928 case X86::CMP64rr:
4929 case X86::CMP32rr:
4930 case X86::CMP16rr:
4931 case X86::CMP8rr:
4932 CASE_ND(SUB64rr)
4933 CASE_ND(SUB32rr)
4934 CASE_ND(SUB16rr)
4935 CASE_ND(SUB8rr) {
4936 Register OISrcReg;
4937 Register OISrcReg2;
4938 int64_t OIMask;
4939 int64_t OIValue;
4940 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4941 OIMask != ImmMask || OIValue != ImmValue)
4942 return false;
4943 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4944 *IsSwapped = false;
4945 return true;
4946 }
4947 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4948 *IsSwapped = true;
4949 return true;
4950 }
4951 return false;
4952 }
4953 case X86::CMP64ri32:
4954 case X86::CMP32ri:
4955 case X86::CMP16ri:
4956 case X86::CMP8ri:
4957 case X86::TEST64ri32:
4958 case X86::TEST32ri:
4959 case X86::TEST16ri:
4960 case X86::TEST8ri:
4961 CASE_ND(SUB64ri32)
4962 CASE_ND(SUB32ri)
4963 CASE_ND(SUB16ri)
4964 CASE_ND(SUB8ri)
4965 case X86::TEST64rr:
4966 case X86::TEST32rr:
4967 case X86::TEST16rr:
4968 case X86::TEST8rr: {
4969 if (ImmMask != 0) {
4970 Register OISrcReg;
4971 Register OISrcReg2;
4972 int64_t OIMask;
4973 int64_t OIValue;
4974 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4975 SrcReg == OISrcReg && ImmMask == OIMask) {
4976 if (OIValue == ImmValue) {
4977 *ImmDelta = 0;
4978 return true;
4979 } else if (static_cast<uint64_t>(ImmValue) ==
4980 static_cast<uint64_t>(OIValue) - 1) {
4981 *ImmDelta = -1;
4982 return true;
4983 } else if (static_cast<uint64_t>(ImmValue) ==
4984 static_cast<uint64_t>(OIValue) + 1) {
4985 *ImmDelta = 1;
4986 return true;
4987 } else {
4988 return false;
4989 }
4990 }
4991 }
4992 return FlagI.isIdenticalTo(OI);
4993 }
4994 default:
4995 return false;
4996 }
4997}
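
// A minimal standalone sketch of the identity behind the ImmDelta == +/-1
// cases above: comparing against C and against C+1 test the same thing once
// the consumer's condition is relaxed, e.g. "x <s C+1" is "x <=s C" and
// "x <u C+1" is "x <=u C", provided C+1 does not wrap (the caller later
// guards against the min/zero boundary values).
namespace immdelta_sketch {
constexpr bool signedIdentity(int X, int C) {
  return (X < C + 1) == (X <= C); // assumes C + 1 does not overflow
}
static_assert(signedIdentity(5, 5) && signedIdentity(6, 5) && signedIdentity(-7, 5),
              "x <s C+1 is the same predicate as x <=s C");
} // namespace immdelta_sketch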
4998
4999/// Check whether the definition can be converted
5000/// to remove a comparison against zero.
5001inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
5002 bool &ClearsOverflowFlag) {
5003 NoSignFlag = false;
5004 ClearsOverflowFlag = false;
5005
5006 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and
5007 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
5008 // Initial Exec to Local Exec relaxation. In these cases, we must not depend
5009 // on the EFLAGS modification of ADD actually happening in the final binary.
5010 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
5011 unsigned Flags = MI.getOperand(5).getTargetFlags();
5012 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
5013 Flags == X86II::MO_GOTNTPOFF)
5014 return false;
5015 }
5016
5017 switch (MI.getOpcode()) {
5018 default:
5019 return false;
5020
5021 // The shift instructions only modify ZF if their shift count is non-zero.
5022 // N.B.: The processor truncates the shift count depending on the encoding.
5023 CASE_ND(SAR8ri)
5024 CASE_ND(SAR16ri)
5025 CASE_ND(SAR32ri)
5026 CASE_ND(SAR64ri)
5027 CASE_ND(SHR8ri)
5028 CASE_ND(SHR16ri)
5029 CASE_ND(SHR32ri)
5030 CASE_ND(SHR64ri)
5031 return getTruncatedShiftCount(MI, 2) != 0;
5032
5033 // Some left shift instructions can be turned into LEA instructions but only
5034 // if their flags aren't used. Avoid transforming such instructions.
5035 CASE_ND(SHL8ri)
5036 CASE_ND(SHL16ri)
5037 CASE_ND(SHL32ri)
5038 CASE_ND(SHL64ri) {
5039 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
5040 if (isTruncatedShiftCountForLEA(ShAmt))
5041 return false;
5042 return ShAmt != 0;
5043 }
5044
5045 CASE_ND(SHRD16rri8)
5046 CASE_ND(SHRD32rri8)
5047 CASE_ND(SHRD64rri8)
5048 CASE_ND(SHLD16rri8)
5049 CASE_ND(SHLD32rri8)
5050 CASE_ND(SHLD64rri8)
5051 return getTruncatedShiftCount(MI, 3) != 0;
5052
5053 CASE_ND(SUB64ri32)
5054 CASE_ND(SUB32ri)
5055 CASE_ND(SUB16ri)
5056 CASE_ND(SUB8ri)
5057 CASE_ND(SUB64rr)
5058 CASE_ND(SUB32rr)
5059 CASE_ND(SUB16rr)
5060 CASE_ND(SUB8rr)
5061 CASE_ND(SUB64rm)
5062 CASE_ND(SUB32rm)
5063 CASE_ND(SUB16rm)
5064 CASE_ND(SUB8rm)
5065 CASE_ND(DEC64r)
5066 CASE_ND(DEC32r)
5067 CASE_ND(DEC16r)
5068 CASE_ND(DEC8r)
5069 CASE_ND(ADD64ri32)
5070 CASE_ND(ADD32ri)
5071 CASE_ND(ADD16ri)
5072 CASE_ND(ADD8ri)
5073 CASE_ND(ADD64rr)
5074 CASE_ND(ADD32rr)
5075 CASE_ND(ADD16rr)
5076 CASE_ND(ADD8rr)
5077 CASE_ND(ADD64rm)
5078 CASE_ND(ADD32rm)
5079 CASE_ND(ADD16rm)
5080 CASE_ND(ADD8rm)
5081 CASE_ND(INC64r)
5082 CASE_ND(INC32r)
5083 CASE_ND(INC16r)
5084 CASE_ND(INC8r)
5085 CASE_ND(ADC64ri32)
5086 CASE_ND(ADC32ri)
5087 CASE_ND(ADC16ri)
5088 CASE_ND(ADC8ri)
5089 CASE_ND(ADC64rr)
5090 CASE_ND(ADC32rr)
5091 CASE_ND(ADC16rr)
5092 CASE_ND(ADC8rr)
5093 CASE_ND(ADC64rm)
5094 CASE_ND(ADC32rm)
5095 CASE_ND(ADC16rm)
5096 CASE_ND(ADC8rm)
5097 CASE_ND(SBB64ri32)
5098 CASE_ND(SBB32ri)
5099 CASE_ND(SBB16ri)
5100 CASE_ND(SBB8ri)
5101 CASE_ND(SBB64rr)
5102 CASE_ND(SBB32rr)
5103 CASE_ND(SBB16rr)
5104 CASE_ND(SBB8rr)
5105 CASE_ND(SBB64rm)
5106 CASE_ND(SBB32rm)
5107 CASE_ND(SBB16rm)
5108 CASE_ND(SBB8rm)
5109 CASE_ND(NEG8r)
5110 CASE_ND(NEG16r)
5111 CASE_ND(NEG32r)
5112 CASE_ND(NEG64r)
5113 case X86::LZCNT16rr:
5114 case X86::LZCNT16rm:
5115 case X86::LZCNT32rr:
5116 case X86::LZCNT32rm:
5117 case X86::LZCNT64rr:
5118 case X86::LZCNT64rm:
5119 case X86::POPCNT16rr:
5120 case X86::POPCNT16rm:
5121 case X86::POPCNT32rr:
5122 case X86::POPCNT32rm:
5123 case X86::POPCNT64rr:
5124 case X86::POPCNT64rm:
5125 case X86::TZCNT16rr:
5126 case X86::TZCNT16rm:
5127 case X86::TZCNT32rr:
5128 case X86::TZCNT32rm:
5129 case X86::TZCNT64rr:
5130 case X86::TZCNT64rm:
5131 return true;
5132 CASE_ND(AND64ri32)
5133 CASE_ND(AND32ri)
5134 CASE_ND(AND16ri)
5135 CASE_ND(AND8ri)
5136 CASE_ND(AND64rr)
5137 CASE_ND(AND32rr)
5138 CASE_ND(AND16rr)
5139 CASE_ND(AND8rr)
5140 CASE_ND(AND64rm)
5141 CASE_ND(AND32rm)
5142 CASE_ND(AND16rm)
5143 CASE_ND(AND8rm)
5144 CASE_ND(XOR64ri32)
5145 CASE_ND(XOR32ri)
5146 CASE_ND(XOR16ri)
5147 CASE_ND(XOR8ri)
5148 CASE_ND(XOR64rr)
5149 CASE_ND(XOR32rr)
5150 CASE_ND(XOR16rr)
5151 CASE_ND(XOR8rr)
5152 CASE_ND(XOR64rm)
5153 CASE_ND(XOR32rm)
5154 CASE_ND(XOR16rm)
5155 CASE_ND(XOR8rm)
5156 CASE_ND(OR64ri32)
5157 CASE_ND(OR32ri)
5158 CASE_ND(OR16ri)
5159 CASE_ND(OR8ri)
5160 CASE_ND(OR64rr)
5161 CASE_ND(OR32rr)
5162 CASE_ND(OR16rr)
5163 CASE_ND(OR8rr)
5164 CASE_ND(OR64rm)
5165 CASE_ND(OR32rm)
5166 CASE_ND(OR16rm)
5167 CASE_ND(OR8rm)
5168 case X86::ANDN32rr:
5169 case X86::ANDN32rm:
5170 case X86::ANDN64rr:
5171 case X86::ANDN64rm:
5172 case X86::BLSI32rr:
5173 case X86::BLSI32rm:
5174 case X86::BLSI64rr:
5175 case X86::BLSI64rm:
5176 case X86::BLSMSK32rr:
5177 case X86::BLSMSK32rm:
5178 case X86::BLSMSK64rr:
5179 case X86::BLSMSK64rm:
5180 case X86::BLSR32rr:
5181 case X86::BLSR32rm:
5182 case X86::BLSR64rr:
5183 case X86::BLSR64rm:
5184 case X86::BLCFILL32rr:
5185 case X86::BLCFILL32rm:
5186 case X86::BLCFILL64rr:
5187 case X86::BLCFILL64rm:
5188 case X86::BLCI32rr:
5189 case X86::BLCI32rm:
5190 case X86::BLCI64rr:
5191 case X86::BLCI64rm:
5192 case X86::BLCIC32rr:
5193 case X86::BLCIC32rm:
5194 case X86::BLCIC64rr:
5195 case X86::BLCIC64rm:
5196 case X86::BLCMSK32rr:
5197 case X86::BLCMSK32rm:
5198 case X86::BLCMSK64rr:
5199 case X86::BLCMSK64rm:
5200 case X86::BLCS32rr:
5201 case X86::BLCS32rm:
5202 case X86::BLCS64rr:
5203 case X86::BLCS64rm:
5204 case X86::BLSFILL32rr:
5205 case X86::BLSFILL32rm:
5206 case X86::BLSFILL64rr:
5207 case X86::BLSFILL64rm:
5208 case X86::BLSIC32rr:
5209 case X86::BLSIC32rm:
5210 case X86::BLSIC64rr:
5211 case X86::BLSIC64rm:
5212 case X86::BZHI32rr:
5213 case X86::BZHI32rm:
5214 case X86::BZHI64rr:
5215 case X86::BZHI64rm:
5216 case X86::T1MSKC32rr:
5217 case X86::T1MSKC32rm:
5218 case X86::T1MSKC64rr:
5219 case X86::T1MSKC64rm:
5220 case X86::TZMSK32rr:
5221 case X86::TZMSK32rm:
5222 case X86::TZMSK64rr:
5223 case X86::TZMSK64rm:
5224 // These instructions clear the overflow flag just like TEST.
5225 // FIXME: These are not the only instructions in this switch that clear the
5226 // overflow flag.
5227 ClearsOverflowFlag = true;
5228 return true;
5229 case X86::BEXTR32rr:
5230 case X86::BEXTR64rr:
5231 case X86::BEXTR32rm:
5232 case X86::BEXTR64rm:
5233 case X86::BEXTRI32ri:
5234 case X86::BEXTRI32mi:
5235 case X86::BEXTRI64ri:
5236 case X86::BEXTRI64mi:
5237 // BEXTR doesn't update the sign flag so we can't use it. It does clear
5238 // the overflow flag, but that's not useful without the sign flag.
5239 NoSignFlag = true;
5240 return true;
5241 }
5242}
5243
5244/// Check whether the use can be converted to remove a comparison against zero.
5245/// Returns the EFLAGS condition and the index of the operand being compared against zero.
5246static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
5247 switch (MI.getOpcode()) {
5248 default:
5249 return std::make_pair(X86::COND_INVALID, ~0U);
5250 CASE_ND(NEG8r)
5251 CASE_ND(NEG16r)
5252 CASE_ND(NEG32r)
5253 CASE_ND(NEG64r)
5254 return std::make_pair(X86::COND_AE, 1U);
5255 case X86::LZCNT16rr:
5256 case X86::LZCNT32rr:
5257 case X86::LZCNT64rr:
5258 return std::make_pair(X86::COND_B, 1U);
5259 case X86::POPCNT16rr:
5260 case X86::POPCNT32rr:
5261 case X86::POPCNT64rr:
5262 return std::make_pair(X86::COND_E, 1U);
5263 case X86::TZCNT16rr:
5264 case X86::TZCNT32rr:
5265 case X86::TZCNT64rr:
5266 return std::make_pair(X86::COND_B, 1U);
5267 case X86::BSF16rr:
5268 case X86::BSF32rr:
5269 case X86::BSF64rr:
5270 case X86::BSR16rr:
5271 case X86::BSR32rr:
5272 case X86::BSR64rr:
5273 return std::make_pair(X86::COND_E, 2U);
5274 case X86::BLSI32rr:
5275 case X86::BLSI64rr:
5276 return std::make_pair(X86::COND_AE, 1U);
5277 case X86::BLSR32rr:
5278 case X86::BLSR64rr:
5279 case X86::BLSMSK32rr:
5280 case X86::BLSMSK64rr:
5281 return std::make_pair(X86::COND_B, 1U);
5282 // TODO: TBM instructions.
5283 }
5284}
5285
5286/// Check if there exists an earlier instruction that
5287/// operates on the same source operands and sets flags in the same way as
5288/// Compare; remove Compare if possible.
5290 Register SrcReg2, int64_t CmpMask,
5291 int64_t CmpValue,
5292 const MachineRegisterInfo *MRI) const {
5293 // Check whether we can replace SUB with CMP.
5294 switch (CmpInstr.getOpcode()) {
5295 default:
5296 break;
5297 CASE_ND(SUB64ri32)
5298 CASE_ND(SUB32ri)
5299 CASE_ND(SUB16ri)
5300 CASE_ND(SUB8ri)
5301 CASE_ND(SUB64rm)
5302 CASE_ND(SUB32rm)
5303 CASE_ND(SUB16rm)
5304 CASE_ND(SUB8rm)
5305 CASE_ND(SUB64rr)
5306 CASE_ND(SUB32rr)
5307 CASE_ND(SUB16rr)
5308 CASE_ND(SUB8rr) {
5309 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
5310 return false;
5311 // There is no use of the destination register, so we can replace SUB with CMP.
5312 unsigned NewOpcode = 0;
5313#define FROM_TO(A, B) \
5314 CASE_ND(A) NewOpcode = X86::B; \
5315 break;
5316 switch (CmpInstr.getOpcode()) {
5317 default:
5318 llvm_unreachable("Unreachable!");
5319 FROM_TO(SUB64rm, CMP64rm)
5320 FROM_TO(SUB32rm, CMP32rm)
5321 FROM_TO(SUB16rm, CMP16rm)
5322 FROM_TO(SUB8rm, CMP8rm)
5323 FROM_TO(SUB64rr, CMP64rr)
5324 FROM_TO(SUB32rr, CMP32rr)
5325 FROM_TO(SUB16rr, CMP16rr)
5326 FROM_TO(SUB8rr, CMP8rr)
5327 FROM_TO(SUB64ri32, CMP64ri32)
5328 FROM_TO(SUB32ri, CMP32ri)
5329 FROM_TO(SUB16ri, CMP16ri)
5330 FROM_TO(SUB8ri, CMP8ri)
5331 }
5332#undef FROM_TO
5333 CmpInstr.setDesc(get(NewOpcode));
5334 CmpInstr.removeOperand(0);
5335 // Mutating this instruction invalidates any debug data associated with it.
5336 CmpInstr.dropDebugNumber();
5337 // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
5338 if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
5339 NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
5340 return false;
5341 }
5342 }
5343
5344 // The following code tries to remove the comparison by re-using EFLAGS
5345 // from earlier instructions.
5346
5347 bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
5348
5349 // Transformation currently requires SSA values.
5350 if (SrcReg2.isPhysical())
5351 return false;
5352 MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
5353 assert(SrcRegDef && "Must have a definition (SSA)");
5354
5355 MachineInstr *MI = nullptr;
5356 MachineInstr *Sub = nullptr;
5357 MachineInstr *Movr0Inst = nullptr;
5359 bool NoSignFlag = false;
5360 bool ClearsOverflowFlag = false;
5361 bool ShouldUpdateCC = false;
5362 bool IsSwapped = false;
5363 bool HasNF = Subtarget.hasNF();
5364 unsigned OpNo = 0;
5366 int64_t ImmDelta = 0;
5367
5368 // Search backward from CmpInstr for the next instruction defining EFLAGS.
5370 MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
5372 std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
5373 for (MachineBasicBlock *MBB = &CmpMBB;;) {
5374 for (MachineInstr &Inst : make_range(From, MBB->rend())) {
5375 // Try to use EFLAGS from the instruction defining %SrcReg. Example:
5376 // %eax = addl ...
5377 // ... // EFLAGS not changed
5378 // testl %eax, %eax // <-- can be removed
5379 if (&Inst == SrcRegDef) {
5380 if (IsCmpZero &&
5381 isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
5382 MI = &Inst;
5383 break;
5384 }
5385
5386 // Look back for the following pattern, in which case the
5387 // test16rr/test64rr instruction could be erased.
5388 //
5389 // Example for test16rr:
5390 // %reg = and32ri %in_reg, 5
5391 // ... // EFLAGS not changed.
5392 // %src_reg = copy %reg.sub_16bit:gr32
5393 // test16rr %src_reg, %src_reg, implicit-def $eflags
5394 // Example for test64rr:
5395 // %reg = and32ri %in_reg, 5
5396 // ... // EFLAGS not changed.
5397 // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index
5398 // test64rr %src_reg, %src_reg, implicit-def $eflags
5399 MachineInstr *AndInstr = nullptr;
5400 if (IsCmpZero &&
5401 findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
5402 Subtarget, NoSignFlag, ClearsOverflowFlag)) {
5403 assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
5404 MI = AndInstr;
5405 break;
5406 }
5407 // Cannot find other candidates before definition of SrcReg.
5408 return false;
5409 }
5410
5411 if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
5412 // Try to use EFLAGS produced by an instruction reading %SrcReg.
5413 // Example:
5414 // %eax = ...
5415 // ...
5416 // popcntl %eax
5417 // ... // EFLAGS not changed
5418 // testl %eax, %eax // <-- can be removed
5419 if (IsCmpZero) {
5420 std::tie(NewCC, OpNo) = isUseDefConvertible(Inst);
5421 if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() &&
5422 Inst.getOperand(OpNo).getReg() == SrcReg) {
5423 ShouldUpdateCC = true;
5424 MI = &Inst;
5425 break;
5426 }
5427 }
5428
5429 // Try to use EFLAGS from an instruction with similar flag results.
5430 // Example:
5431 // sub x, y or cmp x, y
5432 // ... // EFLAGS not changed
5433 // cmp x, y // <-- can be removed
5434 if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
5435 Inst, &IsSwapped, &ImmDelta)) {
5436 Sub = &Inst;
5437 break;
5438 }
5439
5440 // MOV32r0 is implemented with xor, which clobbers the condition code. It
5441 // is safe to move up if the definition of EFLAGS is dead and earlier
5442 // instructions do not read or write EFLAGS.
5443 if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
5444 Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
5445 Movr0Inst = &Inst;
5446 continue;
5447 }
5448
5449 // For ADDrm/ADDmr instructions with a relocation, we skip the
5450 // optimization of replacing non-NF with NF instructions. This keeps
5451 // backward compatibility with older linkers that lack APX relocation
5452 // type support on Linux.
5453 bool IsWithReloc = X86EnableAPXForRelocation
5454 ? false
5456
5457 // Try to replace non-NF with NF instructions.
5458 if (HasNF && Inst.registerDefIsDead(X86::EFLAGS, TRI) && !IsWithReloc) {
5459 unsigned NewOp = X86::getNFVariant(Inst.getOpcode());
5460 if (!NewOp)
5461 return false;
5462
5463 InstsToUpdate.push_back(std::make_pair(&Inst, NewOp));
5464 continue;
5465 }
5466
5467 // Cannot do anything for any other EFLAG changes.
5468 return false;
5469 }
5470 }
5471
5472 if (MI || Sub)
5473 break;
5474
5475 // Reached the beginning of the basic block. Continue in the predecessor if there is
5476 // exactly one.
5477 if (MBB->pred_size() != 1)
5478 return false;
5479 MBB = *MBB->pred_begin();
5480 From = MBB->rbegin();
5481 }
5482
5483 // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
5484 // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
5485 // If we are done with the basic block, we need to check whether EFLAGS is
5486 // live-out.
5487 bool FlagsMayLiveOut = true;
5489 MachineBasicBlock::iterator AfterCmpInstr =
5490 std::next(MachineBasicBlock::iterator(CmpInstr));
5491 for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
5492 bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
5493 bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
5494 // We should check the usage if this instruction uses and updates EFLAGS.
5495 if (!UseEFLAGS && ModifyEFLAGS) {
5496 // It is safe to remove CmpInstr if EFLAGS is updated again.
5497 FlagsMayLiveOut = false;
5498 break;
5499 }
5500 if (!UseEFLAGS && !ModifyEFLAGS)
5501 continue;
5502
5503 // EFLAGS is used by this instruction.
5504 X86::CondCode OldCC = X86::getCondFromMI(Instr);
5505 if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
5506 return false;
5507
5508 X86::CondCode ReplacementCC = X86::COND_INVALID;
5509 if (MI) {
5510 switch (OldCC) {
5511 default:
5512 break;
5513 case X86::COND_A:
5514 case X86::COND_AE:
5515 case X86::COND_B:
5516 case X86::COND_BE:
5517 // CF is used, we can't perform this optimization.
5518 return false;
5519 case X86::COND_G:
5520 case X86::COND_GE:
5521 case X86::COND_L:
5522 case X86::COND_LE:
5523 // If SF is used, but the instruction doesn't update the SF, then we
5524 // can't do the optimization.
5525 if (NoSignFlag)
5526 return false;
5527 [[fallthrough]];
5528 case X86::COND_O:
5529 case X86::COND_NO:
5530 // If OF is used, the instruction needs to clear it like CmpZero does.
5531 if (!ClearsOverflowFlag)
5532 return false;
5533 break;
5534 case X86::COND_S:
5535 case X86::COND_NS:
5536 // If SF is used, but the instruction doesn't update the SF, then we
5537 // can't do the optimization.
5538 if (NoSignFlag)
5539 return false;
5540 break;
5541 }
5542
5543 // If we're updating the condition code, check whether we have to reverse
5544 // the condition.
5545 if (ShouldUpdateCC)
5546 switch (OldCC) {
5547 default:
5548 return false;
5549 case X86::COND_E:
5550 ReplacementCC = NewCC;
5551 break;
5552 case X86::COND_NE:
5553 ReplacementCC = GetOppositeBranchCondition(NewCC);
5554 break;
5555 }
5556 } else if (IsSwapped) {
5557 // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
5558 // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
5559 // We swap the condition code and synthesize the new opcode.
5560 ReplacementCC = getSwappedCondition(OldCC);
5561 if (ReplacementCC == X86::COND_INVALID)
5562 return false;
5563 ShouldUpdateCC = true;
5564 } else if (ImmDelta != 0) {
5565 unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
5566 // Shift amount for min/max constants to adjust for 8/16/32 instruction
5567 // sizes.
5568 switch (OldCC) {
5569 case X86::COND_L: // x <s (C + 1) --> x <=s C
5570 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5571 return false;
5572 ReplacementCC = X86::COND_LE;
5573 break;
5574 case X86::COND_B: // x <u (C + 1) --> x <=u C
5575 if (ImmDelta != 1 || CmpValue == 0)
5576 return false;
5577 ReplacementCC = X86::COND_BE;
5578 break;
5579 case X86::COND_GE: // x >=s (C + 1) --> x >s C
5580 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5581 return false;
5582 ReplacementCC = X86::COND_G;
5583 break;
5584 case X86::COND_AE: // x >=u (C + 1) --> x >u C
5585 if (ImmDelta != 1 || CmpValue == 0)
5586 return false;
5587 ReplacementCC = X86::COND_A;
5588 break;
5589 case X86::COND_G: // x >s (C - 1) --> x >=s C
5590 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5591 return false;
5592 ReplacementCC = X86::COND_GE;
5593 break;
5594 case X86::COND_A: // x >u (C - 1) --> x >=u C
5595 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5596 return false;
5597 ReplacementCC = X86::COND_AE;
5598 break;
5599 case X86::COND_LE: // x <=s (C - 1) --> x <s C
5600 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5601 return false;
5602 ReplacementCC = X86::COND_L;
5603 break;
5604 case X86::COND_BE: // x <=u (C - 1) --> x <u C
5605 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5606 return false;
5607 ReplacementCC = X86::COND_B;
5608 break;
5609 default:
5610 return false;
5611 }
5612 ShouldUpdateCC = true;
5613 }
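      // For example, if an earlier SUB/CMP compared against C and CmpInstr
      // compares against C + 1 (ImmDelta == 1), a user of COND_L ("x <s C + 1")
      // is retargeted to COND_LE ("x <=s C") so it tests the same predicate
      // against the earlier instruction's EFLAGS.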
5614
5615 if (ShouldUpdateCC && ReplacementCC != OldCC) {
5616 // Push the MachineInstr to OpsToUpdate.
5617 // If it is safe to remove CmpInstr, the condition code of these
5618 // instructions will be modified.
5619 OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
5620 }
5621 if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
5622 // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
5623 FlagsMayLiveOut = false;
5624 break;
5625 }
5626 }
5627
5628 // If we have to update users but EFLAGS may be live-out, abort, since we
5629 // cannot easily find all of the users.
5630 if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
5631 for (MachineBasicBlock *Successor : CmpMBB.successors())
5632 if (Successor->isLiveIn(X86::EFLAGS))
5633 return false;
5634 }
5635
5636 // The instruction to be updated is either Sub or MI.
5637 assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
5638 Sub = MI != nullptr ? MI : Sub;
5639 MachineBasicBlock *SubBB = Sub->getParent();
5640 // Move Movr0Inst to the appropriate place before Sub.
5641 if (Movr0Inst) {
5642 // Only move within the same block so we don't accidentally move to a
5643 // block with higher execution frequency.
5644 if (&CmpMBB != SubBB)
5645 return false;
5646 // Look backwards until we find a def that doesn't use the current EFLAGS.
5647 MachineBasicBlock::reverse_iterator InsertI = Sub->getParent()->rbegin(),
5648 InsertE = Sub->getParent()->rend();
5649 for (; InsertI != InsertE; ++InsertI) {
5650 MachineInstr *Instr = &*InsertI;
5651 if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
5652 Instr->modifiesRegister(X86::EFLAGS, TRI)) {
5653 Movr0Inst->getParent()->remove(Movr0Inst);
5654 Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
5655 Movr0Inst);
5656 break;
5657 }
5658 }
5659 if (InsertI == InsertE)
5660 return false;
5661 }
5662
5663 // Replace non-NF with NF instructions.
5664 for (auto &Inst : InstsToUpdate) {
5665 Inst.first->setDesc(get(Inst.second));
5666 Inst.first->removeOperand(
5667 Inst.first->findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5668 }
5669
5670 // Make sure Sub instruction defines EFLAGS and mark the def live.
5671 MachineOperand *FlagDef =
5672 Sub->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
5673 assert(FlagDef && "Unable to locate a def EFLAGS operand");
5674 FlagDef->setIsDead(false);
5675
5676 CmpInstr.eraseFromParent();
5677
5678 // Modify the condition code of instructions in OpsToUpdate.
5679 for (auto &Op : OpsToUpdate) {
5680 Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
5681 .setImm(Op.second);
5682 }
5683 // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
5684 for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
5685 MBB = *MBB->pred_begin()) {
5686 assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
5687 if (!MBB->isLiveIn(X86::EFLAGS))
5688 MBB->addLiveIn(X86::EFLAGS);
5689 }
5690 return true;
5691}
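// A typical end-to-end result of the optimization above (sketch with
// illustrative operands):
//   %2:gr32 = SUB32rr %0, %1, implicit-def $eflags
//   ...                                     ; EFLAGS not modified in between
//   CMP32rr %0, %1, implicit-def $eflags    ; redundant, erased
//   JCC_1 %bb.1, <cond>                     ; now reads the SUB's EFLAGS def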
5692
5693/// \returns true if the instruction can be changed to COPY when imm is 0.
5694static bool canConvert2Copy(unsigned Opc) {
5695 switch (Opc) {
5696 default:
5697 return false;
5698 CASE_ND(ADD64ri32)
5699 CASE_ND(SUB64ri32)
5700 CASE_ND(OR64ri32)
5701 CASE_ND(XOR64ri32)
5702 CASE_ND(ADD32ri)
5703 CASE_ND(SUB32ri)
5704 CASE_ND(OR32ri)
5705 CASE_ND(XOR32ri)
5706 return true;
5707 }
5708}
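// For example, an ADD32ri of 0 whose EFLAGS definition is dead adds no
// information, so foldImmediateImpl below can rewrite
//   %1:gr32 = ADD32ri %0, 0, implicit-def dead $eflags
// into
//   %1:gr32 = COPY %0
// (register numbers are illustrative).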
5709
5710/// Convert an ALUrr opcode to the corresponding ALUri opcode, e.g.
5711/// ADD32rr ==> ADD32ri
5712static unsigned convertALUrr2ALUri(unsigned Opc) {
5713 switch (Opc) {
5714 default:
5715 return 0;
5716#define FROM_TO(FROM, TO) \
5717 case X86::FROM: \
5718 return X86::TO; \
5719 case X86::FROM##_ND: \
5720 return X86::TO##_ND;
5721 FROM_TO(ADD64rr, ADD64ri32)
5722 FROM_TO(ADC64rr, ADC64ri32)
5723 FROM_TO(SUB64rr, SUB64ri32)
5724 FROM_TO(SBB64rr, SBB64ri32)
5725 FROM_TO(AND64rr, AND64ri32)
5726 FROM_TO(OR64rr, OR64ri32)
5727 FROM_TO(XOR64rr, XOR64ri32)
5728 FROM_TO(SHR64rCL, SHR64ri)
5729 FROM_TO(SHL64rCL, SHL64ri)
5730 FROM_TO(SAR64rCL, SAR64ri)
5731 FROM_TO(ROL64rCL, ROL64ri)
5732 FROM_TO(ROR64rCL, ROR64ri)
5733 FROM_TO(RCL64rCL, RCL64ri)
5734 FROM_TO(RCR64rCL, RCR64ri)
5735 FROM_TO(ADD32rr, ADD32ri)
5736 FROM_TO(ADC32rr, ADC32ri)
5737 FROM_TO(SUB32rr, SUB32ri)
5738 FROM_TO(SBB32rr, SBB32ri)
5739 FROM_TO(AND32rr, AND32ri)
5740 FROM_TO(OR32rr, OR32ri)
5741 FROM_TO(XOR32rr, XOR32ri)
5742 FROM_TO(SHR32rCL, SHR32ri)
5743 FROM_TO(SHL32rCL, SHL32ri)
5744 FROM_TO(SAR32rCL, SAR32ri)
5745 FROM_TO(ROL32rCL, ROL32ri)
5746 FROM_TO(ROR32rCL, ROR32ri)
5747 FROM_TO(RCL32rCL, RCL32ri)
5748 FROM_TO(RCR32rCL, RCR32ri)
5749#undef FROM_TO
5750#define FROM_TO(FROM, TO) \
5751 case X86::FROM: \
5752 return X86::TO;
5753 FROM_TO(TEST64rr, TEST64ri32)
5754 FROM_TO(CTEST64rr, CTEST64ri32)
5755 FROM_TO(CMP64rr, CMP64ri32)
5756 FROM_TO(CCMP64rr, CCMP64ri32)
5757 FROM_TO(TEST32rr, TEST32ri)
5758 FROM_TO(CTEST32rr, CTEST32ri)
5759 FROM_TO(CMP32rr, CMP32ri)
5760 FROM_TO(CCMP32rr, CCMP32ri)
5761#undef FROM_TO
5762 }
5763}
5764
5765/// Reg is assigned ImmVal in DefMI, and is used in UseMI.
5766/// If MakeChange is true, this function tries to replace Reg by ImmVal in
5767/// UseMI. If MakeChange is false, just check if folding is possible.
5768///
5769/// \returns true if folding is successful or possible.
5770bool X86InstrInfo::foldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
5771 Register Reg, int64_t ImmVal,
5772 MachineRegisterInfo *MRI,
5773 bool MakeChange) const {
5774 bool Modified = false;
5775
5776 // 64 bit operations accept sign extended 32 bit immediates.
5777 // 32 bit operations accept all 32 bit immediates, so we don't need to check
5778 // them.
5779 const TargetRegisterClass *RC = nullptr;
5780 if (Reg.isVirtual())
5781 RC = MRI->getRegClass(Reg);
5782 if ((Reg.isPhysical() && X86::GR64RegClass.contains(Reg)) ||
5783 (Reg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC))) {
5784 if (!isInt<32>(ImmVal))
5785 return false;
5786 }
5787
5788 if (UseMI.findRegisterUseOperand(Reg, /*TRI=*/nullptr)->getSubReg())
5789 return false;
5790 // An immediate has a larger code size than a register, so avoid folding
5791 // the immediate if it has more than one use and we are optimizing for size.
5792 if (UseMI.getMF()->getFunction().hasOptSize() && Reg.isVirtual() &&
5793 !MRI->hasOneNonDBGUse(Reg))
5794 return false;
5795
5796 unsigned Opc = UseMI.getOpcode();
5797 unsigned NewOpc;
5798 if (Opc == TargetOpcode::COPY) {
5799 Register ToReg = UseMI.getOperand(0).getReg();
5800 const TargetRegisterClass *RC = nullptr;
5801 if (ToReg.isVirtual())
5802 RC = MRI->getRegClass(ToReg);
5803 bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) ||
5804 (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg));
5805 bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) ||
5806 (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg));
5807 bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) ||
5808 (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg));
5809
5810 if (ImmVal == 0) {
5811 // We have MOV32r0 only.
5812 if (!GR32Reg)
5813 return false;
5814 }
5815
5816 if (GR64Reg) {
5817 if (isUInt<32>(ImmVal))
5818 NewOpc = X86::MOV32ri64;
5819 else
5820 NewOpc = X86::MOV64ri;
5821 } else if (GR32Reg) {
5822 NewOpc = X86::MOV32ri;
5823 if (ImmVal == 0) {
5824 // MOV32r0 clobbers EFLAGS.
5825 const TargetRegisterInfo *TRI = &getRegisterInfo();
5826 if (UseMI.getParent()->computeRegisterLiveness(
5827 TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead)
5828 return false;
5829
5830 // MOV32r0 differs from the other cases because it doesn't encode the
5831 // immediate in the instruction, so we modify it directly here.
5832 if (!MakeChange)
5833 return true;
5834 UseMI.setDesc(get(X86::MOV32r0));
5835 UseMI.removeOperand(
5836 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5837 UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
5838 /*isImp=*/true,
5839 /*isKill=*/false,
5840 /*isDead=*/true));
5841 Modified = true;
5842 }
5843 } else if (GR8Reg)
5844 NewOpc = X86::MOV8ri;
5845 else
5846 return false;
5847 } else
5848 NewOpc = convertALUrr2ALUri(Opc);
5849
5850 if (!NewOpc)
5851 return false;
5852
5853 // For SUB instructions the immediate can only be the second source operand.
5854 if ((NewOpc == X86::SUB64ri32 || NewOpc == X86::SUB32ri ||
5855 NewOpc == X86::SBB64ri32 || NewOpc == X86::SBB32ri ||
5856 NewOpc == X86::SUB64ri32_ND || NewOpc == X86::SUB32ri_ND ||
5857 NewOpc == X86::SBB64ri32_ND || NewOpc == X86::SBB32ri_ND) &&
5858 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 2)
5859 return false;
5860 // For CMP instructions the immediate can only be at index 1.
5861 if (((NewOpc == X86::CMP64ri32 || NewOpc == X86::CMP32ri) ||
5862 (NewOpc == X86::CCMP64ri32 || NewOpc == X86::CCMP32ri)) &&
5863 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 1)
5864 return false;
5865
5866 using namespace X86;
5867 if (isSHL(Opc) || isSHR(Opc) || isSAR(Opc) || isROL(Opc) || isROR(Opc) ||
5868 isRCL(Opc) || isRCR(Opc)) {
5869 unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr);
5870 if (RegIdx < 2)
5871 return false;
5872 if (!isInt<8>(ImmVal))
5873 return false;
5874 assert(Reg == X86::CL);
5875
5876 if (!MakeChange)
5877 return true;
5878 UseMI.setDesc(get(NewOpc));
5879 UseMI.removeOperand(RegIdx);
5880 UseMI.addOperand(MachineOperand::CreateImm(ImmVal));
5881 // Reg is the physical register $cl, so we can't tell through MRI whether
5882 // DefMI is dead. Let the caller handle it, or let the dead-mi-elimination
5883 // pass delete the dead physical-register-defining instruction.
5884 return true;
5885 }
5886
5887 if (!MakeChange)
5888 return true;
5889
5890 if (!Modified) {
5891 // Modify the instruction.
5892 if (ImmVal == 0 && canConvert2Copy(NewOpc) &&
5893 UseMI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) {
5894 // %100 = add %101, 0
5895 // ==>
5896 // %100 = COPY %101
5897 UseMI.setDesc(get(TargetOpcode::COPY));
5898 UseMI.removeOperand(
5899 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5900 UseMI.removeOperand(
5901 UseMI.findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5902 UseMI.untieRegOperand(0);
5905 } else {
5906 unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
5907 unsigned ImmOpNum = 2;
5908 if (!UseMI.getOperand(0).isDef()) {
5909 Op1 = 0; // TEST, CMP, CTEST, CCMP
5910 ImmOpNum = 1;
5911 }
5912 if (Opc == TargetOpcode::COPY)
5913 ImmOpNum = 1;
5914 if (findCommutedOpIndices(UseMI, Op1, Op2) &&
5915 UseMI.getOperand(Op1).getReg() == Reg)
5916 commuteInstruction(UseMI);
5917
5918 assert(UseMI.getOperand(ImmOpNum).getReg() == Reg);
5919 UseMI.setDesc(get(NewOpc));
5920 UseMI.getOperand(ImmOpNum).ChangeToImmediate(ImmVal);
5921 }
5922 }
5923
5924 if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg))
5925 DefMI->eraseFromParent();
5926
5927 return true;
5928}
5929
5930/// foldImmediate - 'Reg' is known to be defined by a move immediate
5931/// instruction, try to fold the immediate into the use instruction.
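/// For example, with
///   %0:gr32 = MOV32ri 42
///   %1:gr32 = ADD32rr %2, %0, implicit-def $eflags
/// the use can be rewritten as
///   %1:gr32 = ADD32ri %2, 42, implicit-def $eflags
/// (virtual register numbers here are purely illustrative).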
5932bool X86InstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
5933 Register Reg, MachineRegisterInfo *MRI) const {
5934 int64_t ImmVal;
5935 if (!getConstValDefinedInReg(DefMI, Reg, ImmVal))
5936 return false;
5937
5938 return foldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true);
5939}
5940
5941/// Expand a single-def pseudo instruction to a two-addr
5942/// instruction with two undef reads of the register being defined.
5943/// This is used for mapping:
5944/// %xmm4 = V_SET0
5945/// to:
5946/// %xmm4 = PXORrr undef %xmm4, undef %xmm4
5947///
5948static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
5949 const MCInstrDesc &Desc) {
5950 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5951 Register Reg = MIB.getReg(0);
5952 MIB->setDesc(Desc);
5953
5954 // MachineInstr::addOperand() will insert explicit operands before any
5955 // implicit operands.
5956 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
5957 // But we don't trust that.
5958 assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand");
5959 return true;
5960}
5961
5962/// Expand a single-def pseudo instruction to a two-addr
5963/// instruction with two %k0 reads.
5964/// This is used for mapping:
5965/// %k4 = K_SET1
5966/// to:
5967/// %k4 = KXNORrr %k0, %k0
5968static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
5969 Register Reg) {
5970 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5971 MIB->setDesc(Desc);
5972 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
5973 return true;
5974}
5975
5976static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
5977 bool MinusOne) {
5978 MachineBasicBlock &MBB = *MIB->getParent();
5979 const DebugLoc &DL = MIB->getDebugLoc();
5980 Register Reg = MIB.getReg(0);
5981
5982 // Insert the XOR.
5983 BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
5986
5987 // Turn the pseudo into an INC or DEC.
5988 MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
5989 MIB.addReg(Reg);
5990
5991 return true;
5992}
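// For example, "$eax = MOV32r1" expands to "xorl %eax, %eax" followed by
// "incl %eax", and the MinusOne form uses "decl %eax" to materialize -1.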
5993
5994static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
5995 const TargetInstrInfo &TII,
5996 const X86Subtarget &Subtarget) {
5997 MachineBasicBlock &MBB = *MIB->getParent();
5998 const DebugLoc &DL = MIB->getDebugLoc();
5999 int64_t Imm = MIB->getOperand(1).getImm();
6000 assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
6001 MachineBasicBlock::iterator I = MIB.getInstr();
6002
6003 int StackAdjustment;
6004
6005 if (Subtarget.is64Bit()) {
6006 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
6007 MIB->getOpcode() == X86::MOV32ImmSExti8);
6008
6009 // Can't use push/pop lowering if the function might write to the red zone.
6010 X86MachineFunctionInfo *X86FI =
6011 MBB.getParent()->getInfo<X86MachineFunctionInfo>();
6012 if (X86FI->getUsesRedZone()) {
6013 MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8
6014 ? X86::MOV32ri
6015 : X86::MOV64ri));
6016 return true;
6017 }
6018
6019 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
6020 // widen the register if necessary.
6021 StackAdjustment = 8;
6022 BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm);
6023 MIB->setDesc(TII.get(X86::POP64r));
6024 MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
6025 } else {
6026 assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
6027 StackAdjustment = 4;
6028 BuildMI(MBB, I, DL, TII.get(X86::PUSH32i)).addImm(Imm);
6029 MIB->setDesc(TII.get(X86::POP32r));
6030 }
6031 MIB->removeOperand(1);
6032 MIB->addImplicitDefUseOperands(*MBB.getParent());
6033
6034 // Build CFI if necessary.
6035 MachineFunction &MF = *MBB.getParent();
6036 const X86FrameLowering *TFL = Subtarget.getFrameLowering();
6037 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
6038 bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
6039 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
6040 if (EmitCFI) {
6041 TFL->BuildCFI(
6042 MBB, I, DL,
6043 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
6044 TFL->BuildCFI(
6045 MBB, std::next(I), DL,
6046 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
6047 }
6048
6049 return true;
6050}
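// For example, on 64-bit targets "$rax = MOV64ImmSExti8 -1" becomes
// "pushq $-1; popq %rax" (plus CFI adjustments when DWARF frame moves are
// required), unless the function uses the red zone, in which case the pseudo
// falls back to a plain MOV64ri.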
6051
6052 // LoadStackGuard has so far only been implemented for 64-bit MachO. A
6053 // different code sequence is needed for other targets.
6054static void expandLoadStackGuard(MachineInstrBuilder &MIB,
6055 const TargetInstrInfo &TII) {
6056 MachineBasicBlock &MBB = *MIB->getParent();
6057 const DebugLoc &DL = MIB->getDebugLoc();
6058 Register Reg = MIB.getReg(0);
6059 const GlobalValue *GV =
6060 cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
6061 auto Flags = MachineMemOperand::MOLoad |
6062 MachineMemOperand::MODereferenceable |
6063 MachineMemOperand::MOInvariant;
6064 MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
6065 MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
6066 MachineBasicBlock::iterator I = MIB.getInstr();
6067
6068 BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg)
6069 .addReg(X86::RIP)
6070 .addImm(1)
6071 .addReg(0)
6072 .addGlobalAddress(GV, 0, X86II::MO_GOTPCREL)
6073 .addReg(0)
6074 .addMemOperand(MMO);
6075 MIB->setDebugLoc(DL);
6076 MIB->setDesc(TII.get(X86::MOV64rm));
6077 MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
6078}
6079
6080static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
6081 MachineBasicBlock &MBB = *MIB->getParent();
6082 MachineFunction &MF = *MBB.getParent();
6083 const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
6084 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
6085 unsigned XorOp =
6086 MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
6087 MIB->setDesc(TII.get(XorOp));
6088 MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
6089 return true;
6090}
6091
6092// This is used to handle spills for 128/256-bit registers when we have AVX512,
6093// but not VLX. If it uses an extended register we need to use an instruction
6094 // that loads the lower 128/256 bits but is available with only AVX512F.
6095static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
6096 const TargetRegisterInfo *TRI,
6097 const MCInstrDesc &LoadDesc,
6098 const MCInstrDesc &BroadcastDesc, unsigned SubIdx) {
6099 Register DestReg = MIB.getReg(0);
6100 // Check if DestReg is XMM16-31 or YMM16-31.
6101 if (TRI->getEncodingValue(DestReg) < 16) {
6102 // We can use a normal VEX encoded load.
6103 MIB->setDesc(LoadDesc);
6104 } else {
6105 // Use a 128/256-bit VBROADCAST instruction.
6106 MIB->setDesc(BroadcastDesc);
6107 // Change the destination to a 512-bit register.
6108 DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
6109 MIB->getOperand(0).setReg(DestReg);
6110 }
6111 return true;
6112}
6113
6114// This is used to handle spills for 128/256-bit registers when we have AVX512,
6115// but not VLX. If it uses an extended register we need to use an instruction
6116 // that stores the lower 128/256 bits but is available with only AVX512F.
6117static bool expandNOVLXStore(MachineInstrBuilder &MIB,
6118 const TargetRegisterInfo *TRI,
6119 const MCInstrDesc &StoreDesc,
6120 const MCInstrDesc &ExtractDesc, unsigned SubIdx) {
6121 Register SrcReg = MIB.getReg(X86::AddrNumOperands);
6122 // Check if SrcReg is XMM16-31 or YMM16-31.
6123 if (TRI->getEncodingValue(SrcReg) < 16) {
6124 // We can use a normal VEX encoded store.
6125 MIB->setDesc(StoreDesc);
6126 } else {
6127 // Use a VEXTRACTF instruction.
6128 MIB->setDesc(ExtractDesc);
6129 // Change the source to a 512-bit register.
6130 SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
6131 MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
6132 MIB.addImm(0x0); // Append immediate to extract from the lower bits.
6133 }
6134
6135 return true;
6136}
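// For example, without VLX a store of %xmm16 through VMOVUPSZ128mr_NOVLX is
// rewritten as a VEXTRACTF32X4Zmri of the lower 128 bits of %zmm16, since
// only the EVEX-encoded 512-bit instruction can reach the extended register.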
6137
6138static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
6139 MIB->setDesc(Desc);
6140 int64_t ShiftAmt = MIB->getOperand(2).getImm();
6141 // Temporarily remove the immediate so we can add another source register.
6142 MIB->removeOperand(2);
6143 // Add the register. Don't copy the kill flag if there is one.
6144 MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef()));
6145 // Add back the immediate.
6146 MIB.addImm(ShiftAmt);
6147 return true;
6148}
6149
6150static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI,
6151 const TargetInstrInfo &TII, bool HasAVX) {
6152 unsigned NewOpc;
6153 if (MI.getOpcode() == X86::MOVSHPrm) {
6154 NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
6155 Register Reg = MI.getOperand(0).getReg();
6156 if (Reg > X86::XMM15)
6157 NewOpc = X86::VMOVSSZrm;
6158 } else {
6159 NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
6160 Register Reg = MI.getOperand(5).getReg();
6161 if (Reg > X86::XMM15)
6162 NewOpc = X86::VMOVSSZmr;
6163 }
6164
6165 MIB->setDesc(TII.get(NewOpc));
6166 return true;
6167}
6168
6169bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
6170 bool HasAVX = Subtarget.hasAVX();
6171 MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
6172 switch (MI.getOpcode()) {
6173 case X86::MOV32r0:
6174 return Expand2AddrUndef(MIB, get(X86::XOR32rr));
6175 case X86::MOV32r1:
6176 return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
6177 case X86::MOV32r_1:
6178 return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
6179 case X86::MOV32ImmSExti8:
6180 case X86::MOV64ImmSExti8:
6181 return ExpandMOVImmSExti8(MIB, *this, Subtarget);
6182 case X86::SETB_C32r:
6183 return Expand2AddrUndef(MIB, get(X86::SBB32rr));
6184 case X86::SETB_C64r:
6185 return Expand2AddrUndef(MIB, get(X86::SBB64rr));
6186 case X86::MMX_SET0:
6187 return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
6188 case X86::V_SET0:
6189 case X86::FsFLD0SS:
6190 case X86::FsFLD0SD:
6191 case X86::FsFLD0SH:
6192 case X86::FsFLD0F128:
6193 return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
6194 case X86::AVX_SET0: {
6195 assert(HasAVX && "AVX not supported");
6196 const TargetRegisterInfo *TRI = &getRegisterInfo();
6197 Register SrcReg = MIB.getReg(0);
6198 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6199 MIB->getOperand(0).setReg(XReg);
6200 Expand2AddrUndef(MIB, get(X86::VXORPSrr));
6201 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6202 return true;
6203 }
6204 case X86::AVX512_128_SET0:
6205 case X86::AVX512_FsFLD0SH:
6206 case X86::AVX512_FsFLD0SS:
6207 case X86::AVX512_FsFLD0SD:
6208 case X86::AVX512_FsFLD0F128: {
6209 bool HasVLX = Subtarget.hasVLX();
6210 Register SrcReg = MIB.getReg(0);
6211 const TargetRegisterInfo *TRI = &getRegisterInfo();
6212 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
6213 return Expand2AddrUndef(MIB,
6214 get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6215 // Extended register without VLX. Use a larger XOR.
6216 SrcReg =
6217 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
6218 MIB->getOperand(0).setReg(SrcReg);
6219 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6220 }
6221 case X86::AVX512_256_SET0:
6222 case X86::AVX512_512_SET0: {
6223 bool HasVLX = Subtarget.hasVLX();
6224 Register SrcReg = MIB.getReg(0);
6225 const TargetRegisterInfo *TRI = &getRegisterInfo();
6226 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
6227 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6228 MIB->getOperand(0).setReg(XReg);
6229 Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6230 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6231 return true;
6232 }
6233 if (MI.getOpcode() == X86::AVX512_256_SET0) {
6234 // No VLX so we must reference a zmm.
6235 MCRegister ZReg =
6236 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
6237 MIB->getOperand(0).setReg(ZReg);
6238 }
6239 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6240 }
6241 case X86::MOVSHPmr:
6242 case X86::MOVSHPrm:
6243 return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
6244 case X86::V_SETALLONES:
6245 return Expand2AddrUndef(MIB,
6246 get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
6247 case X86::AVX2_SETALLONES:
6248 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6249 case X86::AVX1_SETALLONES: {
6250 Register Reg = MIB.getReg(0);
6251 // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
6252 MIB->setDesc(get(X86::VCMPPSYrri));
6253 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
6254 return true;
6255 }
6256 case X86::AVX512_512_SETALLONES: {
6257 Register Reg = MIB.getReg(0);
6258 MIB->setDesc(get(X86::VPTERNLOGDZrri));
6259 // VPTERNLOGD needs 3 register inputs and an immediate.
6260 // 0xff will return 1s for any input.
6261 MIB.addReg(Reg, RegState::Undef)
6262 .addReg(Reg, RegState::Undef)
6263 .addReg(Reg, RegState::Undef)
6264 .addImm(0xff);
6265 return true;
6266 }
6267 case X86::AVX512_512_SEXT_MASK_32:
6268 case X86::AVX512_512_SEXT_MASK_64: {
6269 Register Reg = MIB.getReg(0);
6270 Register MaskReg = MIB.getReg(1);
6271 unsigned MaskState = getRegState(MIB->getOperand(1));
6272 unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64)
6273 ? X86::VPTERNLOGQZrrikz
6274 : X86::VPTERNLOGDZrrikz;
6275 MI.removeOperand(1);
6276 MIB->setDesc(get(Opc));
6277 // VPTERNLOG needs 3 register inputs and an immediate.
6278 // 0xff will return 1s for any input.
6279 MIB.addReg(Reg, RegState::Undef)
6280 .addReg(MaskReg, MaskState)
6281 .addReg(Reg, RegState::Undef)
6282 .addReg(Reg, RegState::Undef)
6283 .addImm(0xff);
6284 return true;
6285 }
6286 case X86::VMOVAPSZ128rm_NOVLX:
6287 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
6288 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6289 case X86::VMOVUPSZ128rm_NOVLX:
6290 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
6291 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6292 case X86::VMOVAPSZ256rm_NOVLX:
6293 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
6294 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6295 case X86::VMOVUPSZ256rm_NOVLX:
6296 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
6297 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6298 case X86::VMOVAPSZ128mr_NOVLX:
6299 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
6300 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6301 case X86::VMOVUPSZ128mr_NOVLX:
6302 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
6303 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6304 case X86::VMOVAPSZ256mr_NOVLX:
6305 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
6306 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6307 case X86::VMOVUPSZ256mr_NOVLX:
6308 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
6309 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6310 case X86::MOV32ri64: {
6311 Register Reg = MIB.getReg(0);
6312 Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
6313 MI.setDesc(get(X86::MOV32ri));
6314 MIB->getOperand(0).setReg(Reg32);
6315 MIB.addReg(Reg, RegState::ImplicitDefine);
6316 return true;
6317 }
6318
6319 case X86::RDFLAGS32:
6320 case X86::RDFLAGS64: {
6321 unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64;
6322 MachineBasicBlock &MBB = *MIB->getParent();
6323
6324 MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(),
6325 get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
6326 .getInstr();
6327
6328 // Permit reads of the EFLAGS and DF registers without them being defined.
6329 // This intrinsic exists to read external processor state in flags, such as
6330 // the trap flag, interrupt flag, and direction flag, none of which are
6331 // modeled by the backend.
6332 assert(NewMI->getOperand(2).getReg() == X86::EFLAGS &&
6333 "Unexpected register in operand! Should be EFLAGS.");
6334 NewMI->getOperand(2).setIsUndef();
6335 assert(NewMI->getOperand(3).getReg() == X86::DF &&
6336 "Unexpected register in operand! Should be DF.");
6337 NewMI->getOperand(3).setIsUndef();
6338
6339 MIB->setDesc(get(Is64Bit ? X86::POP64r : X86::POP32r));
6340 return true;
6341 }
6342
6343 case X86::WRFLAGS32:
6344 case X86::WRFLAGS64: {
6345 unsigned Is64Bit = MI.getOpcode() == X86::WRFLAGS64;
6346 MachineBasicBlock &MBB = *MIB->getParent();
6347
6348 BuildMI(MBB, MI, MIB->getDebugLoc(),
6349 get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
6350 .addReg(MI.getOperand(0).getReg());
6351 BuildMI(MBB, MI, MIB->getDebugLoc(),
6352 get(Is64Bit ? X86::POPF64 : X86::POPF32));
6353 MI.eraseFromParent();
6354 return true;
6355 }
6356
6357 // KNL does not recognize dependency-breaking idioms for mask registers,
6358 // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
6359 // Using %k0 as the undef input register is a performance heuristic based
6360 // on the assumption that %k0 is used less frequently than the other mask
6361 // registers, since it is not usable as a write mask.
6362 // FIXME: A more advanced approach would be to choose the best input mask
6363 // register based on context.
6364 case X86::KSET0W:
6365 return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
6366 case X86::KSET0D:
6367 return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
6368 case X86::KSET0Q:
6369 return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
6370 case X86::KSET1W:
6371 return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
6372 case X86::KSET1D:
6373 return Expand2AddrKreg(MIB, get(X86::KXNORDkk), X86::K0);
6374 case X86::KSET1Q:
6375 return Expand2AddrKreg(MIB, get(X86::KXNORQkk), X86::K0);
6376 case TargetOpcode::LOAD_STACK_GUARD:
6377 expandLoadStackGuard(MIB, *this);
6378 return true;
6379 case X86::XOR64_FP:
6380 case X86::XOR32_FP:
6381 return expandXorFP(MIB, *this);
6382 case X86::SHLDROT32ri:
6383 return expandSHXDROT(MIB, get(X86::SHLD32rri8));
6384 case X86::SHLDROT64ri:
6385 return expandSHXDROT(MIB, get(X86::SHLD64rri8));
6386 case X86::SHRDROT32ri:
6387 return expandSHXDROT(MIB, get(X86::SHRD32rri8));
6388 case X86::SHRDROT64ri:
6389 return expandSHXDROT(MIB, get(X86::SHRD64rri8));
6390 case X86::ADD8rr_DB:
6391 MIB->setDesc(get(X86::OR8rr));
6392 break;
6393 case X86::ADD16rr_DB:
6394 MIB->setDesc(get(X86::OR16rr));
6395 break;
6396 case X86::ADD32rr_DB:
6397 MIB->setDesc(get(X86::OR32rr));
6398 break;
6399 case X86::ADD64rr_DB:
6400 MIB->setDesc(get(X86::OR64rr));
6401 break;
6402 case X86::ADD8ri_DB:
6403 MIB->setDesc(get(X86::OR8ri));
6404 break;
6405 case X86::ADD16ri_DB:
6406 MIB->setDesc(get(X86::OR16ri));
6407 break;
6408 case X86::ADD32ri_DB:
6409 MIB->setDesc(get(X86::OR32ri));
6410 break;
6411 case X86::ADD64ri32_DB:
6412 MIB->setDesc(get(X86::OR64ri32));
6413 break;
6414 }
6415 return false;
6416}
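// As an example of the expansions above, "$ymm0 = AVX_SET0" becomes
// "$xmm0 = VXORPSrr undef $xmm0, undef $xmm0, implicit-def $ymm0", clearing
// the full YMM register with a dependency-breaking XMM xor idiom.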
6417
6418/// Return true for all instructions that only update
6419 /// the first 32 or 64 bits of the destination register and leave the rest
6420/// unmodified. This can be used to avoid folding loads if the instructions
6421/// only update part of the destination register, and the non-updated part is
6422/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
6423 /// instructions breaks the partial register dependency and can improve
6424/// performance. e.g.:
6425///
6426/// movss (%rdi), %xmm0
6427/// cvtss2sd %xmm0, %xmm0
6428///
6429/// Instead of
6430/// cvtss2sd (%rdi), %xmm0
6431///
6432/// FIXME: This should be turned into a TSFlags.
6433///
6434static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
6435 bool ForLoadFold = false) {
6436 switch (Opcode) {
6437 case X86::CVTSI2SSrr:
6438 case X86::CVTSI2SSrm:
6439 case X86::CVTSI642SSrr:
6440 case X86::CVTSI642SSrm:
6441 case X86::CVTSI2SDrr:
6442 case X86::CVTSI2SDrm:
6443 case X86::CVTSI642SDrr:
6444 case X86::CVTSI642SDrm:
6445 // Load folding won't affect the undef register update since the input is
6446 // a GPR.
6447 return !ForLoadFold;
6448 case X86::CVTSD2SSrr:
6449 case X86::CVTSD2SSrm:
6450 case X86::CVTSS2SDrr:
6451 case X86::CVTSS2SDrm:
6452 case X86::MOVHPDrm:
6453 case X86::MOVHPSrm:
6454 case X86::MOVLPDrm:
6455 case X86::MOVLPSrm:
6456 case X86::RCPSSr:
6457 case X86::RCPSSm:
6458 case X86::RCPSSr_Int:
6459 case X86::RCPSSm_Int:
6460 case X86::ROUNDSDri:
6461 case X86::ROUNDSDmi:
6462 case X86::ROUNDSSri:
6463 case X86::ROUNDSSmi:
6464 case X86::RSQRTSSr:
6465 case X86::RSQRTSSm:
6466 case X86::RSQRTSSr_Int:
6467 case X86::RSQRTSSm_Int:
6468 case X86::SQRTSSr:
6469 case X86::SQRTSSm:
6470 case X86::SQRTSSr_Int:
6471 case X86::SQRTSSm_Int:
6472 case X86::SQRTSDr:
6473 case X86::SQRTSDm:
6474 case X86::SQRTSDr_Int:
6475 case X86::SQRTSDm_Int:
6476 return true;
6477 case X86::VFCMULCPHZ128rm:
6478 case X86::VFCMULCPHZ128rmb:
6479 case X86::VFCMULCPHZ128rmbkz:
6480 case X86::VFCMULCPHZ128rmkz:
6481 case X86::VFCMULCPHZ128rr:
6482 case X86::VFCMULCPHZ128rrkz:
6483 case X86::VFCMULCPHZ256rm:
6484 case X86::VFCMULCPHZ256rmb:
6485 case X86::VFCMULCPHZ256rmbkz:
6486 case X86::VFCMULCPHZ256rmkz:
6487 case X86::VFCMULCPHZ256rr:
6488 case X86::VFCMULCPHZ256rrkz:
6489 case X86::VFCMULCPHZrm:
6490 case X86::VFCMULCPHZrmb:
6491 case X86::VFCMULCPHZrmbkz:
6492 case X86::VFCMULCPHZrmkz:
6493 case X86::VFCMULCPHZrr:
6494 case X86::VFCMULCPHZrrb:
6495 case X86::VFCMULCPHZrrbkz:
6496 case X86::VFCMULCPHZrrkz:
6497 case X86::VFMULCPHZ128rm:
6498 case X86::VFMULCPHZ128rmb:
6499 case X86::VFMULCPHZ128rmbkz:
6500 case X86::VFMULCPHZ128rmkz:
6501 case X86::VFMULCPHZ128rr:
6502 case X86::VFMULCPHZ128rrkz:
6503 case X86::VFMULCPHZ256rm:
6504 case X86::VFMULCPHZ256rmb:
6505 case X86::VFMULCPHZ256rmbkz:
6506 case X86::VFMULCPHZ256rmkz:
6507 case X86::VFMULCPHZ256rr:
6508 case X86::VFMULCPHZ256rrkz:
6509 case X86::VFMULCPHZrm:
6510 case X86::VFMULCPHZrmb:
6511 case X86::VFMULCPHZrmbkz:
6512 case X86::VFMULCPHZrmkz:
6513 case X86::VFMULCPHZrr:
6514 case X86::VFMULCPHZrrb:
6515 case X86::VFMULCPHZrrbkz:
6516 case X86::VFMULCPHZrrkz:
6517 case X86::VFCMULCSHZrm:
6518 case X86::VFCMULCSHZrmkz:
6519 case X86::VFCMULCSHZrr:
6520 case X86::VFCMULCSHZrrb:
6521 case X86::VFCMULCSHZrrbkz:
6522 case X86::VFCMULCSHZrrkz:
6523 case X86::VFMULCSHZrm:
6524 case X86::VFMULCSHZrmkz:
6525 case X86::VFMULCSHZrr:
6526 case X86::VFMULCSHZrrb:
6527 case X86::VFMULCSHZrrbkz:
6528 case X86::VFMULCSHZrrkz:
6529 return Subtarget.hasMULCFalseDeps();
6530 case X86::VPERMDYrm:
6531 case X86::VPERMDYrr:
6532 case X86::VPERMQYmi:
6533 case X86::VPERMQYri:
6534 case X86::VPERMPSYrm:
6535 case X86::VPERMPSYrr:
6536 case X86::VPERMPDYmi:
6537 case X86::VPERMPDYri:
6538 case X86::VPERMDZ256rm:
6539 case X86::VPERMDZ256rmb:
6540 case X86::VPERMDZ256rmbkz:
6541 case X86::VPERMDZ256rmkz:
6542 case X86::VPERMDZ256rr:
6543 case X86::VPERMDZ256rrkz:
6544 case X86::VPERMDZrm:
6545 case X86::VPERMDZrmb:
6546 case X86::VPERMDZrmbkz:
6547 case X86::VPERMDZrmkz:
6548 case X86::VPERMDZrr:
6549 case X86::VPERMDZrrkz:
6550 case X86::VPERMQZ256mbi:
6551 case X86::VPERMQZ256mbikz:
6552 case X86::VPERMQZ256mi:
6553 case X86::VPERMQZ256mikz:
6554 case X86::VPERMQZ256ri:
6555 case X86::VPERMQZ256rikz:
6556 case X86::VPERMQZ256rm:
6557 case X86::VPERMQZ256rmb:
6558 case X86::VPERMQZ256rmbkz:
6559 case X86::VPERMQZ256rmkz:
6560 case X86::VPERMQZ256rr:
6561 case X86::VPERMQZ256rrkz:
6562 case X86::VPERMQZmbi:
6563 case X86::VPERMQZmbikz:
6564 case X86::VPERMQZmi:
6565 case X86::VPERMQZmikz:
6566 case X86::VPERMQZri:
6567 case X86::VPERMQZrikz:
6568 case X86::VPERMQZrm:
6569 case X86::VPERMQZrmb:
6570 case X86::VPERMQZrmbkz:
6571 case X86::VPERMQZrmkz:
6572 case X86::VPERMQZrr:
6573 case X86::VPERMQZrrkz:
6574 case X86::VPERMPSZ256rm:
6575 case X86::VPERMPSZ256rmb:
6576 case X86::VPERMPSZ256rmbkz:
6577 case X86::VPERMPSZ256rmkz:
6578 case X86::VPERMPSZ256rr:
6579 case X86::VPERMPSZ256rrkz:
6580 case X86::VPERMPSZrm:
6581 case X86::VPERMPSZrmb:
6582 case X86::VPERMPSZrmbkz:
6583 case X86::VPERMPSZrmkz:
6584 case X86::VPERMPSZrr:
6585 case X86::VPERMPSZrrkz:
6586 case X86::VPERMPDZ256mbi:
6587 case X86::VPERMPDZ256mbikz:
6588 case X86::VPERMPDZ256mi:
6589 case X86::VPERMPDZ256mikz:
6590 case X86::VPERMPDZ256ri:
6591 case X86::VPERMPDZ256rikz:
6592 case X86::VPERMPDZ256rm:
6593 case X86::VPERMPDZ256rmb:
6594 case X86::VPERMPDZ256rmbkz:
6595 case X86::VPERMPDZ256rmkz:
6596 case X86::VPERMPDZ256rr:
6597 case X86::VPERMPDZ256rrkz:
6598 case X86::VPERMPDZmbi:
6599 case X86::VPERMPDZmbikz:
6600 case X86::VPERMPDZmi:
6601 case X86::VPERMPDZmikz:
6602 case X86::VPERMPDZri:
6603 case X86::VPERMPDZrikz:
6604 case X86::VPERMPDZrm:
6605 case X86::VPERMPDZrmb:
6606 case X86::VPERMPDZrmbkz:
6607 case X86::VPERMPDZrmkz:
6608 case X86::VPERMPDZrr:
6609 case X86::VPERMPDZrrkz:
6610 return Subtarget.hasPERMFalseDeps();
6611 case X86::VRANGEPDZ128rmbi:
6612 case X86::VRANGEPDZ128rmbikz:
6613 case X86::VRANGEPDZ128rmi:
6614 case X86::VRANGEPDZ128rmikz:
6615 case X86::VRANGEPDZ128rri:
6616 case X86::VRANGEPDZ128rrikz:
6617 case X86::VRANGEPDZ256rmbi:
6618 case X86::VRANGEPDZ256rmbikz:
6619 case X86::VRANGEPDZ256rmi:
6620 case X86::VRANGEPDZ256rmikz:
6621 case X86::VRANGEPDZ256rri:
6622 case X86::VRANGEPDZ256rrikz:
6623 case X86::VRANGEPDZrmbi:
6624 case X86::VRANGEPDZrmbikz:
6625 case X86::VRANGEPDZrmi:
6626 case X86::VRANGEPDZrmikz:
6627 case X86::VRANGEPDZrri:
6628 case X86::VRANGEPDZrrib:
6629 case X86::VRANGEPDZrribkz:
6630 case X86::VRANGEPDZrrikz:
6631 case X86::VRANGEPSZ128rmbi:
6632 case X86::VRANGEPSZ128rmbikz:
6633 case X86::VRANGEPSZ128rmi:
6634 case X86::VRANGEPSZ128rmikz:
6635 case X86::VRANGEPSZ128rri:
6636 case X86::VRANGEPSZ128rrikz:
6637 case X86::VRANGEPSZ256rmbi:
6638 case X86::VRANGEPSZ256rmbikz:
6639 case X86::VRANGEPSZ256rmi:
6640 case X86::VRANGEPSZ256rmikz:
6641 case X86::VRANGEPSZ256rri:
6642 case X86::VRANGEPSZ256rrikz:
6643 case X86::VRANGEPSZrmbi:
6644 case X86::VRANGEPSZrmbikz:
6645 case X86::VRANGEPSZrmi:
6646 case X86::VRANGEPSZrmikz:
6647 case X86::VRANGEPSZrri:
6648 case X86::VRANGEPSZrrib:
6649 case X86::VRANGEPSZrribkz:
6650 case X86::VRANGEPSZrrikz:
6651 case X86::VRANGESDZrmi:
6652 case X86::VRANGESDZrmikz:
6653 case X86::VRANGESDZrri:
6654 case X86::VRANGESDZrrib:
6655 case X86::VRANGESDZrribkz:
6656 case X86::VRANGESDZrrikz:
6657 case X86::VRANGESSZrmi:
6658 case X86::VRANGESSZrmikz:
6659 case X86::VRANGESSZrri:
6660 case X86::VRANGESSZrrib:
6661 case X86::VRANGESSZrribkz:
6662 case X86::VRANGESSZrrikz:
6663 return Subtarget.hasRANGEFalseDeps();
6664 case X86::VGETMANTSSZrmi:
6665 case X86::VGETMANTSSZrmikz:
6666 case X86::VGETMANTSSZrri:
6667 case X86::VGETMANTSSZrrib:
6668 case X86::VGETMANTSSZrribkz:
6669 case X86::VGETMANTSSZrrikz:
6670 case X86::VGETMANTSDZrmi:
6671 case X86::VGETMANTSDZrmikz:
6672 case X86::VGETMANTSDZrri:
6673 case X86::VGETMANTSDZrrib:
6674 case X86::VGETMANTSDZrribkz:
6675 case X86::VGETMANTSDZrrikz:
6676 case X86::VGETMANTSHZrmi:
6677 case X86::VGETMANTSHZrmikz:
6678 case X86::VGETMANTSHZrri:
6679 case X86::VGETMANTSHZrrib:
6680 case X86::VGETMANTSHZrribkz:
6681 case X86::VGETMANTSHZrrikz:
6682 case X86::VGETMANTPSZ128rmbi:
6683 case X86::VGETMANTPSZ128rmbikz:
6684 case X86::VGETMANTPSZ128rmi:
6685 case X86::VGETMANTPSZ128rmikz:
6686 case X86::VGETMANTPSZ256rmbi:
6687 case X86::VGETMANTPSZ256rmbikz:
6688 case X86::VGETMANTPSZ256rmi:
6689 case X86::VGETMANTPSZ256rmikz:
6690 case X86::VGETMANTPSZrmbi:
6691 case X86::VGETMANTPSZrmbikz:
6692 case X86::VGETMANTPSZrmi:
6693 case X86::VGETMANTPSZrmikz:
6694 case X86::VGETMANTPDZ128rmbi:
6695 case X86::VGETMANTPDZ128rmbikz:
6696 case X86::VGETMANTPDZ128rmi:
6697 case X86::VGETMANTPDZ128rmikz:
6698 case X86::VGETMANTPDZ256rmbi:
6699 case X86::VGETMANTPDZ256rmbikz:
6700 case X86::VGETMANTPDZ256rmi:
6701 case X86::VGETMANTPDZ256rmikz:
6702 case X86::VGETMANTPDZrmbi:
6703 case X86::VGETMANTPDZrmbikz:
6704 case X86::VGETMANTPDZrmi:
6705 case X86::VGETMANTPDZrmikz:
6706 return Subtarget.hasGETMANTFalseDeps();
6707 case X86::VPMULLQZ128rm:
6708 case X86::VPMULLQZ128rmb:
6709 case X86::VPMULLQZ128rmbkz:
6710 case X86::VPMULLQZ128rmkz:
6711 case X86::VPMULLQZ128rr:
6712 case X86::VPMULLQZ128rrkz:
6713 case X86::VPMULLQZ256rm:
6714 case X86::VPMULLQZ256rmb:
6715 case X86::VPMULLQZ256rmbkz:
6716 case X86::VPMULLQZ256rmkz:
6717 case X86::VPMULLQZ256rr:
6718 case X86::VPMULLQZ256rrkz:
6719 case X86::VPMULLQZrm:
6720 case X86::VPMULLQZrmb:
6721 case X86::VPMULLQZrmbkz:
6722 case X86::VPMULLQZrmkz:
6723 case X86::VPMULLQZrr:
6724 case X86::VPMULLQZrrkz:
6725 return Subtarget.hasMULLQFalseDeps();
6726 // GPR
6727 case X86::POPCNT32rm:
6728 case X86::POPCNT32rr:
6729 case X86::POPCNT64rm:
6730 case X86::POPCNT64rr:
6731 return Subtarget.hasPOPCNTFalseDeps();
6732 case X86::LZCNT32rm:
6733 case X86::LZCNT32rr:
6734 case X86::LZCNT64rm:
6735 case X86::LZCNT64rr:
6736 case X86::TZCNT32rm:
6737 case X86::TZCNT32rr:
6738 case X86::TZCNT64rm:
6739 case X86::TZCNT64rr:
6740 return Subtarget.hasLZCNTFalseDeps();
6741 }
6742
6743 return false;
6744}
6745
6746/// Inform the BreakFalseDeps pass how many idle
6747/// instructions we would like before a partial register update.
6748unsigned X86InstrInfo::getPartialRegUpdateClearance(
6749 const MachineInstr &MI, unsigned OpNum,
6750 const TargetRegisterInfo *TRI) const {
6751
6752 if (OpNum != 0)
6753 return 0;
6754
6755 // NDD ops with 8/16b results may appear to be partial register
6756 // updates after register allocation.
6757 bool HasNDDPartialWrite = false;
6758 if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
6759 Register Reg = MI.getOperand(0).getReg();
6760 if (!Reg.isVirtual())
6761 HasNDDPartialWrite =
6762 X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
6763 }
6764
6765 if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
6766 return 0;
6767
6768 // Check if the result register is also used as a source.
6769 // For non-NDD ops, this means a partial update is wanted, hence we return 0.
6770 // For NDD ops, this means it is possible to compress the instruction
6771 // to a legacy form in CompressEVEX, which would create an unwanted partial
6772 // update, so we return the clearance.
6773 const MachineOperand &MO = MI.getOperand(0);
6774 Register Reg = MO.getReg();
6775 bool ReadsReg = false;
6776 if (Reg.isVirtual())
6777 ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
6778 else
6779 ReadsReg = MI.readsRegister(Reg, TRI);
6780 if (ReadsReg != HasNDDPartialWrite)
6781 return 0;
6782
6783 // If any instructions in the clearance range are reading Reg, insert a
6784 // dependency breaking instruction, which is inexpensive and is likely to
6785 // be hidden in other instructions' cycles.
6786 return PartialRegUpdateClearance;
6787}
6788
6789 // Return true for any instruction that copies the high bits of the first
6790 // source operand into the unused high bits of the destination operand.
6791// Also returns true for instructions that have two inputs where one may
6792// be undef and we want it to use the same register as the other input.
6793static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
6794 bool ForLoadFold = false) {
6795 // Set the OpNum parameter to the first source operand.
6796 switch (Opcode) {
6797 case X86::MMX_PUNPCKHBWrr:
6798 case X86::MMX_PUNPCKHWDrr:
6799 case X86::MMX_PUNPCKHDQrr:
6800 case X86::MMX_PUNPCKLBWrr:
6801 case X86::MMX_PUNPCKLWDrr:
6802 case X86::MMX_PUNPCKLDQrr:
6803 case X86::MOVHLPSrr:
6804 case X86::PACKSSWBrr:
6805 case X86::PACKUSWBrr:
6806 case X86::PACKSSDWrr:
6807 case X86::PACKUSDWrr:
6808 case X86::PUNPCKHBWrr:
6809 case X86::PUNPCKLBWrr:
6810 case X86::PUNPCKHWDrr:
6811 case X86::PUNPCKLWDrr:
6812 case X86::PUNPCKHDQrr:
6813 case X86::PUNPCKLDQrr:
6814 case X86::PUNPCKHQDQrr:
6815 case X86::PUNPCKLQDQrr:
6816 case X86::SHUFPDrri:
6817 case X86::SHUFPSrri:
6818 // These instructions are sometimes used with an undef first or second
6819 // source. Return true here so BreakFalseDeps will assign this source to the
6820 // same register as the first source to avoid a false dependency.
6821 // Operand 1 of these instructions is tied so they're separate from their
6822 // VEX counterparts.
6823 return OpNum == 2 && !ForLoadFold;
6824
6825 case X86::VMOVLHPSrr:
6826 case X86::VMOVLHPSZrr:
6827 case X86::VPACKSSWBrr:
6828 case X86::VPACKUSWBrr:
6829 case X86::VPACKSSDWrr:
6830 case X86::VPACKUSDWrr:
6831 case X86::VPACKSSWBZ128rr:
6832 case X86::VPACKUSWBZ128rr:
6833 case X86::VPACKSSDWZ128rr:
6834 case X86::VPACKUSDWZ128rr:
6835 case X86::VPERM2F128rri:
6836 case X86::VPERM2I128rri:
6837 case X86::VSHUFF32X4Z256rri:
6838 case X86::VSHUFF32X4Zrri:
6839 case X86::VSHUFF64X2Z256rri:
6840 case X86::VSHUFF64X2Zrri:
6841 case X86::VSHUFI32X4Z256rri:
6842 case X86::VSHUFI32X4Zrri:
6843 case X86::VSHUFI64X2Z256rri:
6844 case X86::VSHUFI64X2Zrri:
6845 case X86::VPUNPCKHBWrr:
6846 case X86::VPUNPCKLBWrr:
6847 case X86::VPUNPCKHBWYrr:
6848 case X86::VPUNPCKLBWYrr:
6849 case X86::VPUNPCKHBWZ128rr:
6850 case X86::VPUNPCKLBWZ128rr:
6851 case X86::VPUNPCKHBWZ256rr:
6852 case X86::VPUNPCKLBWZ256rr:
6853 case X86::VPUNPCKHBWZrr:
6854 case X86::VPUNPCKLBWZrr:
6855 case X86::VPUNPCKHWDrr:
6856 case X86::VPUNPCKLWDrr:
6857 case X86::VPUNPCKHWDYrr:
6858 case X86::VPUNPCKLWDYrr:
6859 case X86::VPUNPCKHWDZ128rr:
6860 case X86::VPUNPCKLWDZ128rr:
6861 case X86::VPUNPCKHWDZ256rr:
6862 case X86::VPUNPCKLWDZ256rr:
6863 case X86::VPUNPCKHWDZrr:
6864 case X86::VPUNPCKLWDZrr:
6865 case X86::VPUNPCKHDQrr:
6866 case X86::VPUNPCKLDQrr:
6867 case X86::VPUNPCKHDQYrr:
6868 case X86::VPUNPCKLDQYrr:
6869 case X86::VPUNPCKHDQZ128rr:
6870 case X86::VPUNPCKLDQZ128rr:
6871 case X86::VPUNPCKHDQZ256rr:
6872 case X86::VPUNPCKLDQZ256rr:
6873 case X86::VPUNPCKHDQZrr:
6874 case X86::VPUNPCKLDQZrr:
6875 case X86::VPUNPCKHQDQrr:
6876 case X86::VPUNPCKLQDQrr:
6877 case X86::VPUNPCKHQDQYrr:
6878 case X86::VPUNPCKLQDQYrr:
6879 case X86::VPUNPCKHQDQZ128rr:
6880 case X86::VPUNPCKLQDQZ128rr:
6881 case X86::VPUNPCKHQDQZ256rr:
6882 case X86::VPUNPCKLQDQZ256rr:
6883 case X86::VPUNPCKHQDQZrr:
6884 case X86::VPUNPCKLQDQZrr:
6885 // These instructions are sometimes used with an undef first or second
6886 // source. Return true here so BreakFalseDeps will assign this source to the
6887 // same register as the first source to avoid a false dependency.
6888 return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
6889
6890 case X86::VCVTSI2SSrr:
6891 case X86::VCVTSI2SSrm:
6892 case X86::VCVTSI2SSrr_Int:
6893 case X86::VCVTSI2SSrm_Int:
6894 case X86::VCVTSI642SSrr:
6895 case X86::VCVTSI642SSrm:
6896 case X86::VCVTSI642SSrr_Int:
6897 case X86::VCVTSI642SSrm_Int:
6898 case X86::VCVTSI2SDrr:
6899 case X86::VCVTSI2SDrm:
6900 case X86::VCVTSI2SDrr_Int:
6901 case X86::VCVTSI2SDrm_Int:
6902 case X86::VCVTSI642SDrr:
6903 case X86::VCVTSI642SDrm:
6904 case X86::VCVTSI642SDrr_Int:
6905 case X86::VCVTSI642SDrm_Int:
6906 // AVX-512
6907 case X86::VCVTSI2SSZrr:
6908 case X86::VCVTSI2SSZrm:
6909 case X86::VCVTSI2SSZrr_Int:
6910 case X86::VCVTSI2SSZrrb_Int:
6911 case X86::VCVTSI2SSZrm_Int:
6912 case X86::VCVTSI642SSZrr:
6913 case X86::VCVTSI642SSZrm:
6914 case X86::VCVTSI642SSZrr_Int:
6915 case X86::VCVTSI642SSZrrb_Int:
6916 case X86::VCVTSI642SSZrm_Int:
6917 case X86::VCVTSI2SDZrr:
6918 case X86::VCVTSI2SDZrm:
6919 case X86::VCVTSI2SDZrr_Int:
6920 case X86::VCVTSI2SDZrm_Int:
6921 case X86::VCVTSI642SDZrr:
6922 case X86::VCVTSI642SDZrm:
6923 case X86::VCVTSI642SDZrr_Int:
6924 case X86::VCVTSI642SDZrrb_Int:
6925 case X86::VCVTSI642SDZrm_Int:
6926 case X86::VCVTUSI2SSZrr:
6927 case X86::VCVTUSI2SSZrm:
6928 case X86::VCVTUSI2SSZrr_Int:
6929 case X86::VCVTUSI2SSZrrb_Int:
6930 case X86::VCVTUSI2SSZrm_Int:
6931 case X86::VCVTUSI642SSZrr:
6932 case X86::VCVTUSI642SSZrm:
6933 case X86::VCVTUSI642SSZrr_Int:
6934 case X86::VCVTUSI642SSZrrb_Int:
6935 case X86::VCVTUSI642SSZrm_Int:
6936 case X86::VCVTUSI2SDZrr:
6937 case X86::VCVTUSI2SDZrm:
6938 case X86::VCVTUSI2SDZrr_Int:
6939 case X86::VCVTUSI2SDZrm_Int:
6940 case X86::VCVTUSI642SDZrr:
6941 case X86::VCVTUSI642SDZrm:
6942 case X86::VCVTUSI642SDZrr_Int:
6943 case X86::VCVTUSI642SDZrrb_Int:
6944 case X86::VCVTUSI642SDZrm_Int:
6945 case X86::VCVTSI2SHZrr:
6946 case X86::VCVTSI2SHZrm:
6947 case X86::VCVTSI2SHZrr_Int:
6948 case X86::VCVTSI2SHZrrb_Int:
6949 case X86::VCVTSI2SHZrm_Int:
6950 case X86::VCVTSI642SHZrr:
6951 case X86::VCVTSI642SHZrm:
6952 case X86::VCVTSI642SHZrr_Int:
6953 case X86::VCVTSI642SHZrrb_Int:
6954 case X86::VCVTSI642SHZrm_Int:
6955 case X86::VCVTUSI2SHZrr:
6956 case X86::VCVTUSI2SHZrm:
6957 case X86::VCVTUSI2SHZrr_Int:
6958 case X86::VCVTUSI2SHZrrb_Int:
6959 case X86::VCVTUSI2SHZrm_Int:
6960 case X86::VCVTUSI642SHZrr:
6961 case X86::VCVTUSI642SHZrm:
6962 case X86::VCVTUSI642SHZrr_Int:
6963 case X86::VCVTUSI642SHZrrb_Int:
6964 case X86::VCVTUSI642SHZrm_Int:
6965 // Load folding won't affect the undef register update since the input is
6966 // a GPR.
6967 return OpNum == 1 && !ForLoadFold;
6968 case X86::VCVTSD2SSrr:
6969 case X86::VCVTSD2SSrm:
6970 case X86::VCVTSD2SSrr_Int:
6971 case X86::VCVTSD2SSrm_Int:
6972 case X86::VCVTSS2SDrr:
6973 case X86::VCVTSS2SDrm:
6974 case X86::VCVTSS2SDrr_Int:
6975 case X86::VCVTSS2SDrm_Int:
6976 case X86::VRCPSSr:
6977 case X86::VRCPSSr_Int:
6978 case X86::VRCPSSm:
6979 case X86::VRCPSSm_Int:
6980 case X86::VROUNDSDri:
6981 case X86::VROUNDSDmi:
6982 case X86::VROUNDSDri_Int:
6983 case X86::VROUNDSDmi_Int:
6984 case X86::VROUNDSSri:
6985 case X86::VROUNDSSmi:
6986 case X86::VROUNDSSri_Int:
6987 case X86::VROUNDSSmi_Int:
6988 case X86::VRSQRTSSr:
6989 case X86::VRSQRTSSr_Int:
6990 case X86::VRSQRTSSm:
6991 case X86::VRSQRTSSm_Int:
6992 case X86::VSQRTSSr:
6993 case X86::VSQRTSSr_Int:
6994 case X86::VSQRTSSm:
6995 case X86::VSQRTSSm_Int:
6996 case X86::VSQRTSDr:
6997 case X86::VSQRTSDr_Int:
6998 case X86::VSQRTSDm:
6999 case X86::VSQRTSDm_Int:
7000 // AVX-512
7001 case X86::VCVTSD2SSZrr:
7002 case X86::VCVTSD2SSZrr_Int:
7003 case X86::VCVTSD2SSZrrb_Int:
7004 case X86::VCVTSD2SSZrm:
7005 case X86::VCVTSD2SSZrm_Int:
7006 case X86::VCVTSS2SDZrr:
7007 case X86::VCVTSS2SDZrr_Int:
7008 case X86::VCVTSS2SDZrrb_Int:
7009 case X86::VCVTSS2SDZrm:
7010 case X86::VCVTSS2SDZrm_Int:
7011 case X86::VGETEXPSDZr:
7012 case X86::VGETEXPSDZrb:
7013 case X86::VGETEXPSDZm:
7014 case X86::VGETEXPSSZr:
7015 case X86::VGETEXPSSZrb:
7016 case X86::VGETEXPSSZm:
7017 case X86::VGETMANTSDZrri:
7018 case X86::VGETMANTSDZrrib:
7019 case X86::VGETMANTSDZrmi:
7020 case X86::VGETMANTSSZrri:
7021 case X86::VGETMANTSSZrrib:
7022 case X86::VGETMANTSSZrmi:
7023 case X86::VRNDSCALESDZrri:
7024 case X86::VRNDSCALESDZrri_Int:
7025 case X86::VRNDSCALESDZrrib_Int:
7026 case X86::VRNDSCALESDZrmi:
7027 case X86::VRNDSCALESDZrmi_Int:
7028 case X86::VRNDSCALESSZrri:
7029 case X86::VRNDSCALESSZrri_Int:
7030 case X86::VRNDSCALESSZrrib_Int:
7031 case X86::VRNDSCALESSZrmi:
7032 case X86::VRNDSCALESSZrmi_Int:
7033 case X86::VRCP14SDZrr:
7034 case X86::VRCP14SDZrm:
7035 case X86::VRCP14SSZrr:
7036 case X86::VRCP14SSZrm:
7037 case X86::VRCPSHZrr:
7038 case X86::VRCPSHZrm:
7039 case X86::VRSQRTSHZrr:
7040 case X86::VRSQRTSHZrm:
7041 case X86::VREDUCESHZrmi:
7042 case X86::VREDUCESHZrri:
7043 case X86::VREDUCESHZrrib:
7044 case X86::VGETEXPSHZr:
7045 case X86::VGETEXPSHZrb:
7046 case X86::VGETEXPSHZm:
7047 case X86::VGETMANTSHZrri:
7048 case X86::VGETMANTSHZrrib:
7049 case X86::VGETMANTSHZrmi:
7050 case X86::VRNDSCALESHZrri:
7051 case X86::VRNDSCALESHZrri_Int:
7052 case X86::VRNDSCALESHZrrib_Int:
7053 case X86::VRNDSCALESHZrmi:
7054 case X86::VRNDSCALESHZrmi_Int:
7055 case X86::VSQRTSHZr:
7056 case X86::VSQRTSHZr_Int:
7057 case X86::VSQRTSHZrb_Int:
7058 case X86::VSQRTSHZm:
7059 case X86::VSQRTSHZm_Int:
7060 case X86::VRCP28SDZr:
7061 case X86::VRCP28SDZrb:
7062 case X86::VRCP28SDZm:
7063 case X86::VRCP28SSZr:
7064 case X86::VRCP28SSZrb:
7065 case X86::VRCP28SSZm:
7066 case X86::VREDUCESSZrmi:
7067 case X86::VREDUCESSZrri:
7068 case X86::VREDUCESSZrrib:
7069 case X86::VRSQRT14SDZrr:
7070 case X86::VRSQRT14SDZrm:
7071 case X86::VRSQRT14SSZrr:
7072 case X86::VRSQRT14SSZrm:
7073 case X86::VRSQRT28SDZr:
7074 case X86::VRSQRT28SDZrb:
7075 case X86::VRSQRT28SDZm:
7076 case X86::VRSQRT28SSZr:
7077 case X86::VRSQRT28SSZrb:
7078 case X86::VRSQRT28SSZm:
7079 case X86::VSQRTSSZr:
7080 case X86::VSQRTSSZr_Int:
7081 case X86::VSQRTSSZrb_Int:
7082 case X86::VSQRTSSZm:
7083 case X86::VSQRTSSZm_Int:
7084 case X86::VSQRTSDZr:
7085 case X86::VSQRTSDZr_Int:
7086 case X86::VSQRTSDZrb_Int:
7087 case X86::VSQRTSDZm:
7088 case X86::VSQRTSDZm_Int:
7089 case X86::VCVTSD2SHZrr:
7090 case X86::VCVTSD2SHZrr_Int:
7091 case X86::VCVTSD2SHZrrb_Int:
7092 case X86::VCVTSD2SHZrm:
7093 case X86::VCVTSD2SHZrm_Int:
7094 case X86::VCVTSS2SHZrr:
7095 case X86::VCVTSS2SHZrr_Int:
7096 case X86::VCVTSS2SHZrrb_Int:
7097 case X86::VCVTSS2SHZrm:
7098 case X86::VCVTSS2SHZrm_Int:
7099 case X86::VCVTSH2SDZrr:
7100 case X86::VCVTSH2SDZrr_Int:
7101 case X86::VCVTSH2SDZrrb_Int:
7102 case X86::VCVTSH2SDZrm:
7103 case X86::VCVTSH2SDZrm_Int:
7104 case X86::VCVTSH2SSZrr:
7105 case X86::VCVTSH2SSZrr_Int:
7106 case X86::VCVTSH2SSZrrb_Int:
7107 case X86::VCVTSH2SSZrm:
7108 case X86::VCVTSH2SSZrm_Int:
7109 return OpNum == 1;
7110 case X86::VMOVSSZrrk:
7111 case X86::VMOVSDZrrk:
7112 return OpNum == 3 && !ForLoadFold;
7113 case X86::VMOVSSZrrkz:
7114 case X86::VMOVSDZrrkz:
7115 return OpNum == 2 && !ForLoadFold;
7116 }
7117
7118 return false;
7119}
7120
7121/// Inform the BreakFalseDeps pass how many idle instructions we would like
7122/// before certain undef register reads.
7123///
7124/// This catches the VCVTSI2SD family of instructions:
7125///
7126/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
7127///
7128 /// We should be careful *not* to catch VXOR idioms, which are presumably
7129/// handled specially in the pipeline:
7130///
7131/// vxorps undef %xmm1, undef %xmm1, %xmm1
7132///
7133/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
7134/// high bits that are passed-through are not live.
7135unsigned
7136X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
7137 const TargetRegisterInfo *TRI) const {
7138 const MachineOperand &MO = MI.getOperand(OpNum);
7139 if (MO.getReg().isPhysical() && hasUndefRegUpdate(MI.getOpcode(), OpNum))
7140 return UndefRegClearance;
7141
7142 return 0;
7143}
7144
7145void X86InstrInfo::breakPartialRegDependency(
7146 MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
7147 Register Reg = MI.getOperand(OpNum).getReg();
7148 // If MI kills this register, the false dependence is already broken.
7149 if (MI.killsRegister(Reg, TRI))
7150 return;
7151
7152 if (X86::VR128RegClass.contains(Reg)) {
7153 // These instructions are all floating point domain, so xorps is the best
7154 // choice.
7155 unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
7156 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
7157 .addReg(Reg, RegState::Undef)
7158 .addReg(Reg, RegState::Undef);
7159 MI.addRegisterKilled(Reg, TRI, true);
7160 } else if (X86::VR256RegClass.contains(Reg)) {
7161 // Use vxorps to clear the full ymm register.
7162 // It wants to read and write the xmm sub-register.
7163 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7164 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
7165 .addReg(XReg, RegState::Undef)
7166 .addReg(XReg, RegState::Undef)
7167 .addReg(Reg, RegState::ImplicitDefine);
7168 MI.addRegisterKilled(Reg, TRI, true);
7169 } else if (X86::VR128XRegClass.contains(Reg)) {
7170 // Only handle VLX targets.
7171 if (!Subtarget.hasVLX())
7172 return;
7173 // Since vxorps requires AVX512DQ, vpxord should be the best choice.
7174 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
7175 .addReg(Reg, RegState::Undef)
7176 .addReg(Reg, RegState::Undef);
7177 MI.addRegisterKilled(Reg, TRI, true);
7178 } else if (X86::VR256XRegClass.contains(Reg) ||
7179 X86::VR512RegClass.contains(Reg)) {
7180 // Only handle VLX targets.
7181 if (!Subtarget.hasVLX())
7182 return;
7183 // Use vpxord to clear the full ymm/zmm register.
7184 // It wants to read and write the xmm sub-register.
7185 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7186 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
7187 .addReg(XReg, RegState::Undef)
7188 .addReg(XReg, RegState::Undef)
7189 .addReg(Reg, RegState::ImplicitDefine);
7190 MI.addRegisterKilled(Reg, TRI, true);
7191 } else if (X86::GR64RegClass.contains(Reg)) {
7192 // Use XOR32rr because it has a shorter encoding and zeros the upper bits
7193 // as well.
7194 Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
7195 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
7196 .addReg(XReg, RegState::Undef)
7197 .addReg(XReg, RegState::Undef)
7198 .addReg(Reg, RegState::ImplicitDefine);
7199 MI.addRegisterKilled(Reg, TRI, true);
7200 } else if (X86::GR32RegClass.contains(Reg)) {
7201 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
7202 .addReg(Reg, RegState::Undef)
7203 .addReg(Reg, RegState::Undef);
7204 MI.addRegisterKilled(Reg, TRI, true);
7205 } else if ((X86::GR16RegClass.contains(Reg) ||
7206 X86::GR8RegClass.contains(Reg)) &&
7207 X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
7208 // This case is only expected for NDD ops, which appear to be partial
7209 // writes but are not, thanks to the zeroing of the upper part. Here
7210 // we add an implicit def of the super-register, which prevents
7211 // CompressEVEX from converting this to a legacy form.
7212 Register SuperReg = getX86SubSuperRegister(Reg, 64);
7213 MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
7214 if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
7215 BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
7216 }
7217}
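// Hand-written example of the rewrite performed above for a VR128 register
// (assembly is illustrative, not compiler output):
//
//   before:  vcvtsi2sdq %rax, undef %xmm0, %xmm0
//   after:   vxorps     undef %xmm0, undef %xmm0, %xmm0
//            vcvtsi2sdq %rax, killed %xmm0, %xmm0
//
// The inserted zeroing idiom has no real input dependence, so the false
// dependence on the previous writer of %xmm0 disappears.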
7218
7219 static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
7220 int PtrOffset = 0) {
7221 unsigned NumAddrOps = MOs.size();
7222
7223 if (NumAddrOps < 4) {
7224 // FrameIndex only - add an immediate offset (whether it's zero or not).
7225 for (unsigned i = 0; i != NumAddrOps; ++i)
7226 MIB.add(MOs[i]);
7227 addOffset(MIB, PtrOffset);
7228 } else {
7229 // General Memory Addressing - we need to add any offset to an existing
7230 // offset.
7231 assert(MOs.size() == 5 && "Unexpected memory operand list length");
7232 for (unsigned i = 0; i != NumAddrOps; ++i) {
7233 const MachineOperand &MO = MOs[i];
7234 if (i == 3 && PtrOffset != 0) {
7235 MIB.addDisp(MO, PtrOffset);
7236 } else {
7237 MIB.add(MO);
7238 }
7239 }
7240 }
7241}
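// Note: a full memory reference here always carries the usual five X86 address
// operands in order (base, scale, index, displacement, segment), so index 3 is
// the displacement; that is why a non-zero PtrOffset is merged into operand 3
// via addDisp() above.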
7242
7243 static void updateOperandRegConstraints(MachineFunction &MF,
7244 MachineInstr &NewMI,
7245 const TargetInstrInfo &TII) {
7246 MachineRegisterInfo &MRI = MF.getRegInfo();
7247 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
7248
7249 for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
7250 MachineOperand &MO = NewMI.getOperand(Idx);
7251 // We only need to update constraints on virtual register operands.
7252 if (!MO.isReg())
7253 continue;
7254 Register Reg = MO.getReg();
7255 if (!Reg.isVirtual())
7256 continue;
7257
7258 auto *NewRC =
7259 MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI));
7260 if (!NewRC) {
7261 LLVM_DEBUG(
7262 dbgs() << "WARNING: Unable to update register constraint for operand "
7263 << Idx << " of instruction:\n";
7264 NewMI.dump(); dbgs() << "\n");
7265 }
7266 }
7267}
7268
7269 static MachineInstr *fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
7270 ArrayRef<MachineOperand> MOs,
7271 MachineBasicBlock::iterator InsertPt,
7272 MachineInstr &MI,
7273 const TargetInstrInfo &TII) {
7274 // Create the base instruction with the memory operand as the first part.
7275 // Omit the implicit operands, something BuildMI can't do.
7276 MachineInstr *NewMI =
7277 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7278 MachineInstrBuilder MIB(MF, NewMI);
7279 addOperands(MIB, MOs);
7280
7281 // Loop over the rest of the ri operands, converting them over.
7282 unsigned NumOps = MI.getDesc().getNumOperands() - 2;
7283 for (unsigned i = 0; i != NumOps; ++i) {
7284 MachineOperand &MO = MI.getOperand(i + 2);
7285 MIB.add(MO);
7286 }
7287 for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2))
7288 MIB.add(MO);
7289
7290 updateOperandRegConstraints(MF, *NewMI, TII);
7291
7292 MachineBasicBlock *MBB = InsertPt->getParent();
7293 MBB->insert(InsertPt, NewMI);
7294
7295 return MIB;
7296}
7297
7298 static MachineInstr *fuseInst(MachineFunction &MF, unsigned Opcode,
7299 unsigned OpNo, ArrayRef<MachineOperand> MOs,
7300 MachineBasicBlock::iterator InsertPt,
7301 MachineInstr &MI, const TargetInstrInfo &TII,
7302 int PtrOffset = 0) {
7303 // Omit the implicit operands, something BuildMI can't do.
7304 MachineInstr *NewMI =
7305 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7306 MachineInstrBuilder MIB(MF, NewMI);
7307
7308 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
7309 MachineOperand &MO = MI.getOperand(i);
7310 if (i == OpNo) {
7311 assert(MO.isReg() && "Expected to fold into reg operand!");
7312 addOperands(MIB, MOs, PtrOffset);
7313 } else {
7314 MIB.add(MO);
7315 }
7316 }
7317
7318 updateOperandRegConstraints(MF, *NewMI, TII);
7319
7320 // Copy the NoFPExcept flag from the instruction we're fusing.
7321 if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept))
7322 NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept);
7323
7324 MachineBasicBlock *MBB = InsertPt->getParent();
7325 MBB->insert(InsertPt, NewMI);
7326
7327 return MIB;
7328}
7329
7330 static MachineInstr *makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
7331 ArrayRef<MachineOperand> MOs,
7332 MachineBasicBlock::iterator InsertPt,
7333 MachineInstr &MI) {
7334 MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
7335 MI.getDebugLoc(), TII.get(Opcode));
7336 addOperands(MIB, MOs);
7337 return MIB.addImm(0);
7338}
7339
7340MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
7341 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7342 ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
7343 unsigned Size, Align Alignment) const {
7344 switch (MI.getOpcode()) {
7345 case X86::INSERTPSrri:
7346 case X86::VINSERTPSrri:
7347 case X86::VINSERTPSZrri:
7348 // Attempt to convert the load of the inserted vector into a folded load
7349 // of a single float.
7350 if (OpNum == 2) {
7351 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
7352 unsigned ZMask = Imm & 15;
7353 unsigned DstIdx = (Imm >> 4) & 3;
7354 unsigned SrcIdx = (Imm >> 6) & 3;
7355
7356 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7357 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
7358 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7359 if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
7360 (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) {
7361 int PtrOffset = SrcIdx * 4;
7362 unsigned NewImm = (DstIdx << 4) | ZMask;
7363 unsigned NewOpCode =
7364 (MI.getOpcode() == X86::VINSERTPSZrri) ? X86::VINSERTPSZrmi
7365 : (MI.getOpcode() == X86::VINSERTPSrri) ? X86::VINSERTPSrmi
7366 : X86::INSERTPSrmi;
7367 MachineInstr *NewMI =
7368 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
7369 NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
7370 return NewMI;
7371 }
7372 }
7373 break;
7374 case X86::MOVHLPSrr:
7375 case X86::VMOVHLPSrr:
7376 case X86::VMOVHLPSZrr:
7377 // Move the upper 64-bits of the second operand to the lower 64-bits.
7378 // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
7379 // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
7380 if (OpNum == 2) {
7381 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7382 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
7383 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7384 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
7385 unsigned NewOpCode =
7386 (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm
7387 : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
7388 : X86::MOVLPSrm;
7389 MachineInstr *NewMI =
7390 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
7391 return NewMI;
7392 }
7393 }
7394 break;
7395 case X86::UNPCKLPDrr:
7396 // If we won't be able to fold this to the memory form of UNPCKL, use
7397 // MOVHPD instead. Done as custom because we can't have this in the load
7398 // table twice.
7399 if (OpNum == 2) {
7400 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7401 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
7402 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7403 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
7404 MachineInstr *NewMI =
7405 fuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
7406 return NewMI;
7407 }
7408 }
7409 break;
7410 case X86::MOV32r0:
7411 if (auto *NewMI =
7412 makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs,
7413 InsertPt, MI))
7414 return NewMI;
7415 break;
7416 }
7417
7418 return nullptr;
7419}
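// Illustrative example of the INSERTPS custom fold above (immediate layout:
// bits [7:6] = source element, [5:4] = destination element, [3:0] = zero mask):
//
//   %v = VINSERTPSrri %dst, %vecload, imm
//
// becomes a fold of just the addressed float, with the pointer advanced by
// 4 * SrcIdx and the source-element bits cleared from the immediate:
//
//   %v = VINSERTPSrmi %dst, base, scale, index, disp + 4 * SrcIdx, segment,
//                     (DstIdx << 4) | ZMask
//
// (operand spelling is schematic, not actual MIR output).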
7420
7421 static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
7422 MachineInstr &MI) {
7423 if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
7424 !MI.getOperand(1).isReg())
7425 return false;
7426
7427 // There are two cases we need to handle depending on where in the pipeline
7428 // the folding attempt is being made.
7429 // -Register has the undef flag set.
7430 // -Register is produced by the IMPLICIT_DEF instruction.
7431
7432 if (MI.getOperand(1).isUndef())
7433 return true;
7434
7435 MachineRegisterInfo &RegInfo = MF.getRegInfo();
7436 MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
7437 return VRegDef && VRegDef->isImplicitDef();
7438}
7439
7440unsigned X86InstrInfo::commuteOperandsForFold(MachineInstr &MI,
7441 unsigned Idx1) const {
7442 unsigned Idx2 = CommuteAnyOperandIndex;
7443 if (!findCommutedOpIndices(MI, Idx1, Idx2))
7444 return Idx1;
7445
7446 bool HasDef = MI.getDesc().getNumDefs();
7447 Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
7448 Register Reg1 = MI.getOperand(Idx1).getReg();
7449 Register Reg2 = MI.getOperand(Idx2).getReg();
7450 bool Tied1 = 0 == MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO);
7451 bool Tied2 = 0 == MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO);
7452
7453 // If either of the commutable operands is tied to the destination
7454 // then we cannot commute + fold.
7455 if ((HasDef && Reg0 == Reg1 && Tied1) || (HasDef && Reg0 == Reg2 && Tied2))
7456 return Idx1;
7457
7458 return commuteInstruction(MI, false, Idx1, Idx2) ? Idx2 : Idx1;
7459}
7460
7461static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx) {
7462 if (PrintFailedFusing && !MI.isCopy())
7463 dbgs() << "We failed to fuse operand " << Idx << " in " << MI;
7464}
7465
7466 MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7467 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7468 ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
7469 unsigned Size, Align Alignment, bool AllowCommute) const {
7470 bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
7471 unsigned Opc = MI.getOpcode();
7472
7473 // For CPUs that favor the register form of a call or push,
7474 // do not fold loads into calls or pushes, unless optimizing for size
7475 // aggressively.
7476 if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
7477 (Opc == X86::CALL32r || Opc == X86::CALL64r ||
7478 Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r ||
7479 Opc == X86::PUSH32r || Opc == X86::PUSH64r))
7480 return nullptr;
7481
7482 // Avoid partial and undef register update stalls unless optimizing for size.
7483 if (!MF.getFunction().hasOptSize() &&
7484 (hasPartialRegUpdate(Opc, Subtarget, /*ForLoadFold*/ true) ||
7485 shouldPreventUndefRegUpdateMemFold(MF, MI)))
7486 return nullptr;
7487
7488 unsigned NumOps = MI.getDesc().getNumOperands();
7489 bool IsTwoAddr = NumOps > 1 && OpNum < 2 && MI.getOperand(0).isReg() &&
7490 MI.getOperand(1).isReg() &&
7491 MI.getOperand(0).getReg() == MI.getOperand(1).getReg();
7492
7493 // FIXME: AsmPrinter doesn't know how to handle
7494 // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
7495 if (Opc == X86::ADD32ri &&
7496 MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
7497 return nullptr;
7498
7499 // GOTTPOFF relocation loads can only be folded into add instructions.
7500 // FIXME: Need to exclude other relocations that only support specific
7501 // instructions.
7502 if (MOs.size() == X86::AddrNumOperands &&
7503 MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
7504 Opc != X86::ADD64rr)
7505 return nullptr;
7506
7507 // Don't fold loads into indirect calls that need a KCFI check as we'll
7508 // have to unfold these in X86TargetLowering::EmitKCFICheck anyway.
7509 if (MI.isCall() && MI.getCFIType())
7510 return nullptr;
7511
7512 // Attempt to fold any custom cases we have.
7513 if (auto *CustomMI = foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt,
7514 Size, Alignment))
7515 return CustomMI;
7516
7517 // Folding a memory location into the two-address part of a two-address
7518 // instruction is different from folding it elsewhere. It requires
7519 // replacing the *two* registers with the memory location.
7520 //
7521 // Utilize the mapping NonNDD -> RMW for the NDD variant.
7522 unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U;
7523 const X86FoldTableEntry *I =
7524 IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc)
7525 : lookupFoldTable(Opc, OpNum);
7526
7527 MachineInstr *NewMI = nullptr;
7528 if (I) {
7529 unsigned Opcode = I->DstOp;
7530 if (Alignment <
7531 Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
7532 return nullptr;
7533 bool NarrowToMOV32rm = false;
7534 if (Size) {
7536 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI);
7537 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7538 // Check if it's safe to fold the load. If the size of the object is
7539 // narrower than the load width, then it's not.
7540 // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
7541 if ((I->Flags & TB_FOLDED_LOAD) && Size < RCSize) {
7542 // If this is a 64-bit load, but the spill slot is 32, then we can do
7543 // a 32-bit load which is implicitly zero-extended. This likely is
7544 // due to live interval analysis remat'ing a load from stack slot.
7545 if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
7546 return nullptr;
7547 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
7548 return nullptr;
7549 Opcode = X86::MOV32rm;
7550 NarrowToMOV32rm = true;
7551 }
7552 // For stores, make sure the size of the object is equal to the size of
7553 // the store. If the object is larger, the extra bits would be garbage. If
7554 // the object is smaller we might overwrite another object or fault.
7555 if ((I->Flags & TB_FOLDED_STORE) && Size != RCSize)
7556 return nullptr;
7557 }
7558
7559 NewMI = IsTwoAddr ? fuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this)
7560 : fuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
7561
7562 if (NarrowToMOV32rm) {
7563 // This is the special case where we use a MOV32rm to load a 32-bit
7564 // value and zero-extend the top bits. Change the destination register
7565 // to a 32-bit one.
7566 Register DstReg = NewMI->getOperand(0).getReg();
7567 if (DstReg.isPhysical())
7568 NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
7569 else
7570 NewMI->getOperand(0).setSubReg(X86::sub_32bit);
7571 }
7572 return NewMI;
7573 }
7574
7575 if (AllowCommute) {
7576 // If the instruction and target operand are commutable, commute the
7577 // instruction and try again.
7578 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
7579 if (CommuteOpIdx2 == OpNum) {
7580 printFailMsgforFold(MI, OpNum);
7581 return nullptr;
7582 }
7583 // Attempt to fold with the commuted version of the instruction.
7584 NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
7585 Alignment, /*AllowCommute=*/false);
7586 if (NewMI)
7587 return NewMI;
7588 // Folding failed again - undo the commute before returning.
7589 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
7590 }
7591
7592 printFailMsgforFold(MI, OpNum);
7593 return nullptr;
7594}
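// Rough example of the MOV32rm narrowing above: reloading a 64-bit virtual
// register from a 4-byte spill slot is done with a 32-bit load, relying on the
// implicit zero extension of 32-bit register writes; the folded instruction's
// destination is rewritten to the sub_32bit half of the original 64-bit
// register (schematic, not actual MIR output):
//
//   rejected:  %q:gr64 = MOV64rm %stack.0, 1, $noreg, 0, $noreg
//   emitted:   %q.sub_32bit:gr64 = MOV32rm %stack.0, 1, $noreg, 0, $noreg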
7595
7598 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
7599 VirtRegMap *VRM) const {
7600 // Check switch flag
7601 if (NoFusing)
7602 return nullptr;
7603
7604 // Avoid partial and undef register update stalls unless optimizing for size.
7605 if (!MF.getFunction().hasOptSize() &&
7606 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7607 shouldPreventUndefRegUpdateMemFold(MF, MI)))
7608 return nullptr;
7609
7610 // Don't fold subreg spills, or reloads that use a high subreg.
7611 for (auto Op : Ops) {
7612 MachineOperand &MO = MI.getOperand(Op);
7613 auto SubReg = MO.getSubReg();
7614 // MOV32r0 is special b/c it's used to clear a 64-bit register too.
7615 // (See patterns for MOV32r0 in TD files).
7616 if (MI.getOpcode() == X86::MOV32r0 && SubReg == X86::sub_32bit)
7617 continue;
7618 if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
7619 return nullptr;
7620 }
7621
7622 const MachineFrameInfo &MFI = MF.getFrameInfo();
7623 unsigned Size = MFI.getObjectSize(FrameIndex);
7624 Align Alignment = MFI.getObjectAlign(FrameIndex);
7625 // If the function stack isn't realigned we don't want to fold instructions
7626 // that need increased alignment.
7627 if (!RI.hasStackRealignment(MF))
7628 Alignment =
7629 std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
7630
7631 auto Impl = [&]() {
7632 return foldMemoryOperandImpl(MF, MI, Ops[0],
7633 MachineOperand::CreateFI(FrameIndex), InsertPt,
7634 Size, Alignment, /*AllowCommute=*/true);
7635 };
7636 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
7637 unsigned NewOpc = 0;
7638 unsigned RCSize = 0;
7639 unsigned Opc = MI.getOpcode();
7640 switch (Opc) {
7641 default:
7642 // NDD can be folded into RMW though its Op0 and Op1 are not tied.
7643 return (Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U) ? Impl()
7644 : nullptr;
7645 case X86::TEST8rr:
7646 NewOpc = X86::CMP8ri;
7647 RCSize = 1;
7648 break;
7649 case X86::TEST16rr:
7650 NewOpc = X86::CMP16ri;
7651 RCSize = 2;
7652 break;
7653 case X86::TEST32rr:
7654 NewOpc = X86::CMP32ri;
7655 RCSize = 4;
7656 break;
7657 case X86::TEST64rr:
7658 NewOpc = X86::CMP64ri32;
7659 RCSize = 8;
7660 break;
7661 }
7662 // Check if it's safe to fold the load. If the size of the object is
7663 // narrower than the load width, then it's not.
7664 if (Size < RCSize)
7665 return nullptr;
7666 // Change to CMPXXri r, 0 first.
7667 MI.setDesc(get(NewOpc));
7668 MI.getOperand(1).ChangeToImmediate(0);
7669 } else if (Ops.size() != 1)
7670 return nullptr;
7671
7672 return Impl();
7673}
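// Sketch of the TESTrr special case handled above when the tested value has
// been spilled (schematic MIR, not actual output):
//
//   TEST64rr %r, %r, implicit-def $eflags        ; %r lives in %stack.0
//
// is first rewritten to CMP64ri32 %r, 0 and the reload is then folded:
//
//   CMP64mi32 %stack.0, 1, $noreg, 0, $noreg, 0, implicit-def $eflags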
7674
7675/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
7676/// because the latter uses contents that wouldn't be defined in the folded
7677/// version. For instance, this transformation isn't legal:
7678/// movss (%rdi), %xmm0
7679/// addps %xmm0, %xmm0
7680/// ->
7681/// addps (%rdi), %xmm0
7682///
7683/// But this one is:
7684/// movss (%rdi), %xmm0
7685/// addss %xmm0, %xmm0
7686/// ->
7687/// addss (%rdi), %xmm0
7688///
7690 const MachineInstr &UserMI,
7691 const MachineFunction &MF) {
7692 unsigned Opc = LoadMI.getOpcode();
7693 unsigned UserOpc = UserMI.getOpcode();
7695 const TargetRegisterClass *RC =
7696 MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
7697 unsigned RegSize = TRI.getRegSizeInBits(*RC);
7698
7699 if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
7700 Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
7701 Opc == X86::VMOVSSZrm_alt) &&
7702 RegSize > 32) {
7703 // These instructions only load 32 bits; we can't fold them if the
7704 // destination register is wider than 32 bits (4 bytes) and the user
7705 // instruction isn't scalar (SS).
7706 switch (UserOpc) {
7707 case X86::CVTSS2SDrr_Int:
7708 case X86::VCVTSS2SDrr_Int:
7709 case X86::VCVTSS2SDZrr_Int:
7710 case X86::VCVTSS2SDZrrk_Int:
7711 case X86::VCVTSS2SDZrrkz_Int:
7712 case X86::CVTSS2SIrr_Int:
7713 case X86::CVTSS2SI64rr_Int:
7714 case X86::VCVTSS2SIrr_Int:
7715 case X86::VCVTSS2SI64rr_Int:
7716 case X86::VCVTSS2SIZrr_Int:
7717 case X86::VCVTSS2SI64Zrr_Int:
7718 case X86::CVTTSS2SIrr_Int:
7719 case X86::CVTTSS2SI64rr_Int:
7720 case X86::VCVTTSS2SIrr_Int:
7721 case X86::VCVTTSS2SI64rr_Int:
7722 case X86::VCVTTSS2SIZrr_Int:
7723 case X86::VCVTTSS2SI64Zrr_Int:
7724 case X86::VCVTSS2USIZrr_Int:
7725 case X86::VCVTSS2USI64Zrr_Int:
7726 case X86::VCVTTSS2USIZrr_Int:
7727 case X86::VCVTTSS2USI64Zrr_Int:
7728 case X86::RCPSSr_Int:
7729 case X86::VRCPSSr_Int:
7730 case X86::RSQRTSSr_Int:
7731 case X86::VRSQRTSSr_Int:
7732 case X86::ROUNDSSri_Int:
7733 case X86::VROUNDSSri_Int:
7734 case X86::COMISSrr_Int:
7735 case X86::VCOMISSrr_Int:
7736 case X86::VCOMISSZrr_Int:
7737 case X86::UCOMISSrr_Int:
7738 case X86::VUCOMISSrr_Int:
7739 case X86::VUCOMISSZrr_Int:
7740 case X86::ADDSSrr_Int:
7741 case X86::VADDSSrr_Int:
7742 case X86::VADDSSZrr_Int:
7743 case X86::CMPSSrri_Int:
7744 case X86::VCMPSSrri_Int:
7745 case X86::VCMPSSZrri_Int:
7746 case X86::DIVSSrr_Int:
7747 case X86::VDIVSSrr_Int:
7748 case X86::VDIVSSZrr_Int:
7749 case X86::MAXSSrr_Int:
7750 case X86::VMAXSSrr_Int:
7751 case X86::VMAXSSZrr_Int:
7752 case X86::MINSSrr_Int:
7753 case X86::VMINSSrr_Int:
7754 case X86::VMINSSZrr_Int:
7755 case X86::MULSSrr_Int:
7756 case X86::VMULSSrr_Int:
7757 case X86::VMULSSZrr_Int:
7758 case X86::SQRTSSr_Int:
7759 case X86::VSQRTSSr_Int:
7760 case X86::VSQRTSSZr_Int:
7761 case X86::SUBSSrr_Int:
7762 case X86::VSUBSSrr_Int:
7763 case X86::VSUBSSZrr_Int:
7764 case X86::VADDSSZrrk_Int:
7765 case X86::VADDSSZrrkz_Int:
7766 case X86::VCMPSSZrrik_Int:
7767 case X86::VDIVSSZrrk_Int:
7768 case X86::VDIVSSZrrkz_Int:
7769 case X86::VMAXSSZrrk_Int:
7770 case X86::VMAXSSZrrkz_Int:
7771 case X86::VMINSSZrrk_Int:
7772 case X86::VMINSSZrrkz_Int:
7773 case X86::VMULSSZrrk_Int:
7774 case X86::VMULSSZrrkz_Int:
7775 case X86::VSQRTSSZrk_Int:
7776 case X86::VSQRTSSZrkz_Int:
7777 case X86::VSUBSSZrrk_Int:
7778 case X86::VSUBSSZrrkz_Int:
7779 case X86::VFMADDSS4rr_Int:
7780 case X86::VFNMADDSS4rr_Int:
7781 case X86::VFMSUBSS4rr_Int:
7782 case X86::VFNMSUBSS4rr_Int:
7783 case X86::VFMADD132SSr_Int:
7784 case X86::VFNMADD132SSr_Int:
7785 case X86::VFMADD213SSr_Int:
7786 case X86::VFNMADD213SSr_Int:
7787 case X86::VFMADD231SSr_Int:
7788 case X86::VFNMADD231SSr_Int:
7789 case X86::VFMSUB132SSr_Int:
7790 case X86::VFNMSUB132SSr_Int:
7791 case X86::VFMSUB213SSr_Int:
7792 case X86::VFNMSUB213SSr_Int:
7793 case X86::VFMSUB231SSr_Int:
7794 case X86::VFNMSUB231SSr_Int:
7795 case X86::VFMADD132SSZr_Int:
7796 case X86::VFNMADD132SSZr_Int:
7797 case X86::VFMADD213SSZr_Int:
7798 case X86::VFNMADD213SSZr_Int:
7799 case X86::VFMADD231SSZr_Int:
7800 case X86::VFNMADD231SSZr_Int:
7801 case X86::VFMSUB132SSZr_Int:
7802 case X86::VFNMSUB132SSZr_Int:
7803 case X86::VFMSUB213SSZr_Int:
7804 case X86::VFNMSUB213SSZr_Int:
7805 case X86::VFMSUB231SSZr_Int:
7806 case X86::VFNMSUB231SSZr_Int:
7807 case X86::VFMADD132SSZrk_Int:
7808 case X86::VFNMADD132SSZrk_Int:
7809 case X86::VFMADD213SSZrk_Int:
7810 case X86::VFNMADD213SSZrk_Int:
7811 case X86::VFMADD231SSZrk_Int:
7812 case X86::VFNMADD231SSZrk_Int:
7813 case X86::VFMSUB132SSZrk_Int:
7814 case X86::VFNMSUB132SSZrk_Int:
7815 case X86::VFMSUB213SSZrk_Int:
7816 case X86::VFNMSUB213SSZrk_Int:
7817 case X86::VFMSUB231SSZrk_Int:
7818 case X86::VFNMSUB231SSZrk_Int:
7819 case X86::VFMADD132SSZrkz_Int:
7820 case X86::VFNMADD132SSZrkz_Int:
7821 case X86::VFMADD213SSZrkz_Int:
7822 case X86::VFNMADD213SSZrkz_Int:
7823 case X86::VFMADD231SSZrkz_Int:
7824 case X86::VFNMADD231SSZrkz_Int:
7825 case X86::VFMSUB132SSZrkz_Int:
7826 case X86::VFNMSUB132SSZrkz_Int:
7827 case X86::VFMSUB213SSZrkz_Int:
7828 case X86::VFNMSUB213SSZrkz_Int:
7829 case X86::VFMSUB231SSZrkz_Int:
7830 case X86::VFNMSUB231SSZrkz_Int:
7831 case X86::VFIXUPIMMSSZrri:
7832 case X86::VFIXUPIMMSSZrrik:
7833 case X86::VFIXUPIMMSSZrrikz:
7834 case X86::VFPCLASSSSZri:
7835 case X86::VFPCLASSSSZrik:
7836 case X86::VGETEXPSSZr:
7837 case X86::VGETEXPSSZrk:
7838 case X86::VGETEXPSSZrkz:
7839 case X86::VGETMANTSSZrri:
7840 case X86::VGETMANTSSZrrik:
7841 case X86::VGETMANTSSZrrikz:
7842 case X86::VRANGESSZrri:
7843 case X86::VRANGESSZrrik:
7844 case X86::VRANGESSZrrikz:
7845 case X86::VRCP14SSZrr:
7846 case X86::VRCP14SSZrrk:
7847 case X86::VRCP14SSZrrkz:
7848 case X86::VRCP28SSZr:
7849 case X86::VRCP28SSZrk:
7850 case X86::VRCP28SSZrkz:
7851 case X86::VREDUCESSZrri:
7852 case X86::VREDUCESSZrrik:
7853 case X86::VREDUCESSZrrikz:
7854 case X86::VRNDSCALESSZrri_Int:
7855 case X86::VRNDSCALESSZrrik_Int:
7856 case X86::VRNDSCALESSZrrikz_Int:
7857 case X86::VRSQRT14SSZrr:
7858 case X86::VRSQRT14SSZrrk:
7859 case X86::VRSQRT14SSZrrkz:
7860 case X86::VRSQRT28SSZr:
7861 case X86::VRSQRT28SSZrk:
7862 case X86::VRSQRT28SSZrkz:
7863 case X86::VSCALEFSSZrr:
7864 case X86::VSCALEFSSZrrk:
7865 case X86::VSCALEFSSZrrkz:
7866 return false;
7867 default:
7868 return true;
7869 }
7870 }
7871
7872 if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
7873 Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
7874 Opc == X86::VMOVSDZrm_alt) &&
7875 RegSize > 64) {
7876 // These instructions only load 64 bits; we can't fold them if the
7877 // destination register is wider than 64 bits (8 bytes) and the user
7878 // instruction isn't scalar (SD).
7879 switch (UserOpc) {
7880 case X86::CVTSD2SSrr_Int:
7881 case X86::VCVTSD2SSrr_Int:
7882 case X86::VCVTSD2SSZrr_Int:
7883 case X86::VCVTSD2SSZrrk_Int:
7884 case X86::VCVTSD2SSZrrkz_Int:
7885 case X86::CVTSD2SIrr_Int:
7886 case X86::CVTSD2SI64rr_Int:
7887 case X86::VCVTSD2SIrr_Int:
7888 case X86::VCVTSD2SI64rr_Int:
7889 case X86::VCVTSD2SIZrr_Int:
7890 case X86::VCVTSD2SI64Zrr_Int:
7891 case X86::CVTTSD2SIrr_Int:
7892 case X86::CVTTSD2SI64rr_Int:
7893 case X86::VCVTTSD2SIrr_Int:
7894 case X86::VCVTTSD2SI64rr_Int:
7895 case X86::VCVTTSD2SIZrr_Int:
7896 case X86::VCVTTSD2SI64Zrr_Int:
7897 case X86::VCVTSD2USIZrr_Int:
7898 case X86::VCVTSD2USI64Zrr_Int:
7899 case X86::VCVTTSD2USIZrr_Int:
7900 case X86::VCVTTSD2USI64Zrr_Int:
7901 case X86::ROUNDSDri_Int:
7902 case X86::VROUNDSDri_Int:
7903 case X86::COMISDrr_Int:
7904 case X86::VCOMISDrr_Int:
7905 case X86::VCOMISDZrr_Int:
7906 case X86::UCOMISDrr_Int:
7907 case X86::VUCOMISDrr_Int:
7908 case X86::VUCOMISDZrr_Int:
7909 case X86::ADDSDrr_Int:
7910 case X86::VADDSDrr_Int:
7911 case X86::VADDSDZrr_Int:
7912 case X86::CMPSDrri_Int:
7913 case X86::VCMPSDrri_Int:
7914 case X86::VCMPSDZrri_Int:
7915 case X86::DIVSDrr_Int:
7916 case X86::VDIVSDrr_Int:
7917 case X86::VDIVSDZrr_Int:
7918 case X86::MAXSDrr_Int:
7919 case X86::VMAXSDrr_Int:
7920 case X86::VMAXSDZrr_Int:
7921 case X86::MINSDrr_Int:
7922 case X86::VMINSDrr_Int:
7923 case X86::VMINSDZrr_Int:
7924 case X86::MULSDrr_Int:
7925 case X86::VMULSDrr_Int:
7926 case X86::VMULSDZrr_Int:
7927 case X86::SQRTSDr_Int:
7928 case X86::VSQRTSDr_Int:
7929 case X86::VSQRTSDZr_Int:
7930 case X86::SUBSDrr_Int:
7931 case X86::VSUBSDrr_Int:
7932 case X86::VSUBSDZrr_Int:
7933 case X86::VADDSDZrrk_Int:
7934 case X86::VADDSDZrrkz_Int:
7935 case X86::VCMPSDZrrik_Int:
7936 case X86::VDIVSDZrrk_Int:
7937 case X86::VDIVSDZrrkz_Int:
7938 case X86::VMAXSDZrrk_Int:
7939 case X86::VMAXSDZrrkz_Int:
7940 case X86::VMINSDZrrk_Int:
7941 case X86::VMINSDZrrkz_Int:
7942 case X86::VMULSDZrrk_Int:
7943 case X86::VMULSDZrrkz_Int:
7944 case X86::VSQRTSDZrk_Int:
7945 case X86::VSQRTSDZrkz_Int:
7946 case X86::VSUBSDZrrk_Int:
7947 case X86::VSUBSDZrrkz_Int:
7948 case X86::VFMADDSD4rr_Int:
7949 case X86::VFNMADDSD4rr_Int:
7950 case X86::VFMSUBSD4rr_Int:
7951 case X86::VFNMSUBSD4rr_Int:
7952 case X86::VFMADD132SDr_Int:
7953 case X86::VFNMADD132SDr_Int:
7954 case X86::VFMADD213SDr_Int:
7955 case X86::VFNMADD213SDr_Int:
7956 case X86::VFMADD231SDr_Int:
7957 case X86::VFNMADD231SDr_Int:
7958 case X86::VFMSUB132SDr_Int:
7959 case X86::VFNMSUB132SDr_Int:
7960 case X86::VFMSUB213SDr_Int:
7961 case X86::VFNMSUB213SDr_Int:
7962 case X86::VFMSUB231SDr_Int:
7963 case X86::VFNMSUB231SDr_Int:
7964 case X86::VFMADD132SDZr_Int:
7965 case X86::VFNMADD132SDZr_Int:
7966 case X86::VFMADD213SDZr_Int:
7967 case X86::VFNMADD213SDZr_Int:
7968 case X86::VFMADD231SDZr_Int:
7969 case X86::VFNMADD231SDZr_Int:
7970 case X86::VFMSUB132SDZr_Int:
7971 case X86::VFNMSUB132SDZr_Int:
7972 case X86::VFMSUB213SDZr_Int:
7973 case X86::VFNMSUB213SDZr_Int:
7974 case X86::VFMSUB231SDZr_Int:
7975 case X86::VFNMSUB231SDZr_Int:
7976 case X86::VFMADD132SDZrk_Int:
7977 case X86::VFNMADD132SDZrk_Int:
7978 case X86::VFMADD213SDZrk_Int:
7979 case X86::VFNMADD213SDZrk_Int:
7980 case X86::VFMADD231SDZrk_Int:
7981 case X86::VFNMADD231SDZrk_Int:
7982 case X86::VFMSUB132SDZrk_Int:
7983 case X86::VFNMSUB132SDZrk_Int:
7984 case X86::VFMSUB213SDZrk_Int:
7985 case X86::VFNMSUB213SDZrk_Int:
7986 case X86::VFMSUB231SDZrk_Int:
7987 case X86::VFNMSUB231SDZrk_Int:
7988 case X86::VFMADD132SDZrkz_Int:
7989 case X86::VFNMADD132SDZrkz_Int:
7990 case X86::VFMADD213SDZrkz_Int:
7991 case X86::VFNMADD213SDZrkz_Int:
7992 case X86::VFMADD231SDZrkz_Int:
7993 case X86::VFNMADD231SDZrkz_Int:
7994 case X86::VFMSUB132SDZrkz_Int:
7995 case X86::VFNMSUB132SDZrkz_Int:
7996 case X86::VFMSUB213SDZrkz_Int:
7997 case X86::VFNMSUB213SDZrkz_Int:
7998 case X86::VFMSUB231SDZrkz_Int:
7999 case X86::VFNMSUB231SDZrkz_Int:
8000 case X86::VFIXUPIMMSDZrri:
8001 case X86::VFIXUPIMMSDZrrik:
8002 case X86::VFIXUPIMMSDZrrikz:
8003 case X86::VFPCLASSSDZri:
8004 case X86::VFPCLASSSDZrik:
8005 case X86::VGETEXPSDZr:
8006 case X86::VGETEXPSDZrk:
8007 case X86::VGETEXPSDZrkz:
8008 case X86::VGETMANTSDZrri:
8009 case X86::VGETMANTSDZrrik:
8010 case X86::VGETMANTSDZrrikz:
8011 case X86::VRANGESDZrri:
8012 case X86::VRANGESDZrrik:
8013 case X86::VRANGESDZrrikz:
8014 case X86::VRCP14SDZrr:
8015 case X86::VRCP14SDZrrk:
8016 case X86::VRCP14SDZrrkz:
8017 case X86::VRCP28SDZr:
8018 case X86::VRCP28SDZrk:
8019 case X86::VRCP28SDZrkz:
8020 case X86::VREDUCESDZrri:
8021 case X86::VREDUCESDZrrik:
8022 case X86::VREDUCESDZrrikz:
8023 case X86::VRNDSCALESDZrri_Int:
8024 case X86::VRNDSCALESDZrrik_Int:
8025 case X86::VRNDSCALESDZrrikz_Int:
8026 case X86::VRSQRT14SDZrr:
8027 case X86::VRSQRT14SDZrrk:
8028 case X86::VRSQRT14SDZrrkz:
8029 case X86::VRSQRT28SDZr:
8030 case X86::VRSQRT28SDZrk:
8031 case X86::VRSQRT28SDZrkz:
8032 case X86::VSCALEFSDZrr:
8033 case X86::VSCALEFSDZrrk:
8034 case X86::VSCALEFSDZrrkz:
8035 return false;
8036 default:
8037 return true;
8038 }
8039 }
8040
8041 if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) {
8042 // These instructions only load 16 bits; we can't fold them if the
8043 // destination register is wider than 16 bits (2 bytes) and the user
8044 // instruction isn't scalar (SH).
8045 switch (UserOpc) {
8046 case X86::VADDSHZrr_Int:
8047 case X86::VCMPSHZrri_Int:
8048 case X86::VDIVSHZrr_Int:
8049 case X86::VMAXSHZrr_Int:
8050 case X86::VMINSHZrr_Int:
8051 case X86::VMULSHZrr_Int:
8052 case X86::VSUBSHZrr_Int:
8053 case X86::VADDSHZrrk_Int:
8054 case X86::VADDSHZrrkz_Int:
8055 case X86::VCMPSHZrrik_Int:
8056 case X86::VDIVSHZrrk_Int:
8057 case X86::VDIVSHZrrkz_Int:
8058 case X86::VMAXSHZrrk_Int:
8059 case X86::VMAXSHZrrkz_Int:
8060 case X86::VMINSHZrrk_Int:
8061 case X86::VMINSHZrrkz_Int:
8062 case X86::VMULSHZrrk_Int:
8063 case X86::VMULSHZrrkz_Int:
8064 case X86::VSUBSHZrrk_Int:
8065 case X86::VSUBSHZrrkz_Int:
8066 case X86::VFMADD132SHZr_Int:
8067 case X86::VFNMADD132SHZr_Int:
8068 case X86::VFMADD213SHZr_Int:
8069 case X86::VFNMADD213SHZr_Int:
8070 case X86::VFMADD231SHZr_Int:
8071 case X86::VFNMADD231SHZr_Int:
8072 case X86::VFMSUB132SHZr_Int:
8073 case X86::VFNMSUB132SHZr_Int:
8074 case X86::VFMSUB213SHZr_Int:
8075 case X86::VFNMSUB213SHZr_Int:
8076 case X86::VFMSUB231SHZr_Int:
8077 case X86::VFNMSUB231SHZr_Int:
8078 case X86::VFMADD132SHZrk_Int:
8079 case X86::VFNMADD132SHZrk_Int:
8080 case X86::VFMADD213SHZrk_Int:
8081 case X86::VFNMADD213SHZrk_Int:
8082 case X86::VFMADD231SHZrk_Int:
8083 case X86::VFNMADD231SHZrk_Int:
8084 case X86::VFMSUB132SHZrk_Int:
8085 case X86::VFNMSUB132SHZrk_Int:
8086 case X86::VFMSUB213SHZrk_Int:
8087 case X86::VFNMSUB213SHZrk_Int:
8088 case X86::VFMSUB231SHZrk_Int:
8089 case X86::VFNMSUB231SHZrk_Int:
8090 case X86::VFMADD132SHZrkz_Int:
8091 case X86::VFNMADD132SHZrkz_Int:
8092 case X86::VFMADD213SHZrkz_Int:
8093 case X86::VFNMADD213SHZrkz_Int:
8094 case X86::VFMADD231SHZrkz_Int:
8095 case X86::VFNMADD231SHZrkz_Int:
8096 case X86::VFMSUB132SHZrkz_Int:
8097 case X86::VFNMSUB132SHZrkz_Int:
8098 case X86::VFMSUB213SHZrkz_Int:
8099 case X86::VFNMSUB213SHZrkz_Int:
8100 case X86::VFMSUB231SHZrkz_Int:
8101 case X86::VFNMSUB231SHZrkz_Int:
8102 return false;
8103 default:
8104 return true;
8105 }
8106 }
8107
8108 return false;
8109}
8110
8113 MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
8114 LiveIntervals *LIS) const {
8115
8116 // If LoadMI is a masked load, check that MI uses the same mask.
8117 const MCInstrDesc &MCID = get(LoadMI.getOpcode());
8118 unsigned NumOps = MCID.getNumOperands();
8119 if (NumOps >= 3) {
8120 Register MaskReg;
8121 const MachineOperand &Op1 = LoadMI.getOperand(1);
8122 const MachineOperand &Op2 = LoadMI.getOperand(2);
8123
8124 auto IsVKWMClass = [](const TargetRegisterClass *RC) {
8125 return RC == &X86::VK2WMRegClass || RC == &X86::VK4WMRegClass ||
8126 RC == &X86::VK8WMRegClass || RC == &X86::VK16WMRegClass ||
8127 RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass;
8128 };
8129
8130 if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI)))
8131 MaskReg = Op1.getReg();
8132 else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI)))
8133 MaskReg = Op2.getReg();
8134
8135 if (MaskReg) {
8136 bool HasSameMask = false;
8137 for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) {
8138 const MachineOperand &Op = MI.getOperand(I);
8139 if (Op.isReg() && Op.getReg() == MaskReg) {
8140 HasSameMask = true;
8141 break;
8142 }
8143 }
8144 if (!HasSameMask)
8145 return nullptr;
8146 }
8147 }
8148
8149 // TODO: Support the case where LoadMI loads a wide register, but MI
8150 // only uses a subreg.
8151 for (auto Op : Ops) {
8152 if (MI.getOperand(Op).getSubReg())
8153 return nullptr;
8154 }
8155
8156 // If loading from a FrameIndex, fold directly from the FrameIndex.
8157 int FrameIndex;
8158 if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
8159 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8160 return nullptr;
8161 return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
8162 }
8163
8164 // Check switch flag
8165 if (NoFusing)
8166 return nullptr;
8167
8168 // Avoid partial and undef register update stalls unless optimizing for size.
8169 if (!MF.getFunction().hasOptSize() &&
8170 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
8171 shouldPreventUndefRegUpdateMemFold(MF, MI)))
8172 return nullptr;
8173
8174 // Do not fold an NDD instruction and a memory instruction with a
8175 // relocation, to avoid emitting an APX relocation when the flag is
8176 // disabled for backward compatibility.
8177 uint64_t TSFlags = MI.getDesc().TSFlags;
8179 X86II::hasNewDataDest(TSFlags))
8180 return nullptr;
8181
8182 // Determine the alignment of the load.
8183 Align Alignment;
8184 unsigned LoadOpc = LoadMI.getOpcode();
8185 if (LoadMI.hasOneMemOperand())
8186 Alignment = (*LoadMI.memoperands_begin())->getAlign();
8187 else
8188 switch (LoadOpc) {
8189 case X86::AVX512_512_SET0:
8190 case X86::AVX512_512_SETALLONES:
8191 Alignment = Align(64);
8192 break;
8193 case X86::AVX2_SETALLONES:
8194 case X86::AVX1_SETALLONES:
8195 case X86::AVX_SET0:
8196 case X86::AVX512_256_SET0:
8197 Alignment = Align(32);
8198 break;
8199 case X86::V_SET0:
8200 case X86::V_SETALLONES:
8201 case X86::AVX512_128_SET0:
8202 case X86::FsFLD0F128:
8203 case X86::AVX512_FsFLD0F128:
8204 Alignment = Align(16);
8205 break;
8206 case X86::MMX_SET0:
8207 case X86::FsFLD0SD:
8208 case X86::AVX512_FsFLD0SD:
8209 Alignment = Align(8);
8210 break;
8211 case X86::FsFLD0SS:
8212 case X86::AVX512_FsFLD0SS:
8213 Alignment = Align(4);
8214 break;
8215 case X86::FsFLD0SH:
8216 case X86::AVX512_FsFLD0SH:
8217 Alignment = Align(2);
8218 break;
8219 default:
8220 return nullptr;
8221 }
8222 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
8223 unsigned NewOpc = 0;
8224 switch (MI.getOpcode()) {
8225 default:
8226 return nullptr;
8227 case X86::TEST8rr:
8228 NewOpc = X86::CMP8ri;
8229 break;
8230 case X86::TEST16rr:
8231 NewOpc = X86::CMP16ri;
8232 break;
8233 case X86::TEST32rr:
8234 NewOpc = X86::CMP32ri;
8235 break;
8236 case X86::TEST64rr:
8237 NewOpc = X86::CMP64ri32;
8238 break;
8239 }
8240 // Change to CMPXXri r, 0 first.
8241 MI.setDesc(get(NewOpc));
8242 MI.getOperand(1).ChangeToImmediate(0);
8243 } else if (Ops.size() != 1)
8244 return nullptr;
8245
8246 // Make sure the subregisters match.
8247 // Otherwise we risk changing the size of the load.
8248 if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
8249 return nullptr;
8250
8252 switch (LoadOpc) {
8253 case X86::MMX_SET0:
8254 case X86::V_SET0:
8255 case X86::V_SETALLONES:
8256 case X86::AVX2_SETALLONES:
8257 case X86::AVX1_SETALLONES:
8258 case X86::AVX_SET0:
8259 case X86::AVX512_128_SET0:
8260 case X86::AVX512_256_SET0:
8261 case X86::AVX512_512_SET0:
8262 case X86::AVX512_512_SETALLONES:
8263 case X86::FsFLD0SH:
8264 case X86::AVX512_FsFLD0SH:
8265 case X86::FsFLD0SD:
8266 case X86::AVX512_FsFLD0SD:
8267 case X86::FsFLD0SS:
8268 case X86::AVX512_FsFLD0SS:
8269 case X86::FsFLD0F128:
8270 case X86::AVX512_FsFLD0F128: {
8271 // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
8272 // Create a constant-pool entry and operands to load from it.
8273
8274 // Large code model can't fold loads this way.
8276 return nullptr;
8277
8278 // x86-32 PIC requires a PIC base register for constant pools.
8279 unsigned PICBase = 0;
8280 // Since we're using Small or Kernel code model, we can always use
8281 // RIP-relative addressing for a smaller encoding.
8282 if (Subtarget.is64Bit()) {
8283 PICBase = X86::RIP;
8284 } else if (MF.getTarget().isPositionIndependent()) {
8285 // FIXME: PICBase = getGlobalBaseReg(&MF);
8286 // This doesn't work for several reasons.
8287 // 1. GlobalBaseReg may have been spilled.
8288 // 2. It may not be live at MI.
8289 return nullptr;
8290 }
8291
8292 // Create a constant-pool entry.
8294 Type *Ty;
8295 bool IsAllOnes = false;
8296 switch (LoadOpc) {
8297 case X86::FsFLD0SS:
8298 case X86::AVX512_FsFLD0SS:
8300 break;
8301 case X86::FsFLD0SD:
8302 case X86::AVX512_FsFLD0SD:
8304 break;
8305 case X86::FsFLD0F128:
8306 case X86::AVX512_FsFLD0F128:
8308 break;
8309 case X86::FsFLD0SH:
8310 case X86::AVX512_FsFLD0SH:
8312 break;
8313 case X86::AVX512_512_SETALLONES:
8314 IsAllOnes = true;
8315 [[fallthrough]];
8316 case X86::AVX512_512_SET0:
8318 16);
8319 break;
8320 case X86::AVX1_SETALLONES:
8321 case X86::AVX2_SETALLONES:
8322 IsAllOnes = true;
8323 [[fallthrough]];
8324 case X86::AVX512_256_SET0:
8325 case X86::AVX_SET0:
8327 8);
8328
8329 break;
8330 case X86::MMX_SET0:
8332 2);
8333 break;
8334 case X86::V_SETALLONES:
8335 IsAllOnes = true;
8336 [[fallthrough]];
8337 case X86::V_SET0:
8338 case X86::AVX512_128_SET0:
8340 4);
8341 break;
8342 }
8343
8344 const Constant *C =
8346 unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
8347
8348 // Create operands to load from the constant pool entry.
8349 MOs.push_back(MachineOperand::CreateReg(PICBase, false));
8351 MOs.push_back(MachineOperand::CreateReg(0, false));
8353 MOs.push_back(MachineOperand::CreateReg(0, false));
8354 break;
8355 }
8356 case X86::VPBROADCASTBZ128rm:
8357 case X86::VPBROADCASTBZ256rm:
8358 case X86::VPBROADCASTBZrm:
8359 case X86::VBROADCASTF32X2Z256rm:
8360 case X86::VBROADCASTF32X2Zrm:
8361 case X86::VBROADCASTI32X2Z128rm:
8362 case X86::VBROADCASTI32X2Z256rm:
8363 case X86::VBROADCASTI32X2Zrm:
8364 // No instructions currently fuse with 8-bit or 32-bit x 2 broadcasts.
8365 return nullptr;
8366
8367#define FOLD_BROADCAST(SIZE) \
8368 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \
8369 LoadMI.operands_begin() + NumOps); \
8370 return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \
8371 /*AllowCommute=*/true);
8372 case X86::VPBROADCASTWZ128rm:
8373 case X86::VPBROADCASTWZ256rm:
8374 case X86::VPBROADCASTWZrm:
8375 FOLD_BROADCAST(16);
8376 case X86::VPBROADCASTDZ128rm:
8377 case X86::VPBROADCASTDZ256rm:
8378 case X86::VPBROADCASTDZrm:
8379 case X86::VBROADCASTSSZ128rm:
8380 case X86::VBROADCASTSSZ256rm:
8381 case X86::VBROADCASTSSZrm:
8382 FOLD_BROADCAST(32);
8383 case X86::VPBROADCASTQZ128rm:
8384 case X86::VPBROADCASTQZ256rm:
8385 case X86::VPBROADCASTQZrm:
8386 case X86::VBROADCASTSDZ256rm:
8387 case X86::VBROADCASTSDZrm:
8388 FOLD_BROADCAST(64);
8389 default: {
8390 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8391 return nullptr;
8392
8393 // Folding a normal load. Just copy the load's address operands.
8395 LoadMI.operands_begin() + NumOps);
8396 break;
8397 }
8398 }
8399 return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
8400 /*Size=*/0, Alignment, /*AllowCommute=*/true);
8401}
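// Sketch of the constant-materialization fold above: a zero idiom feeding an
// arithmetic instruction,
//
//   %z = V_SET0
//   %d = ADDPSrr %x, %z
//
// is replaced by a RIP-relative load-fold from a fresh constant-pool entry
// (64-bit small code model assumed; schematic operands):
//
//   %d = ADDPSrm %x, $rip, 1, $noreg, %const.0, $noreg
//
// which frees the register that would otherwise hold the all-zero vector.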
8402
8404X86InstrInfo::foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
8405 unsigned OpNum, ArrayRef<MachineOperand> MOs,
8407 unsigned BitsSize, bool AllowCommute) const {
8408
8409 if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum))
8410 return matchBroadcastSize(*I, BitsSize)
8411 ? fuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
8412 : nullptr;
8413
8414 if (AllowCommute) {
8415 // If the instruction and target operand are commutable, commute the
8416 // instruction and try again.
8417 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
8418 if (CommuteOpIdx2 == OpNum) {
8419 printFailMsgforFold(MI, OpNum);
8420 return nullptr;
8421 }
8422 MachineInstr *NewMI =
8423 foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs, InsertPt, BitsSize,
8424 /*AllowCommute=*/false);
8425 if (NewMI)
8426 return NewMI;
8427 // Folding failed again - undo the commute before returning.
8428 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
8429 }
8430
8431 printFailMsgforFold(MI, OpNum);
8432 return nullptr;
8433}
8434
8438
8439 for (MachineMemOperand *MMO : MMOs) {
8440 if (!MMO->isLoad())
8441 continue;
8442
8443 if (!MMO->isStore()) {
8444 // Reuse the MMO.
8445 LoadMMOs.push_back(MMO);
8446 } else {
8447 // Clone the MMO and unset the store flag.
8448 LoadMMOs.push_back(MF.getMachineMemOperand(
8449 MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
8450 }
8451 }
8452
8453 return LoadMMOs;
8454}
8455
8459
8460 for (MachineMemOperand *MMO : MMOs) {
8461 if (!MMO->isStore())
8462 continue;
8463
8464 if (!MMO->isLoad()) {
8465 // Reuse the MMO.
8466 StoreMMOs.push_back(MMO);
8467 } else {
8468 // Clone the MMO and unset the load flag.
8469 StoreMMOs.push_back(MF.getMachineMemOperand(
8470 MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
8471 }
8472 }
8473
8474 return StoreMMOs;
8475}
8476
8478 const TargetRegisterClass *RC,
8479 const X86Subtarget &STI) {
8480 assert(STI.hasAVX512() && "Expected at least AVX512!");
8481 unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
8482 assert((SpillSize == 64 || STI.hasVLX()) &&
8483 "Can't broadcast less than 64 bytes without AVX512VL!");
8484
8485#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64) \
8486 case TYPE: \
8487 switch (SpillSize) { \
8488 default: \
8489 llvm_unreachable("Unknown spill size"); \
8490 case 16: \
8491 return X86::OP16; \
8492 case 32: \
8493 return X86::OP32; \
8494 case 64: \
8495 return X86::OP64; \
8496 } \
8497 break;
8498
8499 switch (I->Flags & TB_BCAST_MASK) {
8500 default:
8501 llvm_unreachable("Unexpected broadcast type!");
8502 CASE_BCAST_TYPE_OPC(TB_BCAST_W, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8503 VPBROADCASTWZrm)
8504 CASE_BCAST_TYPE_OPC(TB_BCAST_D, VPBROADCASTDZ128rm, VPBROADCASTDZ256rm,
8505 VPBROADCASTDZrm)
8506 CASE_BCAST_TYPE_OPC(TB_BCAST_Q, VPBROADCASTQZ128rm, VPBROADCASTQZ256rm,
8507 VPBROADCASTQZrm)
8508 CASE_BCAST_TYPE_OPC(TB_BCAST_SH, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8509 VPBROADCASTWZrm)
8510 CASE_BCAST_TYPE_OPC(TB_BCAST_SS, VBROADCASTSSZ128rm, VBROADCASTSSZ256rm,
8511 VBROADCASTSSZrm)
8512 CASE_BCAST_TYPE_OPC(TB_BCAST_SD, VMOVDDUPZ128rm, VBROADCASTSDZ256rm,
8513 VBROADCASTSDZrm)
8514 }
8515}
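// For example, an unfold entry tagged TB_BCAST_SS whose register class spills
// 64 bytes (a ZMM class) maps to VBROADCASTSSZrm, while the same tag on a
// 16-byte class maps to VBROADCASTSSZ128rm, per the table above.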
8516
8518 MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad,
8519 bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
8520 const X86FoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
8521 if (I == nullptr)
8522 return false;
8523 unsigned Opc = I->DstOp;
8524 unsigned Index = I->Flags & TB_INDEX_MASK;
8525 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8526 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8527 if (UnfoldLoad && !FoldedLoad)
8528 return false;
8529 UnfoldLoad &= FoldedLoad;
8530 if (UnfoldStore && !FoldedStore)
8531 return false;
8532 UnfoldStore &= FoldedStore;
8533
8534 const MCInstrDesc &MCID = get(Opc);
8535
8536 const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
8538 // TODO: Check if 32-byte or greater accesses are slow too?
8539 if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
8540 Subtarget.isUnalignedMem16Slow())
8541 // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
8542 // conservatively assume the address is unaligned. That's bad for
8543 // performance.
8544 return false;
8549 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
8550 MachineOperand &Op = MI.getOperand(i);
8551 if (i >= Index && i < Index + X86::AddrNumOperands)
8552 AddrOps.push_back(Op);
8553 else if (Op.isReg() && Op.isImplicit())
8554 ImpOps.push_back(Op);
8555 else if (i < Index)
8556 BeforeOps.push_back(Op);
8557 else if (i > Index)
8558 AfterOps.push_back(Op);
8559 }
8560
8561 // Emit the load or broadcast instruction.
8562 if (UnfoldLoad) {
8563 auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
8564
8565 unsigned Opc;
8566 if (I->Flags & TB_BCAST_MASK) {
8567 Opc = getBroadcastOpcode(I, RC, Subtarget);
8568 } else {
8569 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8570 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8571 Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
8572 }
8573
8574 DebugLoc DL;
8575 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
8576 for (const MachineOperand &AddrOp : AddrOps)
8577 MIB.add(AddrOp);
8578 MIB.setMemRefs(MMOs);
8579 NewMIs.push_back(MIB);
8580
8581 if (UnfoldStore) {
8582 // Address operands cannot be marked isKill.
8583 for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
8584 MachineOperand &MO = NewMIs[0]->getOperand(i);
8585 if (MO.isReg())
8586 MO.setIsKill(false);
8587 }
8588 }
8589 }
8590
8591 // Emit the data processing instruction.
8592 MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
8593 MachineInstrBuilder MIB(MF, DataMI);
8594
8595 if (FoldedStore)
8596 MIB.addReg(Reg, RegState::Define);
8597 for (MachineOperand &BeforeOp : BeforeOps)
8598 MIB.add(BeforeOp);
8599 if (FoldedLoad)
8600 MIB.addReg(Reg);
8601 for (MachineOperand &AfterOp : AfterOps)
8602 MIB.add(AfterOp);
8603 for (MachineOperand &ImpOp : ImpOps) {
8604 MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) |
8606 getKillRegState(ImpOp.isKill()) |
8607 getDeadRegState(ImpOp.isDead()) |
8608 getUndefRegState(ImpOp.isUndef()));
8609 }
8610 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8611 switch (DataMI->getOpcode()) {
8612 default:
8613 break;
8614 case X86::CMP64ri32:
8615 case X86::CMP32ri:
8616 case X86::CMP16ri:
8617 case X86::CMP8ri: {
8618 MachineOperand &MO0 = DataMI->getOperand(0);
8619 MachineOperand &MO1 = DataMI->getOperand(1);
8620 if (MO1.isImm() && MO1.getImm() == 0) {
8621 unsigned NewOpc;
8622 switch (DataMI->getOpcode()) {
8623 default:
8624 llvm_unreachable("Unreachable!");
8625 case X86::CMP64ri32:
8626 NewOpc = X86::TEST64rr;
8627 break;
8628 case X86::CMP32ri:
8629 NewOpc = X86::TEST32rr;
8630 break;
8631 case X86::CMP16ri:
8632 NewOpc = X86::TEST16rr;
8633 break;
8634 case X86::CMP8ri:
8635 NewOpc = X86::TEST8rr;
8636 break;
8637 }
8638 DataMI->setDesc(get(NewOpc));
8639 MO1.ChangeToRegister(MO0.getReg(), false);
8640 }
8641 }
8642 }
8643 NewMIs.push_back(DataMI);
8644
8645 // Emit the store instruction.
8646 if (UnfoldStore) {
8647 const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI);
8648 auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
8649 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
8650 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8651 unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
8652 DebugLoc DL;
8653 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
8654 for (const MachineOperand &AddrOp : AddrOps)
8655 MIB.add(AddrOp);
8656 MIB.addReg(Reg, RegState::Kill);
8657 MIB.setMemRefs(MMOs);
8658 NewMIs.push_back(MIB);
8659 }
8660
8661 return true;
8662}
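// Schematic example of a load unfold performed by this hook (the reload
// register Reg is supplied by the caller; operands are illustrative):
//
//   folded:    %d = ADDPSrm %x, %rdi, 1, $noreg, 0, $noreg
//   unfolded:  %t = MOVAPSrm %rdi, 1, $noreg, 0, $noreg
//              %d = ADDPSrr %x, %t
//
// The MOVAPS vs. MOVUPS choice mirrors the alignment check made when the load
// is emitted above.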
8663
8665 SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const {
8666 if (!N->isMachineOpcode())
8667 return false;
8668
8669 const X86FoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
8670 if (I == nullptr)
8671 return false;
8672 unsigned Opc = I->DstOp;
8673 unsigned Index = I->Flags & TB_INDEX_MASK;
8674 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8675 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8676 const MCInstrDesc &MCID = get(Opc);
8679 const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
8680 unsigned NumDefs = MCID.NumDefs;
8681 std::vector<SDValue> AddrOps;
8682 std::vector<SDValue> BeforeOps;
8683 std::vector<SDValue> AfterOps;
8684 SDLoc dl(N);
8685 unsigned NumOps = N->getNumOperands();
8686 for (unsigned i = 0; i != NumOps - 1; ++i) {
8687 SDValue Op = N->getOperand(i);
8688 if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands)
8689 AddrOps.push_back(Op);
8690 else if (i < Index - NumDefs)
8691 BeforeOps.push_back(Op);
8692 else if (i > Index - NumDefs)
8693 AfterOps.push_back(Op);
8694 }
8695 SDValue Chain = N->getOperand(NumOps - 1);
8696 AddrOps.push_back(Chain);
8697
8698 // Emit the load instruction.
8699 SDNode *Load = nullptr;
8700 if (FoldedLoad) {
8701 EVT VT = *TRI.legalclasstypes_begin(*RC);
8702 auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8703 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8704 Subtarget.isUnalignedMem16Slow())
8705 // Do not introduce a slow unaligned load.
8706 return false;
8707 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8708 // memory access is slow above.
8709
8710 unsigned Opc;
8711 if (I->Flags & TB_BCAST_MASK) {
8712 Opc = getBroadcastOpcode(I, RC, Subtarget);
8713 } else {
8714 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8715 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8716 Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
8717 }
8718
8719 Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
8720 NewNodes.push_back(Load);
8721
8722 // Preserve memory reference information.
8723 DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs);
8724 }
8725
8726 // Emit the data processing instruction.
8727 std::vector<EVT> VTs;
8728 const TargetRegisterClass *DstRC = nullptr;
8729 if (MCID.getNumDefs() > 0) {
8730 DstRC = getRegClass(MCID, 0, &RI);
8731 VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
8732 }
8733 for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
8734 EVT VT = N->getValueType(i);
8735 if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
8736 VTs.push_back(VT);
8737 }
8738 if (Load)
8739 BeforeOps.push_back(SDValue(Load, 0));
8740 llvm::append_range(BeforeOps, AfterOps);
8741 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8742 switch (Opc) {
8743 default:
8744 break;
8745 case X86::CMP64ri32:
8746 case X86::CMP32ri:
8747 case X86::CMP16ri:
8748 case X86::CMP8ri:
8749 if (isNullConstant(BeforeOps[1])) {
8750 switch (Opc) {
8751 default:
8752 llvm_unreachable("Unreachable!");
8753 case X86::CMP64ri32:
8754 Opc = X86::TEST64rr;
8755 break;
8756 case X86::CMP32ri:
8757 Opc = X86::TEST32rr;
8758 break;
8759 case X86::CMP16ri:
8760 Opc = X86::TEST16rr;
8761 break;
8762 case X86::CMP8ri:
8763 Opc = X86::TEST8rr;
8764 break;
8765 }
8766 BeforeOps[1] = BeforeOps[0];
8767 }
8768 }
8769 SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
8770 NewNodes.push_back(NewNode);
8771
8772 // Emit the store instruction.
8773 if (FoldedStore) {
8774 AddrOps.pop_back();
8775 AddrOps.push_back(SDValue(NewNode, 0));
8776 AddrOps.push_back(Chain);
8777 auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8778 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8779 Subtarget.isUnalignedMem16Slow())
8780 // Do not introduce a slow unaligned store.
8781 return false;
8782 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8783 // memory access is slow above.
8784 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8785 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8786 SDNode *Store =
8787 DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
8788 dl, MVT::Other, AddrOps);
8789 NewNodes.push_back(Store);
8790
8791 // Preserve memory reference information.
8792 DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs);
8793 }
8794
8795 return true;
8796}
8797
8798unsigned
8800 bool UnfoldStore,
8801 unsigned *LoadRegIndex) const {
8803 if (I == nullptr)
8804 return 0;
8805 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8806 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8807 if (UnfoldLoad && !FoldedLoad)
8808 return 0;
8809 if (UnfoldStore && !FoldedStore)
8810 return 0;
8811 if (LoadRegIndex)
8812 *LoadRegIndex = I->Flags & TB_INDEX_MASK;
8813 return I->DstOp;
8814}
8815
8817 int64_t &Offset1,
8818 int64_t &Offset2) const {
8819 if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
8820 return false;
8821
8822 auto IsLoadOpcode = [&](unsigned Opcode) {
8823 switch (Opcode) {
8824 default:
8825 return false;
8826 case X86::MOV8rm:
8827 case X86::MOV16rm:
8828 case X86::MOV32rm:
8829 case X86::MOV64rm:
8830 case X86::LD_Fp32m:
8831 case X86::LD_Fp64m:
8832 case X86::LD_Fp80m:
8833 case X86::MOVSSrm:
8834 case X86::MOVSSrm_alt:
8835 case X86::MOVSDrm:
8836 case X86::MOVSDrm_alt:
8837 case X86::MMX_MOVD64rm:
8838 case X86::MMX_MOVQ64rm:
8839 case X86::MOVAPSrm:
8840 case X86::MOVUPSrm:
8841 case X86::MOVAPDrm:
8842 case X86::MOVUPDrm:
8843 case X86::MOVDQArm:
8844 case X86::MOVDQUrm:
8845 // AVX load instructions
8846 case X86::VMOVSSrm:
8847 case X86::VMOVSSrm_alt:
8848 case X86::VMOVSDrm:
8849 case X86::VMOVSDrm_alt:
8850 case X86::VMOVAPSrm:
8851 case X86::VMOVUPSrm:
8852 case X86::VMOVAPDrm:
8853 case X86::VMOVUPDrm:
8854 case X86::VMOVDQArm:
8855 case X86::VMOVDQUrm:
8856 case X86::VMOVAPSYrm:
8857 case X86::VMOVUPSYrm:
8858 case X86::VMOVAPDYrm:
8859 case X86::VMOVUPDYrm:
8860 case X86::VMOVDQAYrm:
8861 case X86::VMOVDQUYrm:
8862 // AVX512 load instructions
8863 case X86::VMOVSSZrm:
8864 case X86::VMOVSSZrm_alt:
8865 case X86::VMOVSDZrm:
8866 case X86::VMOVSDZrm_alt:
8867 case X86::VMOVAPSZ128rm:
8868 case X86::VMOVUPSZ128rm:
8869 case X86::VMOVAPSZ128rm_NOVLX:
8870 case X86::VMOVUPSZ128rm_NOVLX:
8871 case X86::VMOVAPDZ128rm:
8872 case X86::VMOVUPDZ128rm:
8873 case X86::VMOVDQU8Z128rm:
8874 case X86::VMOVDQU16Z128rm:
8875 case X86::VMOVDQA32Z128rm:
8876 case X86::VMOVDQU32Z128rm:
8877 case X86::VMOVDQA64Z128rm:
8878 case X86::VMOVDQU64Z128rm:
8879 case X86::VMOVAPSZ256rm:
8880 case X86::VMOVUPSZ256rm:
8881 case X86::VMOVAPSZ256rm_NOVLX:
8882 case X86::VMOVUPSZ256rm_NOVLX:
8883 case X86::VMOVAPDZ256rm:
8884 case X86::VMOVUPDZ256rm:
8885 case X86::VMOVDQU8Z256rm:
8886 case X86::VMOVDQU16Z256rm:
8887 case X86::VMOVDQA32Z256rm:
8888 case X86::VMOVDQU32Z256rm:
8889 case X86::VMOVDQA64Z256rm:
8890 case X86::VMOVDQU64Z256rm:
8891 case X86::VMOVAPSZrm:
8892 case X86::VMOVUPSZrm:
8893 case X86::VMOVAPDZrm:
8894 case X86::VMOVUPDZrm:
8895 case X86::VMOVDQU8Zrm:
8896 case X86::VMOVDQU16Zrm:
8897 case X86::VMOVDQA32Zrm:
8898 case X86::VMOVDQU32Zrm:
8899 case X86::VMOVDQA64Zrm:
8900 case X86::VMOVDQU64Zrm:
8901 case X86::KMOVBkm:
8902 case X86::KMOVBkm_EVEX:
8903 case X86::KMOVWkm:
8904 case X86::KMOVWkm_EVEX:
8905 case X86::KMOVDkm:
8906 case X86::KMOVDkm_EVEX:
8907 case X86::KMOVQkm:
8908 case X86::KMOVQkm_EVEX:
8909 return true;
8910 }
8911 };
8912
8913 if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
8914 !IsLoadOpcode(Load2->getMachineOpcode()))
8915 return false;
8916
8917 // Lambda to check if both the loads have the same value for an operand index.
8918 auto HasSameOp = [&](int I) {
8919 return Load1->getOperand(I) == Load2->getOperand(I);
8920 };
8921
8922 // All operands except the displacement should match.
8923 if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
8924 !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
8925 return false;
8926
8927 // Chain Operand must be the same.
8928 if (!HasSameOp(5))
8929 return false;
8930
8931 // Now let's examine if the displacements are constants.
8932 auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(4));
8933 auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(4));
8934 if (!Disp1 || !Disp2)
8935 return false;
8936
8937 Offset1 = Disp1->getSExtValue();
8938 Offset2 = Disp2->getSExtValue();
8939 return true;
8940}
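// Editorial example (not in the original source): two MOV32rm machine nodes
// that load from [%rdi + 8] and [%rdi + 12] with identical base, scale,
// index, segment and chain operands satisfy the checks above, so the hook
// reports Offset1 = 8 and Offset2 = 12; the scheduler can then consult
// shouldScheduleLoadsNear() below to decide whether to cluster them.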
8941
8942bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
8943 int64_t Offset1, int64_t Offset2,
8944 unsigned NumLoads) const {
8945 assert(Offset2 > Offset1);
8946 if ((Offset2 - Offset1) / 8 > 64)
8947 return false;
8948
8949 unsigned Opc1 = Load1->getMachineOpcode();
8950 unsigned Opc2 = Load2->getMachineOpcode();
8951 if (Opc1 != Opc2)
8952 return false; // FIXME: overly conservative?
8953
8954 switch (Opc1) {
8955 default:
8956 break;
8957 case X86::LD_Fp32m:
8958 case X86::LD_Fp64m:
8959 case X86::LD_Fp80m:
8960 case X86::MMX_MOVD64rm:
8961 case X86::MMX_MOVQ64rm:
8962 return false;
8963 }
8964
8965 EVT VT = Load1->getValueType(0);
8966 switch (VT.getSimpleVT().SimpleTy) {
8967 default:
8968 // XMM registers. In 64-bit mode we can be a bit more aggressive since we
8969 // have 16 of them to play with.
8970 if (Subtarget.is64Bit()) {
8971 if (NumLoads >= 3)
8972 return false;
8973 } else if (NumLoads) {
8974 return false;
8975 }
8976 break;
8977 case MVT::i8:
8978 case MVT::i16:
8979 case MVT::i32:
8980 case MVT::i64:
8981 case MVT::f32:
8982 case MVT::f64:
8983 if (NumLoads)
8984 return false;
8985 break;
8986 }
8987
8988 return true;
8989}
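// Editorial note: the heuristic above only clusters loads with identical
// opcodes, never x87 or MMX loads, and gives up once (Offset2 - Offset1) / 8
// exceeds 64. Scalar integer and FP loads are only clustered while NumLoads
// is still zero, whereas XMM loads tolerate slightly larger groups in 64-bit
// mode, where 16 XMM registers are available.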
8990
8991bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
8992 const MachineBasicBlock *MBB,
8993 const MachineFunction &MF) const {
8994
8995 // ENDBR instructions should not be scheduled around.
8996 unsigned Opcode = MI.getOpcode();
8997 if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
8998 Opcode == X86::PLDTILECFGV)
8999 return true;
9000
9001 // Frame setup and destroy can't be scheduled around.
9002 if (MI.getFlag(MachineInstr::FrameSetup) ||
9003 MI.getFlag(MachineInstr::FrameDestroy))
9004 return true;
9005
9006 return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
9007}
9008
9009bool X86InstrInfo::reverseBranchCondition(
9010 SmallVectorImpl<MachineOperand> &Cond) const {
9011 assert(Cond.size() == 1 && "Invalid X86 branch condition!");
9012 X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
9013 Cond[0].setImm(GetOppositeBranchCondition(CC));
9014 return false;
9015}
9016
9017bool X86InstrInfo::isSafeToMoveRegClassDefs(
9018 const TargetRegisterClass *RC) const {
9019 // FIXME: Return false for x87 stack register classes for now. We can't
9020 // allow any loads of these registers before FpGet_ST0_80.
9021 return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
9022 RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
9023 RC == &X86::RFP80RegClass);
9024}
9025
9026/// Return a virtual register initialized with the
9027/// global base register value. Output instructions required to
9028/// initialize the register in the function entry block, if necessary.
9029///
9030/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
9031///
9032Register X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
9033 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
9034 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
9035 if (GlobalBaseReg)
9036 return GlobalBaseReg;
9037
9038 // Create the register. The code to initialize it is inserted
9039 // later, by the CGBR pass (below).
9040 MachineRegisterInfo &RegInfo = MF->getRegInfo();
9041 GlobalBaseReg = RegInfo.createVirtualRegister(
9042 Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
9043 X86FI->setGlobalBaseReg(GlobalBaseReg);
9044 return GlobalBaseReg;
9045}
9046
9047// FIXME: Some shuffle and unpack instructions have equivalents in different
9048// domains, but they require a bit more work than just switching opcodes.
9049
9050static const uint16_t *lookup(unsigned opcode, unsigned domain,
9051 ArrayRef<uint16_t[3]> Table) {
9052 for (const uint16_t(&Row)[3] : Table)
9053 if (Row[domain - 1] == opcode)
9054 return Row;
9055 return nullptr;
9056}
9057
9058static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
9059 ArrayRef<uint16_t[4]> Table) {
9060 // If this is the integer domain make sure to check both integer columns.
9061 for (const uint16_t(&Row)[4] : Table)
9062 if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
9063 return Row;
9064 return nullptr;
9065}
9066
9067// Helper to attempt to widen/narrow blend masks.
9068static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
9069 unsigned NewWidth, unsigned *pNewMask = nullptr) {
9070 assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
9071 "Illegal blend mask scale");
9072 unsigned NewMask = 0;
9073
9074 if ((OldWidth % NewWidth) == 0) {
9075 unsigned Scale = OldWidth / NewWidth;
9076 unsigned SubMask = (1u << Scale) - 1;
9077 for (unsigned i = 0; i != NewWidth; ++i) {
9078 unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
9079 if (Sub == SubMask)
9080 NewMask |= (1u << i);
9081 else if (Sub != 0x0)
9082 return false;
9083 }
9084 } else {
9085 unsigned Scale = NewWidth / OldWidth;
9086 unsigned SubMask = (1u << Scale) - 1;
9087 for (unsigned i = 0; i != OldWidth; ++i) {
9088 if (OldMask & (1 << i)) {
9089 NewMask |= (SubMask << (i * Scale));
9090 }
9091 }
9092 }
9093
9094 if (pNewMask)
9095 *pNewMask = NewMask;
9096 return true;
9097}
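// Editorial example (not in the original source): widening a 4-bit blend
// mask to 8 bits with AdjustBlendMask(0b0101, /*OldWidth=*/4, /*NewWidth=*/8)
// expands each old bit into Scale = 2 new bits and produces 0b00110011.
// Narrowing only succeeds when every group of Scale old bits is uniform;
// AdjustBlendMask(0b1101, 4, 2) fails because the low pair 0b01 is mixed,
// which is why the blend handling below refuses to switch domains for such
// immediates.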
9098
9099uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
9100 unsigned Opcode = MI.getOpcode();
9101 unsigned NumOperands = MI.getDesc().getNumOperands();
9102
9103 auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
9104 uint16_t validDomains = 0;
9105 if (MI.getOperand(NumOperands - 1).isImm()) {
9106 unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
9107 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
9108 validDomains |= 0x2; // PackedSingle
9109 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
9110 validDomains |= 0x4; // PackedDouble
9111 if (!Is256 || Subtarget.hasAVX2())
9112 validDomains |= 0x8; // PackedInt
9113 }
9114 return validDomains;
9115 };
9116
9117 switch (Opcode) {
9118 case X86::BLENDPDrmi:
9119 case X86::BLENDPDrri:
9120 case X86::VBLENDPDrmi:
9121 case X86::VBLENDPDrri:
9122 return GetBlendDomains(2, false);
9123 case X86::VBLENDPDYrmi:
9124 case X86::VBLENDPDYrri:
9125 return GetBlendDomains(4, true);
9126 case X86::BLENDPSrmi:
9127 case X86::BLENDPSrri:
9128 case X86::VBLENDPSrmi:
9129 case X86::VBLENDPSrri:
9130 case X86::VPBLENDDrmi:
9131 case X86::VPBLENDDrri:
9132 return GetBlendDomains(4, false);
9133 case X86::VBLENDPSYrmi:
9134 case X86::VBLENDPSYrri:
9135 case X86::VPBLENDDYrmi:
9136 case X86::VPBLENDDYrri:
9137 return GetBlendDomains(8, true);
9138 case X86::PBLENDWrmi:
9139 case X86::PBLENDWrri:
9140 case X86::VPBLENDWrmi:
9141 case X86::VPBLENDWrri:
9142 // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
9143 case X86::VPBLENDWYrmi:
9144 case X86::VPBLENDWYrri:
9145 return GetBlendDomains(8, false);
9146 case X86::VPANDDZ128rr:
9147 case X86::VPANDDZ128rm:
9148 case X86::VPANDDZ256rr:
9149 case X86::VPANDDZ256rm:
9150 case X86::VPANDQZ128rr:
9151 case X86::VPANDQZ128rm:
9152 case X86::VPANDQZ256rr:
9153 case X86::VPANDQZ256rm:
9154 case X86::VPANDNDZ128rr:
9155 case X86::VPANDNDZ128rm:
9156 case X86::VPANDNDZ256rr:
9157 case X86::VPANDNDZ256rm:
9158 case X86::VPANDNQZ128rr:
9159 case X86::VPANDNQZ128rm:
9160 case X86::VPANDNQZ256rr:
9161 case X86::VPANDNQZ256rm:
9162 case X86::VPORDZ128rr:
9163 case X86::VPORDZ128rm:
9164 case X86::VPORDZ256rr:
9165 case X86::VPORDZ256rm:
9166 case X86::VPORQZ128rr:
9167 case X86::VPORQZ128rm:
9168 case X86::VPORQZ256rr:
9169 case X86::VPORQZ256rm:
9170 case X86::VPXORDZ128rr:
9171 case X86::VPXORDZ128rm:
9172 case X86::VPXORDZ256rr:
9173 case X86::VPXORDZ256rm:
9174 case X86::VPXORQZ128rr:
9175 case X86::VPXORQZ128rm:
9176 case X86::VPXORQZ256rr:
9177 case X86::VPXORQZ256rm:
9178 // If we don't have DQI see if we can still switch from an EVEX integer
9179 // instruction to a VEX floating point instruction.
9180 if (Subtarget.hasDQI())
9181 return 0;
9182
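  // Editorial note: encoding values of 16 and above denote the EVEX-only
  // registers XMM16-XMM31/YMM16-YMM31, which have no VEX encoding, so the
  // EVEX logic op cannot be rewritten as a VEX floating-point op and no
  // alternative domain is reported.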
9183 if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
9184 return 0;
9185 if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
9186 return 0;
9187 // Register forms will have 3 operands. Memory form will have more.
9188 if (NumOperands == 3 &&
9189 RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
9190 return 0;
9191
9192 // All domains are valid.
9193 return 0xe;
9194 case X86::MOVHLPSrr:
9195 // We can swap domains when both inputs are the same register.
9196 // FIXME: This doesn't catch all the cases we would like. If the input
9197 // register isn't KILLed by the instruction, the two address instruction
9198 // pass puts a COPY on one input. The other input uses the original
9199 // register. This prevents the same physical register from being used by
9200 // both inputs.
9201 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9202 MI.getOperand(0).getSubReg() == 0 &&
9203 MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0)
9204 return 0x6;
9205 return 0;
9206 case X86::SHUFPDrri:
9207 return 0x6;
9208 }
9209 return 0;
9210}
9211
9212#include "X86ReplaceableInstrs.def"
9213
9214bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
9215 unsigned Domain) const {
9216 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9217 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9218 assert(dom && "Not an SSE instruction");
9219
9220 unsigned Opcode = MI.getOpcode();
9221 unsigned NumOperands = MI.getDesc().getNumOperands();
9222
9223 auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
9224 if (MI.getOperand(NumOperands - 1).isImm()) {
9225 unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
9226 Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
9227 unsigned NewImm = Imm;
9228
9229 const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
9230 if (!table)
9231 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9232
9233 if (Domain == 1) { // PackedSingle
9234 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9235 } else if (Domain == 2) { // PackedDouble
9236 AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
9237 } else if (Domain == 3) { // PackedInt
9238 if (Subtarget.hasAVX2()) {
9239 // If we are already VPBLENDW use that, else use VPBLENDD.
9240 if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
9241 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9242 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9243 }
9244 } else {
9245 assert(!Is256 && "128-bit vector expected");
9246 AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
9247 }
9248 }
9249
9250 assert(table && table[Domain - 1] && "Unknown domain op");
9251 MI.setDesc(get(table[Domain - 1]));
9252 MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
9253 }
9254 return true;
9255 };
9256
9257 switch (Opcode) {
9258 case X86::BLENDPDrmi:
9259 case X86::BLENDPDrri:
9260 case X86::VBLENDPDrmi:
9261 case X86::VBLENDPDrri:
9262 return SetBlendDomain(2, false);
9263 case X86::VBLENDPDYrmi:
9264 case X86::VBLENDPDYrri:
9265 return SetBlendDomain(4, true);
9266 case X86::BLENDPSrmi:
9267 case X86::BLENDPSrri:
9268 case X86::VBLENDPSrmi:
9269 case X86::VBLENDPSrri:
9270 case X86::VPBLENDDrmi:
9271 case X86::VPBLENDDrri:
9272 return SetBlendDomain(4, false);
9273 case X86::VBLENDPSYrmi:
9274 case X86::VBLENDPSYrri:
9275 case X86::VPBLENDDYrmi:
9276 case X86::VPBLENDDYrri:
9277 return SetBlendDomain(8, true);
9278 case X86::PBLENDWrmi:
9279 case X86::PBLENDWrri:
9280 case X86::VPBLENDWrmi:
9281 case X86::VPBLENDWrri:
9282 return SetBlendDomain(8, false);
9283 case X86::VPBLENDWYrmi:
9284 case X86::VPBLENDWYrri:
9285 return SetBlendDomain(16, true);
9286 case X86::VPANDDZ128rr:
9287 case X86::VPANDDZ128rm:
9288 case X86::VPANDDZ256rr:
9289 case X86::VPANDDZ256rm:
9290 case X86::VPANDQZ128rr:
9291 case X86::VPANDQZ128rm:
9292 case X86::VPANDQZ256rr:
9293 case X86::VPANDQZ256rm:
9294 case X86::VPANDNDZ128rr:
9295 case X86::VPANDNDZ128rm:
9296 case X86::VPANDNDZ256rr:
9297 case X86::VPANDNDZ256rm:
9298 case X86::VPANDNQZ128rr:
9299 case X86::VPANDNQZ128rm:
9300 case X86::VPANDNQZ256rr:
9301 case X86::VPANDNQZ256rm:
9302 case X86::VPORDZ128rr:
9303 case X86::VPORDZ128rm:
9304 case X86::VPORDZ256rr:
9305 case X86::VPORDZ256rm:
9306 case X86::VPORQZ128rr:
9307 case X86::VPORQZ128rm:
9308 case X86::VPORQZ256rr:
9309 case X86::VPORQZ256rm:
9310 case X86::VPXORDZ128rr:
9311 case X86::VPXORDZ128rm:
9312 case X86::VPXORDZ256rr:
9313 case X86::VPXORDZ256rm:
9314 case X86::VPXORQZ128rr:
9315 case X86::VPXORQZ128rm:
9316 case X86::VPXORQZ256rr:
9317 case X86::VPXORQZ256rm: {
9318 // Without DQI, convert EVEX instructions to VEX instructions.
9319 if (Subtarget.hasDQI())
9320 return false;
9321
9322 const uint16_t *table =
9323 lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs);
9324 assert(table && "Instruction not found in table?");
9325 // Don't change integer Q instructions to D instructions and
9326 // use D instructions if we started with a PS instruction.
9327 if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9328 Domain = 4;
9329 MI.setDesc(get(table[Domain - 1]));
9330 return true;
9331 }
9332 case X86::UNPCKHPDrr:
9333 case X86::MOVHLPSrr:
9334 // We just need to commute the instruction which will switch the domains.
9335 if (Domain != dom && Domain != 3 &&
9336 MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9337 MI.getOperand(0).getSubReg() == 0 &&
9338 MI.getOperand(1).getSubReg() == 0 &&
9339 MI.getOperand(2).getSubReg() == 0) {
9340 commuteInstruction(MI, false);
9341 return true;
9342 }
9343 // We must always return true for MOVHLPSrr.
9344 if (Opcode == X86::MOVHLPSrr)
9345 return true;
9346 break;
9347 case X86::SHUFPDrri: {
9348 if (Domain == 1) {
9349 unsigned Imm = MI.getOperand(3).getImm();
9350 unsigned NewImm = 0x44;
9351 if (Imm & 1)
9352 NewImm |= 0x0a;
9353 if (Imm & 2)
9354 NewImm |= 0xa0;
9355 MI.getOperand(3).setImm(NewImm);
9356 MI.setDesc(get(X86::SHUFPSrri));
9357 }
9358 return true;
9359 }
9360 }
9361 return false;
9362}
9363
9364std::pair<uint16_t, uint16_t>
9365X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
9366 uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9367 unsigned opcode = MI.getOpcode();
9368 uint16_t validDomains = 0;
9369 if (domain) {
9370 // Attempt to match for custom instructions.
9371 validDomains = getExecutionDomainCustom(MI);
9372 if (validDomains)
9373 return std::make_pair(domain, validDomains);
9374
9375 if (lookup(opcode, domain, ReplaceableInstrs)) {
9376 validDomains = 0xe;
9377 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
9378 validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
9379 } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
9380 validDomains = 0x6;
9381 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
9382 // Insert/extract instructions should only affect the domain if AVX2
9383 // is enabled.
9384 if (!Subtarget.hasAVX2())
9385 return std::make_pair(0, 0);
9386 validDomains = 0xe;
9387 } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
9388 validDomains = 0xe;
9389 } else if (Subtarget.hasDQI() &&
9390 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
9391 validDomains = 0xe;
9392 } else if (Subtarget.hasDQI()) {
9393 if (const uint16_t *table =
9394 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) {
9395 if (domain == 1 || (domain == 3 && table[3] == opcode))
9396 validDomains = 0xa;
9397 else
9398 validDomains = 0xc;
9399 }
9400 }
9401 }
9402 return std::make_pair(domain, validDomains);
9403}
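// Editorial note: the pair returned above is (current domain, bitmask of
// attainable domains). Domains are numbered 1 = PackedSingle,
// 2 = PackedDouble, 3 = PackedInt (0 means "no SSE domain"), and bit N of
// the mask means domain N is reachable: 0xe allows all three, 0x6 only the
// two floating-point domains, and 0xa/0xc are the masked AVX-512DQ cases
// handled just above.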
9404
9405void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
9406 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9407 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9408 assert(dom && "Not an SSE instruction");
9409
9410 // Attempt to match for custom instructions.
9411 if (setExecutionDomainCustom(MI, Domain))
9412 return;
9413
9414 const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
9415 if (!table) { // try the other table
9416 assert((Subtarget.hasAVX2() || Domain < 3) &&
9417 "256-bit vector operations only available in AVX2");
9418 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
9419 }
9420 if (!table) { // try the FP table
9421 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
9422 assert((!table || Domain < 3) &&
9423 "Can only select PackedSingle or PackedDouble");
9424 }
9425 if (!table) { // try the other table
9426 assert(Subtarget.hasAVX2() &&
9427 "256-bit insert/extract only available in AVX2");
9428 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
9429 }
9430 if (!table) { // try the AVX512 table
9431 assert(Subtarget.hasAVX512() && "Requires AVX-512");
9432 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
9433 // Don't change integer Q instructions to D instructions.
9434 if (table && Domain == 3 && table[3] == MI.getOpcode())
9435 Domain = 4;
9436 }
9437 if (!table) { // try the AVX512DQ table
9438 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9439 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
9440 // Don't change integer Q instructions to D instructions and
9441 // use D instructions if we started with a PS instruction.
9442 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9443 Domain = 4;
9444 }
9445 if (!table) { // try the AVX512DQMasked table
9446 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9447 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
9448 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9449 Domain = 4;
9450 }
9451 assert(table && "Cannot change domain");
9452 MI.setDesc(get(table[Domain - 1]));
9453}
9454
9460
9461/// Return the noop instruction to use for a noop.
9462MCInst X86InstrInfo::getNop() const {
9463 MCInst Nop;
9464 Nop.setOpcode(X86::NOOP);
9465 return Nop;
9466}
9467
9468bool X86InstrInfo::isHighLatencyDef(int opc) const {
9469 switch (opc) {
9470 default:
9471 return false;
9472 case X86::DIVPDrm:
9473 case X86::DIVPDrr:
9474 case X86::DIVPSrm:
9475 case X86::DIVPSrr:
9476 case X86::DIVSDrm:
9477 case X86::DIVSDrm_Int:
9478 case X86::DIVSDrr:
9479 case X86::DIVSDrr_Int:
9480 case X86::DIVSSrm:
9481 case X86::DIVSSrm_Int:
9482 case X86::DIVSSrr:
9483 case X86::DIVSSrr_Int:
9484 case X86::SQRTPDm:
9485 case X86::SQRTPDr:
9486 case X86::SQRTPSm:
9487 case X86::SQRTPSr:
9488 case X86::SQRTSDm:
9489 case X86::SQRTSDm_Int:
9490 case X86::SQRTSDr:
9491 case X86::SQRTSDr_Int:
9492 case X86::SQRTSSm:
9493 case X86::SQRTSSm_Int:
9494 case X86::SQRTSSr:
9495 case X86::SQRTSSr_Int:
9496 // AVX instructions with high latency
9497 case X86::VDIVPDrm:
9498 case X86::VDIVPDrr:
9499 case X86::VDIVPDYrm:
9500 case X86::VDIVPDYrr:
9501 case X86::VDIVPSrm:
9502 case X86::VDIVPSrr:
9503 case X86::VDIVPSYrm:
9504 case X86::VDIVPSYrr:
9505 case X86::VDIVSDrm:
9506 case X86::VDIVSDrm_Int:
9507 case X86::VDIVSDrr:
9508 case X86::VDIVSDrr_Int:
9509 case X86::VDIVSSrm:
9510 case X86::VDIVSSrm_Int:
9511 case X86::VDIVSSrr:
9512 case X86::VDIVSSrr_Int:
9513 case X86::VSQRTPDm:
9514 case X86::VSQRTPDr:
9515 case X86::VSQRTPDYm:
9516 case X86::VSQRTPDYr:
9517 case X86::VSQRTPSm:
9518 case X86::VSQRTPSr:
9519 case X86::VSQRTPSYm:
9520 case X86::VSQRTPSYr:
9521 case X86::VSQRTSDm:
9522 case X86::VSQRTSDm_Int:
9523 case X86::VSQRTSDr:
9524 case X86::VSQRTSDr_Int:
9525 case X86::VSQRTSSm:
9526 case X86::VSQRTSSm_Int:
9527 case X86::VSQRTSSr:
9528 case X86::VSQRTSSr_Int:
9529 // AVX512 instructions with high latency
9530 case X86::VDIVPDZ128rm:
9531 case X86::VDIVPDZ128rmb:
9532 case X86::VDIVPDZ128rmbk:
9533 case X86::VDIVPDZ128rmbkz:
9534 case X86::VDIVPDZ128rmk:
9535 case X86::VDIVPDZ128rmkz:
9536 case X86::VDIVPDZ128rr:
9537 case X86::VDIVPDZ128rrk:
9538 case X86::VDIVPDZ128rrkz:
9539 case X86::VDIVPDZ256rm:
9540 case X86::VDIVPDZ256rmb:
9541 case X86::VDIVPDZ256rmbk:
9542 case X86::VDIVPDZ256rmbkz:
9543 case X86::VDIVPDZ256rmk:
9544 case X86::VDIVPDZ256rmkz:
9545 case X86::VDIVPDZ256rr:
9546 case X86::VDIVPDZ256rrk:
9547 case X86::VDIVPDZ256rrkz:
9548 case X86::VDIVPDZrrb:
9549 case X86::VDIVPDZrrbk:
9550 case X86::VDIVPDZrrbkz:
9551 case X86::VDIVPDZrm:
9552 case X86::VDIVPDZrmb:
9553 case X86::VDIVPDZrmbk:
9554 case X86::VDIVPDZrmbkz:
9555 case X86::VDIVPDZrmk:
9556 case X86::VDIVPDZrmkz:
9557 case X86::VDIVPDZrr:
9558 case X86::VDIVPDZrrk:
9559 case X86::VDIVPDZrrkz:
9560 case X86::VDIVPSZ128rm:
9561 case X86::VDIVPSZ128rmb:
9562 case X86::VDIVPSZ128rmbk:
9563 case X86::VDIVPSZ128rmbkz:
9564 case X86::VDIVPSZ128rmk:
9565 case X86::VDIVPSZ128rmkz:
9566 case X86::VDIVPSZ128rr:
9567 case X86::VDIVPSZ128rrk:
9568 case X86::VDIVPSZ128rrkz:
9569 case X86::VDIVPSZ256rm:
9570 case X86::VDIVPSZ256rmb:
9571 case X86::VDIVPSZ256rmbk:
9572 case X86::VDIVPSZ256rmbkz:
9573 case X86::VDIVPSZ256rmk:
9574 case X86::VDIVPSZ256rmkz:
9575 case X86::VDIVPSZ256rr:
9576 case X86::VDIVPSZ256rrk:
9577 case X86::VDIVPSZ256rrkz:
9578 case X86::VDIVPSZrrb:
9579 case X86::VDIVPSZrrbk:
9580 case X86::VDIVPSZrrbkz:
9581 case X86::VDIVPSZrm:
9582 case X86::VDIVPSZrmb:
9583 case X86::VDIVPSZrmbk:
9584 case X86::VDIVPSZrmbkz:
9585 case X86::VDIVPSZrmk:
9586 case X86::VDIVPSZrmkz:
9587 case X86::VDIVPSZrr:
9588 case X86::VDIVPSZrrk:
9589 case X86::VDIVPSZrrkz:
9590 case X86::VDIVSDZrm:
9591 case X86::VDIVSDZrr:
9592 case X86::VDIVSDZrm_Int:
9593 case X86::VDIVSDZrmk_Int:
9594 case X86::VDIVSDZrmkz_Int:
9595 case X86::VDIVSDZrr_Int:
9596 case X86::VDIVSDZrrk_Int:
9597 case X86::VDIVSDZrrkz_Int:
9598 case X86::VDIVSDZrrb_Int:
9599 case X86::VDIVSDZrrbk_Int:
9600 case X86::VDIVSDZrrbkz_Int:
9601 case X86::VDIVSSZrm:
9602 case X86::VDIVSSZrr:
9603 case X86::VDIVSSZrm_Int:
9604 case X86::VDIVSSZrmk_Int:
9605 case X86::VDIVSSZrmkz_Int:
9606 case X86::VDIVSSZrr_Int:
9607 case X86::VDIVSSZrrk_Int:
9608 case X86::VDIVSSZrrkz_Int:
9609 case X86::VDIVSSZrrb_Int:
9610 case X86::VDIVSSZrrbk_Int:
9611 case X86::VDIVSSZrrbkz_Int:
9612 case X86::VSQRTPDZ128m:
9613 case X86::VSQRTPDZ128mb:
9614 case X86::VSQRTPDZ128mbk:
9615 case X86::VSQRTPDZ128mbkz:
9616 case X86::VSQRTPDZ128mk:
9617 case X86::VSQRTPDZ128mkz:
9618 case X86::VSQRTPDZ128r:
9619 case X86::VSQRTPDZ128rk:
9620 case X86::VSQRTPDZ128rkz:
9621 case X86::VSQRTPDZ256m:
9622 case X86::VSQRTPDZ256mb:
9623 case X86::VSQRTPDZ256mbk:
9624 case X86::VSQRTPDZ256mbkz:
9625 case X86::VSQRTPDZ256mk:
9626 case X86::VSQRTPDZ256mkz:
9627 case X86::VSQRTPDZ256r:
9628 case X86::VSQRTPDZ256rk:
9629 case X86::VSQRTPDZ256rkz:
9630 case X86::VSQRTPDZm:
9631 case X86::VSQRTPDZmb:
9632 case X86::VSQRTPDZmbk:
9633 case X86::VSQRTPDZmbkz:
9634 case X86::VSQRTPDZmk:
9635 case X86::VSQRTPDZmkz:
9636 case X86::VSQRTPDZr:
9637 case X86::VSQRTPDZrb:
9638 case X86::VSQRTPDZrbk:
9639 case X86::VSQRTPDZrbkz:
9640 case X86::VSQRTPDZrk:
9641 case X86::VSQRTPDZrkz:
9642 case X86::VSQRTPSZ128m:
9643 case X86::VSQRTPSZ128mb:
9644 case X86::VSQRTPSZ128mbk:
9645 case X86::VSQRTPSZ128mbkz:
9646 case X86::VSQRTPSZ128mk:
9647 case X86::VSQRTPSZ128mkz:
9648 case X86::VSQRTPSZ128r:
9649 case X86::VSQRTPSZ128rk:
9650 case X86::VSQRTPSZ128rkz:
9651 case X86::VSQRTPSZ256m:
9652 case X86::VSQRTPSZ256mb:
9653 case X86::VSQRTPSZ256mbk:
9654 case X86::VSQRTPSZ256mbkz:
9655 case X86::VSQRTPSZ256mk:
9656 case X86::VSQRTPSZ256mkz:
9657 case X86::VSQRTPSZ256r:
9658 case X86::VSQRTPSZ256rk:
9659 case X86::VSQRTPSZ256rkz:
9660 case X86::VSQRTPSZm:
9661 case X86::VSQRTPSZmb:
9662 case X86::VSQRTPSZmbk:
9663 case X86::VSQRTPSZmbkz:
9664 case X86::VSQRTPSZmk:
9665 case X86::VSQRTPSZmkz:
9666 case X86::VSQRTPSZr:
9667 case X86::VSQRTPSZrb:
9668 case X86::VSQRTPSZrbk:
9669 case X86::VSQRTPSZrbkz:
9670 case X86::VSQRTPSZrk:
9671 case X86::VSQRTPSZrkz:
9672 case X86::VSQRTSDZm:
9673 case X86::VSQRTSDZm_Int:
9674 case X86::VSQRTSDZmk_Int:
9675 case X86::VSQRTSDZmkz_Int:
9676 case X86::VSQRTSDZr:
9677 case X86::VSQRTSDZr_Int:
9678 case X86::VSQRTSDZrk_Int:
9679 case X86::VSQRTSDZrkz_Int:
9680 case X86::VSQRTSDZrb_Int:
9681 case X86::VSQRTSDZrbk_Int:
9682 case X86::VSQRTSDZrbkz_Int:
9683 case X86::VSQRTSSZm:
9684 case X86::VSQRTSSZm_Int:
9685 case X86::VSQRTSSZmk_Int:
9686 case X86::VSQRTSSZmkz_Int:
9687 case X86::VSQRTSSZr:
9688 case X86::VSQRTSSZr_Int:
9689 case X86::VSQRTSSZrk_Int:
9690 case X86::VSQRTSSZrkz_Int:
9691 case X86::VSQRTSSZrb_Int:
9692 case X86::VSQRTSSZrbk_Int:
9693 case X86::VSQRTSSZrbkz_Int:
9694
9695 case X86::VGATHERDPDYrm:
9696 case X86::VGATHERDPDZ128rm:
9697 case X86::VGATHERDPDZ256rm:
9698 case X86::VGATHERDPDZrm:
9699 case X86::VGATHERDPDrm:
9700 case X86::VGATHERDPSYrm:
9701 case X86::VGATHERDPSZ128rm:
9702 case X86::VGATHERDPSZ256rm:
9703 case X86::VGATHERDPSZrm:
9704 case X86::VGATHERDPSrm:
9705 case X86::VGATHERPF0DPDm:
9706 case X86::VGATHERPF0DPSm:
9707 case X86::VGATHERPF0QPDm:
9708 case X86::VGATHERPF0QPSm:
9709 case X86::VGATHERPF1DPDm:
9710 case X86::VGATHERPF1DPSm:
9711 case X86::VGATHERPF1QPDm:
9712 case X86::VGATHERPF1QPSm:
9713 case X86::VGATHERQPDYrm:
9714 case X86::VGATHERQPDZ128rm:
9715 case X86::VGATHERQPDZ256rm:
9716 case X86::VGATHERQPDZrm:
9717 case X86::VGATHERQPDrm:
9718 case X86::VGATHERQPSYrm:
9719 case X86::VGATHERQPSZ128rm:
9720 case X86::VGATHERQPSZ256rm:
9721 case X86::VGATHERQPSZrm:
9722 case X86::VGATHERQPSrm:
9723 case X86::VPGATHERDDYrm:
9724 case X86::VPGATHERDDZ128rm:
9725 case X86::VPGATHERDDZ256rm:
9726 case X86::VPGATHERDDZrm:
9727 case X86::VPGATHERDDrm:
9728 case X86::VPGATHERDQYrm:
9729 case X86::VPGATHERDQZ128rm:
9730 case X86::VPGATHERDQZ256rm:
9731 case X86::VPGATHERDQZrm:
9732 case X86::VPGATHERDQrm:
9733 case X86::VPGATHERQDYrm:
9734 case X86::VPGATHERQDZ128rm:
9735 case X86::VPGATHERQDZ256rm:
9736 case X86::VPGATHERQDZrm:
9737 case X86::VPGATHERQDrm:
9738 case X86::VPGATHERQQYrm:
9739 case X86::VPGATHERQQZ128rm:
9740 case X86::VPGATHERQQZ256rm:
9741 case X86::VPGATHERQQZrm:
9742 case X86::VPGATHERQQrm:
9743 case X86::VSCATTERDPDZ128mr:
9744 case X86::VSCATTERDPDZ256mr:
9745 case X86::VSCATTERDPDZmr:
9746 case X86::VSCATTERDPSZ128mr:
9747 case X86::VSCATTERDPSZ256mr:
9748 case X86::VSCATTERDPSZmr:
9749 case X86::VSCATTERPF0DPDm:
9750 case X86::VSCATTERPF0DPSm:
9751 case X86::VSCATTERPF0QPDm:
9752 case X86::VSCATTERPF0QPSm:
9753 case X86::VSCATTERPF1DPDm:
9754 case X86::VSCATTERPF1DPSm:
9755 case X86::VSCATTERPF1QPDm:
9756 case X86::VSCATTERPF1QPSm:
9757 case X86::VSCATTERQPDZ128mr:
9758 case X86::VSCATTERQPDZ256mr:
9759 case X86::VSCATTERQPDZmr:
9760 case X86::VSCATTERQPSZ128mr:
9761 case X86::VSCATTERQPSZ256mr:
9762 case X86::VSCATTERQPSZmr:
9763 case X86::VPSCATTERDDZ128mr:
9764 case X86::VPSCATTERDDZ256mr:
9765 case X86::VPSCATTERDDZmr:
9766 case X86::VPSCATTERDQZ128mr:
9767 case X86::VPSCATTERDQZ256mr:
9768 case X86::VPSCATTERDQZmr:
9769 case X86::VPSCATTERQDZ128mr:
9770 case X86::VPSCATTERQDZ256mr:
9771 case X86::VPSCATTERQDZmr:
9772 case X86::VPSCATTERQQZ128mr:
9773 case X86::VPSCATTERQQZ256mr:
9774 case X86::VPSCATTERQQZmr:
9775 return true;
9776 }
9777}
9778
9779bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
9780 const MachineRegisterInfo *MRI,
9781 const MachineInstr &DefMI,
9782 unsigned DefIdx,
9783 const MachineInstr &UseMI,
9784 unsigned UseIdx) const {
9785 return isHighLatencyDef(DefMI.getOpcode());
9786}
9787
9788bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
9789 const MachineBasicBlock *MBB) const {
9790 assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
9791 Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
9792
9793 // Integer binary math/logic instructions have a third source operand:
9794 // the EFLAGS register. That operand must be both defined here and never
9795 // used; ie, it must be dead. If the EFLAGS operand is live, then we can
9796 // not change anything because rearranging the operands could affect other
9797 // instructions that depend on the exact status flags (zero, sign, etc.)
9798 // that are set by using these particular operands with this operation.
9799 const MachineOperand *FlagDef =
9800 Inst.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
9801 assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?");
9802 if (FlagDef && !FlagDef->isDead())
9803 return false;
9804
9805 return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
9806}
9807
9808// TODO: There are many more machine instruction opcodes to match:
9809// 1. Other data types (integer, vectors)
9810// 2. Other math / logic operations (xor, or)
9811// 3. Other forms of the same operation (intrinsics and other variants)
9812bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
9813 bool Invert) const {
9814 if (Invert)
9815 return false;
9816 switch (Inst.getOpcode()) {
9817 CASE_ND(ADD8rr)
9818 CASE_ND(ADD16rr)
9819 CASE_ND(ADD32rr)
9820 CASE_ND(ADD64rr)
9821 CASE_ND(AND8rr)
9822 CASE_ND(AND16rr)
9823 CASE_ND(AND32rr)
9824 CASE_ND(AND64rr)
9825 CASE_ND(OR8rr)
9826 CASE_ND(OR16rr)
9827 CASE_ND(OR32rr)
9828 CASE_ND(OR64rr)
9829 CASE_ND(XOR8rr)
9830 CASE_ND(XOR16rr)
9831 CASE_ND(XOR32rr)
9832 CASE_ND(XOR64rr)
9833 CASE_ND(IMUL16rr)
9834 CASE_ND(IMUL32rr)
9835 CASE_ND(IMUL64rr)
9836 case X86::PANDrr:
9837 case X86::PORrr:
9838 case X86::PXORrr:
9839 case X86::ANDPDrr:
9840 case X86::ANDPSrr:
9841 case X86::ORPDrr:
9842 case X86::ORPSrr:
9843 case X86::XORPDrr:
9844 case X86::XORPSrr:
9845 case X86::PADDBrr:
9846 case X86::PADDWrr:
9847 case X86::PADDDrr:
9848 case X86::PADDQrr:
9849 case X86::PMULLWrr:
9850 case X86::PMULLDrr:
9851 case X86::PMAXSBrr:
9852 case X86::PMAXSDrr:
9853 case X86::PMAXSWrr:
9854 case X86::PMAXUBrr:
9855 case X86::PMAXUDrr:
9856 case X86::PMAXUWrr:
9857 case X86::PMINSBrr:
9858 case X86::PMINSDrr:
9859 case X86::PMINSWrr:
9860 case X86::PMINUBrr:
9861 case X86::PMINUDrr:
9862 case X86::PMINUWrr:
9863 case X86::VPANDrr:
9864 case X86::VPANDYrr:
9865 case X86::VPANDDZ128rr:
9866 case X86::VPANDDZ256rr:
9867 case X86::VPANDDZrr:
9868 case X86::VPANDQZ128rr:
9869 case X86::VPANDQZ256rr:
9870 case X86::VPANDQZrr:
9871 case X86::VPORrr:
9872 case X86::VPORYrr:
9873 case X86::VPORDZ128rr:
9874 case X86::VPORDZ256rr:
9875 case X86::VPORDZrr:
9876 case X86::VPORQZ128rr:
9877 case X86::VPORQZ256rr:
9878 case X86::VPORQZrr:
9879 case X86::VPXORrr:
9880 case X86::VPXORYrr:
9881 case X86::VPXORDZ128rr:
9882 case X86::VPXORDZ256rr:
9883 case X86::VPXORDZrr:
9884 case X86::VPXORQZ128rr:
9885 case X86::VPXORQZ256rr:
9886 case X86::VPXORQZrr:
9887 case X86::VANDPDrr:
9888 case X86::VANDPSrr:
9889 case X86::VANDPDYrr:
9890 case X86::VANDPSYrr:
9891 case X86::VANDPDZ128rr:
9892 case X86::VANDPSZ128rr:
9893 case X86::VANDPDZ256rr:
9894 case X86::VANDPSZ256rr:
9895 case X86::VANDPDZrr:
9896 case X86::VANDPSZrr:
9897 case X86::VORPDrr:
9898 case X86::VORPSrr:
9899 case X86::VORPDYrr:
9900 case X86::VORPSYrr:
9901 case X86::VORPDZ128rr:
9902 case X86::VORPSZ128rr:
9903 case X86::VORPDZ256rr:
9904 case X86::VORPSZ256rr:
9905 case X86::VORPDZrr:
9906 case X86::VORPSZrr:
9907 case X86::VXORPDrr:
9908 case X86::VXORPSrr:
9909 case X86::VXORPDYrr:
9910 case X86::VXORPSYrr:
9911 case X86::VXORPDZ128rr:
9912 case X86::VXORPSZ128rr:
9913 case X86::VXORPDZ256rr:
9914 case X86::VXORPSZ256rr:
9915 case X86::VXORPDZrr:
9916 case X86::VXORPSZrr:
9917 case X86::KADDBkk:
9918 case X86::KADDWkk:
9919 case X86::KADDDkk:
9920 case X86::KADDQkk:
9921 case X86::KANDBkk:
9922 case X86::KANDWkk:
9923 case X86::KANDDkk:
9924 case X86::KANDQkk:
9925 case X86::KORBkk:
9926 case X86::KORWkk:
9927 case X86::KORDkk:
9928 case X86::KORQkk:
9929 case X86::KXORBkk:
9930 case X86::KXORWkk:
9931 case X86::KXORDkk:
9932 case X86::KXORQkk:
9933 case X86::VPADDBrr:
9934 case X86::VPADDWrr:
9935 case X86::VPADDDrr:
9936 case X86::VPADDQrr:
9937 case X86::VPADDBYrr:
9938 case X86::VPADDWYrr:
9939 case X86::VPADDDYrr:
9940 case X86::VPADDQYrr:
9941 case X86::VPADDBZ128rr:
9942 case X86::VPADDWZ128rr:
9943 case X86::VPADDDZ128rr:
9944 case X86::VPADDQZ128rr:
9945 case X86::VPADDBZ256rr:
9946 case X86::VPADDWZ256rr:
9947 case X86::VPADDDZ256rr:
9948 case X86::VPADDQZ256rr:
9949 case X86::VPADDBZrr:
9950 case X86::VPADDWZrr:
9951 case X86::VPADDDZrr:
9952 case X86::VPADDQZrr:
9953 case X86::VPMULLWrr:
9954 case X86::VPMULLWYrr:
9955 case X86::VPMULLWZ128rr:
9956 case X86::VPMULLWZ256rr:
9957 case X86::VPMULLWZrr:
9958 case X86::VPMULLDrr:
9959 case X86::VPMULLDYrr:
9960 case X86::VPMULLDZ128rr:
9961 case X86::VPMULLDZ256rr:
9962 case X86::VPMULLDZrr:
9963 case X86::VPMULLQZ128rr:
9964 case X86::VPMULLQZ256rr:
9965 case X86::VPMULLQZrr:
9966 case X86::VPMAXSBrr:
9967 case X86::VPMAXSBYrr:
9968 case X86::VPMAXSBZ128rr:
9969 case X86::VPMAXSBZ256rr:
9970 case X86::VPMAXSBZrr:
9971 case X86::VPMAXSDrr:
9972 case X86::VPMAXSDYrr:
9973 case X86::VPMAXSDZ128rr:
9974 case X86::VPMAXSDZ256rr:
9975 case X86::VPMAXSDZrr:
9976 case X86::VPMAXSQZ128rr:
9977 case X86::VPMAXSQZ256rr:
9978 case X86::VPMAXSQZrr:
9979 case X86::VPMAXSWrr:
9980 case X86::VPMAXSWYrr:
9981 case X86::VPMAXSWZ128rr:
9982 case X86::VPMAXSWZ256rr:
9983 case X86::VPMAXSWZrr:
9984 case X86::VPMAXUBrr:
9985 case X86::VPMAXUBYrr:
9986 case X86::VPMAXUBZ128rr:
9987 case X86::VPMAXUBZ256rr:
9988 case X86::VPMAXUBZrr:
9989 case X86::VPMAXUDrr:
9990 case X86::VPMAXUDYrr:
9991 case X86::VPMAXUDZ128rr:
9992 case X86::VPMAXUDZ256rr:
9993 case X86::VPMAXUDZrr:
9994 case X86::VPMAXUQZ128rr:
9995 case X86::VPMAXUQZ256rr:
9996 case X86::VPMAXUQZrr:
9997 case X86::VPMAXUWrr:
9998 case X86::VPMAXUWYrr:
9999 case X86::VPMAXUWZ128rr:
10000 case X86::VPMAXUWZ256rr:
10001 case X86::VPMAXUWZrr:
10002 case X86::VPMINSBrr:
10003 case X86::VPMINSBYrr:
10004 case X86::VPMINSBZ128rr:
10005 case X86::VPMINSBZ256rr:
10006 case X86::VPMINSBZrr:
10007 case X86::VPMINSDrr:
10008 case X86::VPMINSDYrr:
10009 case X86::VPMINSDZ128rr:
10010 case X86::VPMINSDZ256rr:
10011 case X86::VPMINSDZrr:
10012 case X86::VPMINSQZ128rr:
10013 case X86::VPMINSQZ256rr:
10014 case X86::VPMINSQZrr:
10015 case X86::VPMINSWrr:
10016 case X86::VPMINSWYrr:
10017 case X86::VPMINSWZ128rr:
10018 case X86::VPMINSWZ256rr:
10019 case X86::VPMINSWZrr:
10020 case X86::VPMINUBrr:
10021 case X86::VPMINUBYrr:
10022 case X86::VPMINUBZ128rr:
10023 case X86::VPMINUBZ256rr:
10024 case X86::VPMINUBZrr:
10025 case X86::VPMINUDrr:
10026 case X86::VPMINUDYrr:
10027 case X86::VPMINUDZ128rr:
10028 case X86::VPMINUDZ256rr:
10029 case X86::VPMINUDZrr:
10030 case X86::VPMINUQZ128rr:
10031 case X86::VPMINUQZ256rr:
10032 case X86::VPMINUQZrr:
10033 case X86::VPMINUWrr:
10034 case X86::VPMINUWYrr:
10035 case X86::VPMINUWZ128rr:
10036 case X86::VPMINUWZ256rr:
10037 case X86::VPMINUWZrr:
10038 // Normal min/max instructions are not commutative because of NaN and signed
10039 // zero semantics, but these are. Thus, there's no need to check for global
10040 // relaxed math; the instructions themselves have the properties we need.
10041 case X86::MAXCPDrr:
10042 case X86::MAXCPSrr:
10043 case X86::MAXCSDrr:
10044 case X86::MAXCSSrr:
10045 case X86::MINCPDrr:
10046 case X86::MINCPSrr:
10047 case X86::MINCSDrr:
10048 case X86::MINCSSrr:
10049 case X86::VMAXCPDrr:
10050 case X86::VMAXCPSrr:
10051 case X86::VMAXCPDYrr:
10052 case X86::VMAXCPSYrr:
10053 case X86::VMAXCPDZ128rr:
10054 case X86::VMAXCPSZ128rr:
10055 case X86::VMAXCPDZ256rr:
10056 case X86::VMAXCPSZ256rr:
10057 case X86::VMAXCPDZrr:
10058 case X86::VMAXCPSZrr:
10059 case X86::VMAXCSDrr:
10060 case X86::VMAXCSSrr:
10061 case X86::VMAXCSDZrr:
10062 case X86::VMAXCSSZrr:
10063 case X86::VMINCPDrr:
10064 case X86::VMINCPSrr:
10065 case X86::VMINCPDYrr:
10066 case X86::VMINCPSYrr:
10067 case X86::VMINCPDZ128rr:
10068 case X86::VMINCPSZ128rr:
10069 case X86::VMINCPDZ256rr:
10070 case X86::VMINCPSZ256rr:
10071 case X86::VMINCPDZrr:
10072 case X86::VMINCPSZrr:
10073 case X86::VMINCSDrr:
10074 case X86::VMINCSSrr:
10075 case X86::VMINCSDZrr:
10076 case X86::VMINCSSZrr:
10077 case X86::VMAXCPHZ128rr:
10078 case X86::VMAXCPHZ256rr:
10079 case X86::VMAXCPHZrr:
10080 case X86::VMAXCSHZrr:
10081 case X86::VMINCPHZ128rr:
10082 case X86::VMINCPHZ256rr:
10083 case X86::VMINCPHZrr:
10084 case X86::VMINCSHZrr:
10085 return true;
10086 case X86::ADDPDrr:
10087 case X86::ADDPSrr:
10088 case X86::ADDSDrr:
10089 case X86::ADDSSrr:
10090 case X86::MULPDrr:
10091 case X86::MULPSrr:
10092 case X86::MULSDrr:
10093 case X86::MULSSrr:
10094 case X86::VADDPDrr:
10095 case X86::VADDPSrr:
10096 case X86::VADDPDYrr:
10097 case X86::VADDPSYrr:
10098 case X86::VADDPDZ128rr:
10099 case X86::VADDPSZ128rr:
10100 case X86::VADDPDZ256rr:
10101 case X86::VADDPSZ256rr:
10102 case X86::VADDPDZrr:
10103 case X86::VADDPSZrr:
10104 case X86::VADDSDrr:
10105 case X86::VADDSSrr:
10106 case X86::VADDSDZrr:
10107 case X86::VADDSSZrr:
10108 case X86::VMULPDrr:
10109 case X86::VMULPSrr:
10110 case X86::VMULPDYrr:
10111 case X86::VMULPSYrr:
10112 case X86::VMULPDZ128rr:
10113 case X86::VMULPSZ128rr:
10114 case X86::VMULPDZ256rr:
10115 case X86::VMULPSZ256rr:
10116 case X86::VMULPDZrr:
10117 case X86::VMULPSZrr:
10118 case X86::VMULSDrr:
10119 case X86::VMULSSrr:
10120 case X86::VMULSDZrr:
10121 case X86::VMULSSZrr:
10122 case X86::VADDPHZ128rr:
10123 case X86::VADDPHZ256rr:
10124 case X86::VADDPHZrr:
10125 case X86::VADDSHZrr:
10126 case X86::VMULPHZ128rr:
10127 case X86::VMULPHZ256rr:
10128 case X86::VMULPHZrr:
10129 case X86::VMULSHZrr:
10130 return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
10131 Inst.getFlag(MachineInstr::MIFlag::FmNsz);
10132 default:
10133 return false;
10134 }
10135}
10136
10137/// If \p DescribedReg overlaps with the MOVrr instruction's destination
10138/// register then, if possible, describe the value in terms of the source
10139/// register.
10140static std::optional<ParamLoadedValue>
10141describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg,
10142 const TargetRegisterInfo *TRI) {
10143 Register DestReg = MI.getOperand(0).getReg();
10144 Register SrcReg = MI.getOperand(1).getReg();
10145
10146 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10147
10148 // If the described register is the destination, just return the source.
10149 if (DestReg == DescribedReg)
10150 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10151
10152 // If the described register is a sub-register of the destination register,
10153 // then pick out the source register's corresponding sub-register.
10154 if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
10155 Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
10156 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
10157 }
10158
10159 // The remaining case to consider is when the described register is a
10160 // super-register of the destination register. MOV8rr and MOV16rr do not
10161 // write to any of the other bytes in the register, meaning that we'd have to
10162 // describe the value using a combination of the source register and the
10163 // non-overlapping bits in the described register, which is not currently
10164 // possible.
10165 if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
10166 !TRI->isSuperRegister(DestReg, DescribedReg))
10167 return std::nullopt;
10168
10169 assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
10170 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10171}
10172
10173std::optional<ParamLoadedValue>
10174X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const {
10175 const MachineOperand *Op = nullptr;
10176 DIExpression *Expr = nullptr;
10177
10178 const TargetRegisterInfo *TRI = &getRegisterInfo();
10179
10180 switch (MI.getOpcode()) {
10181 case X86::LEA32r:
10182 case X86::LEA64r:
10183 case X86::LEA64_32r: {
10184 // We may need to describe a 64-bit parameter with a 32-bit LEA.
10185 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10186 return std::nullopt;
10187
10188 // Operand 4 could be global address. For now we do not support
10189 // such situation.
10190 if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
10191 return std::nullopt;
10192
10193 const MachineOperand &Op1 = MI.getOperand(1);
10194 const MachineOperand &Op2 = MI.getOperand(3);
10195 assert(Op2.isReg() &&
10196 (Op2.getReg() == X86::NoRegister || Op2.getReg().isPhysical()));
10197
10198 // Omit situations like:
10199 // %rsi = lea %rsi, 4, ...
10200 if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
10201 Op2.getReg() == MI.getOperand(0).getReg())
10202 return std::nullopt;
10203 else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
10204 TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
10205 (Op2.getReg() != X86::NoRegister &&
10206 TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
10207 return std::nullopt;
10208
10209 int64_t Coef = MI.getOperand(2).getImm();
10210 int64_t Offset = MI.getOperand(4).getImm();
10211 SmallVector<uint64_t, 8> Ops;
10212
10213 if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
10214 Op = &Op1;
10215 } else if (Op1.isFI())
10216 Op = &Op1;
10217
10218 if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
10219 Ops.push_back(dwarf::DW_OP_constu);
10220 Ops.push_back(Coef + 1);
10221 Ops.push_back(dwarf::DW_OP_mul);
10222 } else {
10223 if (Op && Op2.getReg() != X86::NoRegister) {
10224 int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
10225 if (dwarfReg < 0)
10226 return std::nullopt;
10227 else if (dwarfReg < 32) {
10228 Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
10229 Ops.push_back(0);
10230 } else {
10231 Ops.push_back(dwarf::DW_OP_bregx);
10232 Ops.push_back(dwarfReg);
10233 Ops.push_back(0);
10234 }
10235 } else if (!Op) {
10236 assert(Op2.getReg() != X86::NoRegister);
10237 Op = &Op2;
10238 }
10239
10240 if (Coef > 1) {
10241 assert(Op2.getReg() != X86::NoRegister);
10242 Ops.push_back(dwarf::DW_OP_constu);
10243 Ops.push_back(Coef);
10244 Ops.push_back(dwarf::DW_OP_mul);
10245 }
10246
10247 if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
10248 Op2.getReg() != X86::NoRegister) {
10249 Ops.push_back(dwarf::DW_OP_plus);
10250 }
10251 }
10252
10253 DIExpression::appendOffset(Ops, Offset);
10254 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
10255
10256 return ParamLoadedValue(*Op, Expr);
10257 }
10258 case X86::MOV8ri:
10259 case X86::MOV16ri:
10260 // TODO: Handle MOV8ri and MOV16ri.
10261 return std::nullopt;
10262 case X86::MOV32ri:
10263 case X86::MOV64ri:
10264 case X86::MOV64ri32:
10265 // MOV32ri may be used for producing zero-extended 32-bit immediates in
10266 // 64-bit parameters, so we need to consider super-registers.
10267 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10268 return std::nullopt;
10269 return ParamLoadedValue(MI.getOperand(1), Expr);
10270 case X86::MOV8rr:
10271 case X86::MOV16rr:
10272 case X86::MOV32rr:
10273 case X86::MOV64rr:
10274 return describeMOVrrLoadedValue(MI, Reg, TRI);
10275 case X86::XOR32rr: {
10276 // 64-bit parameters are zero-materialized using XOR32rr, so also consider
10277 // super-registers.
10278 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10279 return std::nullopt;
10280 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
10281 return ParamLoadedValue(MachineOperand::CreateImm(0), Expr);
10282 return std::nullopt;
10283 }
10284 case X86::MOVSX64rr32: {
10285 // We may need to describe the lower 32 bits of the MOVSX; for example, in
10286 // cases like this:
10287 //
10288 // $ebx = [...]
10289 // $rdi = MOVSX64rr32 $ebx
10290 // $esi = MOV32rr $edi
10291 if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
10292 return std::nullopt;
10293
10294 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10295
10296 // If the described register is the destination register we need to
10297 // sign-extend the source register from 32 bits. The other case we handle
10298 // is when the described register is the 32-bit sub-register of the
10299 // destination register, in case we just need to return the source
10300 // register.
10301 if (Reg == MI.getOperand(0).getReg())
10302 Expr = DIExpression::appendExt(Expr, 32, 64, true);
10303 else
10304 assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) &&
10305 "Unhandled sub-register case for MOVSX64rr32");
10306
10307 return ParamLoadedValue(MI.getOperand(1), Expr);
10308 }
10309 default:
10310 assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
10311 return TargetInstrInfo::describeLoadedValue(MI, Reg);
10312 }
10313}
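// Editorial example (not in the original source): for an LEA such as
//   $rax = LEA64r $rdi, 1, $rsi, 8, $noreg
// the code above describes $rax as the value of $rdi combined with the
// expression {DW_OP_breg<rsi> 0, DW_OP_plus, DW_OP_plus_uconst 8}, i.e.
// rdi + rsi + 8, while a self-referencing form such as
// $rdi = LEA64r $rdi, 4, ... is rejected because the source value has
// already been overwritten by the instruction itself.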
10314
10315/// This is an architecture-specific helper function of reassociateOps.
10316/// Set special operand attributes for new instructions after reassociation.
10317void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
10318 MachineInstr &OldMI2,
10319 MachineInstr &NewMI1,
10320 MachineInstr &NewMI2) const {
10321 // Integer instructions may define an implicit EFLAGS dest register operand.
10322 MachineOperand *OldFlagDef1 =
10323 OldMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10324 MachineOperand *OldFlagDef2 =
10325 OldMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10326
10327 assert(!OldFlagDef1 == !OldFlagDef2 &&
10328 "Unexpected instruction type for reassociation");
10329
10330 if (!OldFlagDef1 || !OldFlagDef2)
10331 return;
10332
10333 assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
10334 "Must have dead EFLAGS operand in reassociable instruction");
10335
10336 MachineOperand *NewFlagDef1 =
10337 NewMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10338 MachineOperand *NewFlagDef2 =
10339 NewMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10340
10341 assert(NewFlagDef1 && NewFlagDef2 &&
10342 "Unexpected operand in reassociable instruction");
10343
10344 // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
10345 // of this pass or other passes. The EFLAGS operands must be dead in these new
10346 // instructions because the EFLAGS operands in the original instructions must
10347 // be dead in order for reassociation to occur.
10348 NewFlagDef1->setIsDead();
10349 NewFlagDef2->setIsDead();
10350}
10351
10352std::pair<unsigned, unsigned>
10353X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10354 return std::make_pair(TF, 0u);
10355}
10356
10357ArrayRef<std::pair<unsigned, const char *>>
10358X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
10359 using namespace X86II;
10360 static const std::pair<unsigned, const char *> TargetFlags[] = {
10361 {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
10362 {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
10363 {MO_GOT, "x86-got"},
10364 {MO_GOTOFF, "x86-gotoff"},
10365 {MO_GOTPCREL, "x86-gotpcrel"},
10366 {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"},
10367 {MO_PLT, "x86-plt"},
10368 {MO_TLSGD, "x86-tlsgd"},
10369 {MO_TLSLD, "x86-tlsld"},
10370 {MO_TLSLDM, "x86-tlsldm"},
10371 {MO_GOTTPOFF, "x86-gottpoff"},
10372 {MO_INDNTPOFF, "x86-indntpoff"},
10373 {MO_TPOFF, "x86-tpoff"},
10374 {MO_DTPOFF, "x86-dtpoff"},
10375 {MO_NTPOFF, "x86-ntpoff"},
10376 {MO_GOTNTPOFF, "x86-gotntpoff"},
10377 {MO_DLLIMPORT, "x86-dllimport"},
10378 {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
10379 {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
10380 {MO_TLVP, "x86-tlvp"},
10381 {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
10382 {MO_SECREL, "x86-secrel"},
10383 {MO_COFFSTUB, "x86-coffstub"}};
10384 return ArrayRef(TargetFlags);
10385}
10386
10387namespace {
10388/// Create Global Base Reg pass. This initializes the PIC
10389/// global base register for x86-32.
10390struct CGBR : public MachineFunctionPass {
10391 static char ID;
10392 CGBR() : MachineFunctionPass(ID) {}
10393
10394 bool runOnMachineFunction(MachineFunction &MF) override {
10395 const X86TargetMachine *TM =
10396 static_cast<const X86TargetMachine *>(&MF.getTarget());
10397 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
10398
10399 // Only emit a global base reg in PIC mode.
10400 if (!TM->isPositionIndependent())
10401 return false;
10402
10403 X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
10404 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
10405
10406 // If we didn't need a GlobalBaseReg, don't insert code.
10407 if (GlobalBaseReg == 0)
10408 return false;
10409
10410 // Insert the set of GlobalBaseReg into the first MBB of the function
10411 MachineBasicBlock &FirstMBB = MF.front();
10412 MachineBasicBlock::iterator MBBI = FirstMBB.begin();
10413 DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
10414 MachineRegisterInfo &RegInfo = MF.getRegInfo();
10415 const X86InstrInfo *TII = STI.getInstrInfo();
10416
10417 Register PC;
10418 if (STI.isPICStyleGOT())
10419 PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
10420 else
10421 PC = GlobalBaseReg;
10422
10423 if (STI.is64Bit()) {
10424 if (TM->getCodeModel() == CodeModel::Large) {
10425 // In the large code model, we are aiming for this code, though the
10426 // register allocation may vary:
10427 // leaq .LN$pb(%rip), %rax
10428 // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
10429 // addq %rcx, %rax
10430 // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
10431 Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10432 Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10433 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
10434 .addReg(X86::RIP)
10435 .addImm(0)
10436 .addReg(0)
10437 .addSym(MF.getPICBaseSymbol())
10438 .addReg(0);
10439 std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
10440 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
10441 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10442 X86II::MO_PIC_BASE_OFFSET);
10443 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
10444 .addReg(PBReg, RegState::Kill)
10445 .addReg(GOTReg, RegState::Kill);
10446 } else {
10447 // In other code models, use a RIP-relative LEA to materialize the
10448 // GOT.
10449 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
10450 .addReg(X86::RIP)
10451 .addImm(0)
10452 .addReg(0)
10453 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
10454 .addReg(0);
10455 }
10456 } else {
10457 // Operand of MovePCtoStack is completely ignored by asm printer. It's
10458 // only used in JIT code emission as displacement to pc.
10459 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
10460
10461 // If we're using vanilla 'GOT' PIC style, we should use relative
10462 // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
10463 if (STI.isPICStyleGOT()) {
10464 // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel],
10465 // %some_register
10466 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
10467 .addReg(PC)
10468 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10469 X86II::MO_GOT_ABSOLUTE_ADDRESS);
10470 }
10471 }
10472
10473 return true;
10474 }
10475
10476 StringRef getPassName() const override {
10477 return "X86 PIC Global Base Reg Initialization";
10478 }
10479
10480 void getAnalysisUsage(AnalysisUsage &AU) const override {
10481 AU.setPreservesCFG();
10482 MachineFunctionPass::getAnalysisUsage(AU);
10483 }
10484};
10485} // namespace
10486
10487char CGBR::ID = 0;
10488FunctionPass *llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
10489
10490namespace {
10491struct LDTLSCleanup : public MachineFunctionPass {
10492 static char ID;
10493 LDTLSCleanup() : MachineFunctionPass(ID) {}
10494
10495 bool runOnMachineFunction(MachineFunction &MF) override {
10496 if (skipFunction(MF.getFunction()))
10497 return false;
10498
10499 X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
10500 if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
10501 // No point folding accesses if there aren't at least two.
10502 return false;
10503 }
10504
10505 MachineDominatorTree *DT =
10506 &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
10507 return VisitNode(DT->getRootNode(), Register());
10508 }
10509
10510 // Visit the dominator subtree rooted at Node in pre-order.
10511 // If TLSBaseAddrReg is non-null, then use that to replace any
10512 // TLS_base_addr instructions. Otherwise, create the register
10513 // when the first such instruction is seen, and then use it
10514 // as we encounter more instructions.
10515 bool VisitNode(MachineDomTreeNode *Node, Register TLSBaseAddrReg) {
10516 MachineBasicBlock *BB = Node->getBlock();
10517 bool Changed = false;
10518
10519 // Traverse the current block.
10520 for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
10521 ++I) {
10522 switch (I->getOpcode()) {
10523 case X86::TLS_base_addr32:
10524 case X86::TLS_base_addr64:
10525 if (TLSBaseAddrReg)
10526 I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
10527 else
10528 I = SetRegister(*I, &TLSBaseAddrReg);
10529 Changed = true;
10530 break;
10531 default:
10532 break;
10533 }
10534 }
10535
10536 // Visit the children of this block in the dominator tree.
10537 for (auto &I : *Node) {
10538 Changed |= VisitNode(I, TLSBaseAddrReg);
10539 }
10540
10541 return Changed;
10542 }
10543
10544 // Replace the TLS_base_addr instruction I with a copy from
10545 // TLSBaseAddrReg, returning the new instruction.
10546 MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
10547 Register TLSBaseAddrReg) {
10548 MachineFunction *MF = I.getParent()->getParent();
10549 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10550 const bool is64Bit = STI.is64Bit();
10551 const X86InstrInfo *TII = STI.getInstrInfo();
10552
10553 // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
10554 MachineInstr *Copy =
10555 BuildMI(*I.getParent(), I, I.getDebugLoc(),
10556 TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
10557 .addReg(TLSBaseAddrReg);
10558
10559 // Erase the TLS_base_addr instruction.
10560 I.eraseFromParent();
10561
10562 return Copy;
10563 }
10564
10565 // Create a virtual register in *TLSBaseAddrReg, and populate it by
10566 // inserting a copy instruction after I. Returns the new instruction.
10567 MachineInstr *SetRegister(MachineInstr &I, Register *TLSBaseAddrReg) {
10568 MachineFunction *MF = I.getParent()->getParent();
10569 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10570 const bool is64Bit = STI.is64Bit();
10571 const X86InstrInfo *TII = STI.getInstrInfo();
10572
10573 // Create a virtual register for the TLS base address.
10574 MachineRegisterInfo &RegInfo = MF->getRegInfo();
10575 *TLSBaseAddrReg = RegInfo.createVirtualRegister(
10576 is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass);
10577
10578 // Insert a copy from RAX/EAX to TLSBaseAddrReg.
10579 MachineInstr *Next = I.getNextNode();
10580 MachineInstr *Copy = BuildMI(*I.getParent(), Next, I.getDebugLoc(),
10581 TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
10582 .addReg(is64Bit ? X86::RAX : X86::EAX);
10583
10584 return Copy;
10585 }
10586
10587 StringRef getPassName() const override {
10588 return "Local Dynamic TLS Access Clean-up";
10589 }
10590
10591 void getAnalysisUsage(AnalysisUsage &AU) const override {
10592 AU.setPreservesCFG();
10593 AU.addRequired<MachineDominatorTreeWrapperPass>();
10594 MachineFunctionPass::getAnalysisUsage(AU);
10595 }
10596};
10597} // namespace
10598
10599char LDTLSCleanup::ID = 0;
10600FunctionPass *llvm::createCleanupLocalDynamicTLSPass() {
10601 return new LDTLSCleanup();
10602}
10603
10604/// Constants defining how certain sequences should be outlined.
10605///
10606/// \p MachineOutlinerDefault implies that the function is called with a call
10607/// instruction, and a return must be emitted for the outlined function frame.
10608///
10609/// That is,
10610///
10611/// I1 OUTLINED_FUNCTION:
10612/// I2 --> call OUTLINED_FUNCTION I1
10613/// I3 I2
10614/// I3
10615/// ret
10616///
10617/// * Call construction overhead: 1 (call instruction)
10618/// * Frame construction overhead: 1 (return instruction)
10619///
10620/// \p MachineOutlinerTailCall implies that the function is being tail called.
10621/// A jump is emitted instead of a call, and the return is already present in
10622/// the outlined sequence. That is,
10623///
10624/// I1 OUTLINED_FUNCTION:
10625/// I2 --> jmp OUTLINED_FUNCTION I1
10626/// ret I2
10627/// ret
10628///
10629/// * Call construction overhead: 1 (jump instruction)
10630/// * Frame construction overhead: 0 (don't need to return)
10631///
10632enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall };
10633
10634std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10635X86InstrInfo::getOutliningCandidateInfo(
10636 const MachineModuleInfo &MMI,
10637 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10638 unsigned MinRepeats) const {
10639 unsigned SequenceSize = 0;
10640 for (auto &MI : RepeatedSequenceLocs[0]) {
10641 // FIXME: x86 doesn't implement getInstSizeInBytes, so
10642 // we can't tell the cost. Just assume each instruction
10643 // is one byte.
10644 if (MI.isDebugInstr() || MI.isKill())
10645 continue;
10646 SequenceSize += 1;
10647 }
10648
10649 // We check to see if CFI Instructions are present, and if they are
10650 // we find the number of CFI Instructions in the candidates.
10651 unsigned CFICount = 0;
10652 for (auto &I : RepeatedSequenceLocs[0]) {
10653 if (I.isCFIInstruction())
10654 CFICount++;
10655 }
10656
10657 // We compare the number of found CFI Instructions to the number of CFI
10658 // instructions in the parent function for each candidate. We must check this
10659 // since if we outline one of the CFI instructions in a function, we have to
10660 // outline them all for correctness. If we do not, the address offsets will be
10661 // incorrect between the two sections of the program.
10662 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10663 std::vector<MCCFIInstruction> CFIInstructions =
10664 C.getMF()->getFrameInstructions();
10665
10666 if (CFICount > 0 && CFICount != CFIInstructions.size())
10667 return std::nullopt;
10668 }
10669
10670 // FIXME: Use real size in bytes for call and ret instructions.
10671 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10672 for (outliner::Candidate &C : RepeatedSequenceLocs)
10673 C.setCallInfo(MachineOutlinerTailCall, 1);
10674
10675 return std::make_unique<outliner::OutlinedFunction>(
10676 RepeatedSequenceLocs, SequenceSize,
10677 0, // Number of bytes to emit frame.
10678 MachineOutlinerTailCall // Type of frame.
10679 );
10680 }
10681
10682 if (CFICount > 0)
10683 return std::nullopt;
10684
10685 for (outliner::Candidate &C : RepeatedSequenceLocs)
10686 C.setCallInfo(MachineOutlinerDefault, 1);
10687
10688 return std::make_unique<outliner::OutlinedFunction>(
10689 RepeatedSequenceLocs, SequenceSize, 1, MachineOutlinerDefault);
10690}
10691
10692bool X86InstrInfo::isFunctionSafeToOutlineFrom(
10693 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10694 const Function &F = MF.getFunction();
10695
10696 // Does the function use a red zone? If it does, then we can't risk messing
10697 // with the stack.
10698 if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
10699 // It could have a red zone. If it does, then we don't want to touch it.
10700 const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
10701 if (!X86FI || X86FI->getUsesRedZone())
10702 return false;
10703 }
10704
10705 // If we *don't* want to outline from things that could potentially be deduped
10706 // then return false.
10707 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10708 return false;
10709
10710 // This function is viable for outlining, so return true.
10711 return true;
10712}
10713
10714outliner::InstrType
10715X86InstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
10716 MachineBasicBlock::iterator &MIT,
10717 unsigned Flags) const {
10718 MachineInstr &MI = *MIT;
10719
10720 // Is this a terminator for a basic block?
10721 if (MI.isTerminator())
10722 // TargetInstrInfo::getOutliningType has already filtered out anything
10723 // that would break this, so we can allow it here.
10724 return outliner::InstrType::Legal;
10725
10726 // Don't outline anything that modifies or reads from the stack pointer.
10727 //
10728 // FIXME: There are instructions which are being manually built without
10729 // explicit uses/defs so we also have to check the MCInstrDesc. We should be
10730 // able to remove the extra checks once those are fixed up. For example,
10731 // sometimes we might get something like %rax = POP64r 1. This won't be
10732 // caught by modifiesRegister or readsRegister even though the instruction
10733 // really ought to be formed so that modifiesRegister/readsRegister would
10734 // catch it.
10735 if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
10736 MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
10737 MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
10738 return outliner::InstrType::Illegal;
10739
10740 // Outlined calls change the instruction pointer, so don't read from it.
10741 if (MI.readsRegister(X86::RIP, &RI) ||
10742 MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
10743 MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
10744 return outliner::InstrType::Illegal;
10745
10746 // Don't outline CFI instructions.
10747 if (MI.isCFIInstruction())
10748 return outliner::InstrType::Illegal;
10749
10750 return outliner::InstrType::Legal;
10751}
10752
10753void X86InstrInfo::buildOutlinedFrame(
10754 MachineBasicBlock &MBB, MachineFunction &MF,
10755 const outliner::OutlinedFunction &OF) const {
10756 // If we're a tail call, we already have a return, so don't do anything.
10757 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10758 return;
10759
10760 // We're a normal call, so our sequence doesn't have a return instruction.
10761 // Add it in.
10762 MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
10763 MBB.insert(MBB.end(), retq);
10764}
10765
10766MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(
10767 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10768 MachineFunction &MF, outliner::Candidate &C) const {
10769 // Is it a tail call?
10770 if (C.CallConstructionID == MachineOutlinerTailCall) {
10771 // Yes, just insert a JMP.
10772 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
10773 .addGlobalAddress(M.getNamedValue(MF.getName())));
10774 } else {
10775 // No, insert a call.
10776 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
10777 .addGlobalAddress(M.getNamedValue(MF.getName())));
10778 }
10779
10780 return It;
10781}
10782
10783void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10784 MachineBasicBlock::iterator Iter,
10785 DebugLoc &DL,
10786 bool AllowSideEffects) const {
10787 const MachineFunction &MF = *MBB.getParent();
10788 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
10789 const TargetRegisterInfo &TRI = getRegisterInfo();
10790
10791 if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
10792 // FIXME: Should we ignore MMX registers?
10793 return;
10794
10795 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10796 // Convert register to the 32-bit version. Both 'movl' and 'xorl' clear the
10797 // upper bits of a 64-bit register automagically.
10798 Reg = getX86SubSuperRegister(Reg, 32);
10799
10800 if (!AllowSideEffects)
10801 // XOR affects flags, so use a MOV instead.
10802 BuildMI(MBB, Iter, DL, get(X86::MOV32ri), Reg).addImm(0);
10803 else
10804 BuildMI(MBB, Iter, DL, get(X86::XOR32rr), Reg)
10805 .addReg(Reg, RegState::Undef)
10806 .addReg(Reg, RegState::Undef);
10807 } else if (X86::VR128RegClass.contains(Reg)) {
10808 // XMM#
10809 if (!ST.hasSSE1())
10810 return;
10811
10812 // PXOR is safe to use because it doesn't affect flags.
10813 BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
10814 .addReg(Reg, RegState::Undef)
10815 .addReg(Reg, RegState::Undef);
10816 } else if (X86::VR256RegClass.contains(Reg)) {
10817 // YMM#
10818 if (!ST.hasAVX())
10819 return;
10820
10821 // VPXOR is safe to use because it doesn't affect flags.
10822 BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
10823 .addReg(Reg, RegState::Undef)
10824 .addReg(Reg, RegState::Undef);
10825 } else if (X86::VR512RegClass.contains(Reg)) {
10826 // ZMM#
10827 if (!ST.hasAVX512())
10828 return;
10829
10830 // VPXORY is safe to use because it doesn't affect flags.
10831 BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
10832 .addReg(Reg, RegState::Undef)
10833 .addReg(Reg, RegState::Undef);
10834 } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
10835 X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
10836 X86::VK16RegClass.contains(Reg)) {
10837 if (!ST.hasVLX())
10838 return;
10839
10840 // KXOR is safe to use because it doesn't affect flags.
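  // The 64-bit KXORQ form needs AVX-512BW; without it, fall back to the
  // 16-bit KXORW form, which covers the mask classes handled here.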
10841 unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
10842 BuildMI(MBB, Iter, DL, get(Op), Reg)
10843 .addReg(Reg, RegState::Undef)
10844 .addReg(Reg, RegState::Undef);
10845 }
10846}
10847
10848bool X86InstrInfo::getMachineCombinerPatterns(
10849 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
10850 bool DoRegPressureReduce) const {
10851 unsigned Opc = Root.getOpcode();
10852 switch (Opc) {
10853 case X86::VPDPWSSDrr:
10854 case X86::VPDPWSSDrm:
10855 case X86::VPDPWSSDYrr:
10856 case X86::VPDPWSSDYrm: {
10857 if (!Subtarget.hasFastDPWSSD()) {
10858 Patterns.push_back(X86MachineCombinerPattern::DPWSSD);
10859 return true;
10860 }
10861 break;
10862 }
10863 case X86::VPDPWSSDZ128rr:
10864 case X86::VPDPWSSDZ128rm:
10865 case X86::VPDPWSSDZ256rr:
10866 case X86::VPDPWSSDZ256rm:
10867 case X86::VPDPWSSDZrr:
10868 case X86::VPDPWSSDZrm: {
10869 if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
10870 Patterns.push_back(X86MachineCombinerPattern::DPWSSD);
10871 return true;
10872 }
10873 break;
10874 }
10875 }
10876 return TargetInstrInfo::getMachineCombinerPatterns(Root,
10877 Patterns, DoRegPressureReduce);
10878}
10879
10880static void
10881genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII,
10882 SmallVectorImpl<MachineInstr *> &InsInstrs,
10883 SmallVectorImpl<MachineInstr *> &DelInstrs,
10884 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
10885 MachineFunction *MF = Root.getMF();
10886 MachineRegisterInfo &RegInfo = MF->getRegInfo();
10887
10888 unsigned Opc = Root.getOpcode();
10889 unsigned AddOpc = 0;
10890 unsigned MaddOpc = 0;
10891 switch (Opc) {
10892 default:
10893 assert(false && "It should not reach here");
10894 break;
10895 // vpdpwssd xmm2,xmm3,xmm1
10896 // -->
10897 // vpmaddwd xmm3,xmm3,xmm1
10898 // vpaddd xmm2,xmm2,xmm3
10899 case X86::VPDPWSSDrr:
10900 MaddOpc = X86::VPMADDWDrr;
10901 AddOpc = X86::VPADDDrr;
10902 break;
10903 case X86::VPDPWSSDrm:
10904 MaddOpc = X86::VPMADDWDrm;
10905 AddOpc = X86::VPADDDrr;
10906 break;
10907 case X86::VPDPWSSDZ128rr:
10908 MaddOpc = X86::VPMADDWDZ128rr;
10909 AddOpc = X86::VPADDDZ128rr;
10910 break;
10911 case X86::VPDPWSSDZ128rm:
10912 MaddOpc = X86::VPMADDWDZ128rm;
10913 AddOpc = X86::VPADDDZ128rr;
10914 break;
10915 // vpdpwssd ymm2,ymm3,ymm1
10916 // -->
10917 // vpmaddwd ymm3,ymm3,ymm1
10918 // vpaddd ymm2,ymm2,ymm3
10919 case X86::VPDPWSSDYrr:
10920 MaddOpc = X86::VPMADDWDYrr;
10921 AddOpc = X86::VPADDDYrr;
10922 break;
10923 case X86::VPDPWSSDYrm:
10924 MaddOpc = X86::VPMADDWDYrm;
10925 AddOpc = X86::VPADDDYrr;
10926 break;
10927 case X86::VPDPWSSDZ256rr:
10928 MaddOpc = X86::VPMADDWDZ256rr;
10929 AddOpc = X86::VPADDDZ256rr;
10930 break;
10931 case X86::VPDPWSSDZ256rm:
10932 MaddOpc = X86::VPMADDWDZ256rm;
10933 AddOpc = X86::VPADDDZ256rr;
10934 break;
10935 // vpdpwssd zmm2,zmm3,zmm1
10936 // -->
10937 // vpmaddwd zmm3,zmm3,zmm1
10938 // vpaddd zmm2,zmm2,zmm3
10939 case X86::VPDPWSSDZrr:
10940 MaddOpc = X86::VPMADDWDZrr;
10941 AddOpc = X86::VPADDDZrr;
10942 break;
10943 case X86::VPDPWSSDZrm:
10944 MaddOpc = X86::VPMADDWDZrm;
10945 AddOpc = X86::VPADDDZrr;
10946 break;
10947 }
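  // VPDPWSSD computes dst = acc + pmaddwd(src1, src2), with the accumulator
  // tied to the destination. Split it into a VPMADDWD that writes a fresh
  // virtual register (the clone below drops the tied accumulator operand) and
  // a VPADDD that adds the product back into the original accumulator.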
10948 // Create vpmaddwd.
10949 const TargetRegisterClass *RC =
10950 RegInfo.getRegClass(Root.getOperand(0).getReg());
10951 Register NewReg = RegInfo.createVirtualRegister(RC);
10952 MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
10953 Madd->setDesc(TII.get(MaddOpc));
10954 Madd->untieRegOperand(1);
10955 Madd->removeOperand(1);
10956 Madd->getOperand(0).setReg(NewReg);
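  // Record that the new virtual register is defined by the first instruction
  // in InsInstrs so the machine combiner can account for its depth.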
10957 InstrIdxForVirtReg.insert(std::make_pair(NewReg, 0));
10958 // Create vpaddd.
10959 Register DstReg = Root.getOperand(0).getReg();
10960 bool IsKill = Root.getOperand(1).isKill();
10961 MachineInstr *Add =
10962 BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
10963 .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
10964 .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
10965 InsInstrs.push_back(Madd);
10966 InsInstrs.push_back(Add);
10967 DelInstrs.push_back(&Root);
10968}
10969
10970void X86InstrInfo::genAlternativeCodeSequence(
10971 MachineInstr &Root, unsigned Pattern,
10972 SmallVectorImpl<MachineInstr *> &InsInstrs,
10973 SmallVectorImpl<MachineInstr *> &DelInstrs,
10974 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
10975 switch (Pattern) {
10976 default:
10977 // Reassociate instructions.
10978 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
10979 DelInstrs, InstrIdxForVirtReg);
10980 return;
10981 case X86MachineCombinerPattern::DPWSSD:
10982 genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
10983 InstrIdxForVirtReg);
10984 return;
10985 }
10986}
10987
10988// See also: X86DAGToDAGISel::SelectInlineAsmMemoryOperand().
10989void X86InstrInfo::getFrameIndexOperands(SmallVectorImpl<MachineOperand> &Ops,
10990 int FI) const {
10991 X86AddressMode M;
10992 M.BaseType = X86AddressMode::FrameIndexBase;
10993 M.Base.FrameIndex = FI;
10994 M.getFullAddress(Ops);
10995}
10996
10997#define GET_INSTRINFO_HELPERS
10998#include "X86GenInstrInfo.inc"
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
return SDValue()
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerDefault
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
Expand a single-def pseudo instruction to a two-addr instruction with two undef reads of the register...
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
Provides some synthesis utilities to produce sequences of values.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
#define LLVM_DEBUG(...)
Definition Debug.h:114
#define FROM_TO(FROM, TO)
cl::opt< bool > X86EnableAPXForRelocation
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static bool isLEA(unsigned Opcode)
static void addOperands(MachineInstrBuilder &MIB, ArrayRef< MachineOperand > MOs, int PtrOffset=0)
static std::optional< ParamLoadedValue > describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetRegisterInfo *TRI)
If DescribedReg overlaps with the MOVrr instruction's destination register then, if possible,...
static cl::opt< unsigned > PartialRegUpdateClearance("partial-reg-update-clearance", cl::desc("Clearance between two register writes " "for inserting XOR to avoid partial " "register update"), cl::init(64), cl::Hidden)
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI)
static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg, const X86Subtarget &Subtarget)
static bool isConvertibleLEA(MachineInstr *MI)
static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, const X86Subtarget &Subtarget)
static bool isAMXOpcode(unsigned Opc)
static int getJumpTableIndexFromReg(const MachineRegisterInfo &MRI, Register Reg)
static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII)
static int getJumpTableIndexFromAddr(const MachineInstr &MI)
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth, unsigned NewWidth, unsigned *pNewMask=nullptr)
static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne)
static unsigned getNewOpcFromTable(ArrayRef< X86TableEntry > Table, unsigned Opc)
static unsigned getStoreRegOpcode(Register SrcReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
#define FOLD_BROADCAST(SIZE)
static cl::opt< unsigned > UndefRegClearance("undef-reg-clearance", cl::desc("How many idle instructions we would like before " "certain undef register reads"), cl::init(128), cl::Hidden)
#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64)
static bool isTruncatedShiftCountForLEA(unsigned ShAmt)
Check whether the given shift count is appropriate can be represented by a LEA instruction.
static cl::opt< bool > ReMatPICStubLoad("remat-pic-stub-load", cl::desc("Re-materialize load from stub in PIC mode"), cl::init(false), cl::Hidden)
static SmallVector< MachineMemOperand *, 2 > extractLoadMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static MachineInstr * fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII)
static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx)
static bool canConvert2Copy(unsigned Opc)
static cl::opt< bool > NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions"), cl::Hidden)
static bool expandNOVLXStore(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx)
static bool isX87Reg(Register Reg)
Return true if the Reg is X87 register.
static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, Register Reg)
Expand a single-def pseudo instruction to a two-addr instruction with two k0 reads.
static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes)
#define VPERM_CASES_BROADCAST(Suffix)
static std::pair< X86::CondCode, unsigned > isUseDefConvertible(const MachineInstr &MI)
Check whether the use can be converted to remove a comparison against zero.
static bool findRedundantFlagInstr(MachineInstr &CmpInstr, MachineInstr &CmpValDefInstr, const MachineRegisterInfo *MRI, MachineInstr **AndInstr, const TargetRegisterInfo *TRI, const X86Subtarget &ST, bool &NoSignFlag, bool &ClearsOverflowFlag)
static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
static unsigned getLoadRegOpcode(Register DestReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold=false)
static MachineInstr * makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI)
#define GET_ND_IF_ENABLED(OPC)
static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI, const TargetInstrInfo &TII, bool HasAVX)
static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, bool ForLoadFold=false)
Return true for all instructions that only update the first 32 or 64-bits of the destination register...
#define CASE_NF(OP)
static const uint16_t * lookupAVX512(unsigned opcode, unsigned domain, ArrayRef< uint16_t[4]> Table)
static unsigned getLoadStoreRegOpcode(Register Reg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI, bool Load)
#define VPERM_CASES(Suffix)
#define FROM_TO_SIZE(A, B, S)
static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, bool &ClearsOverflowFlag)
Check whether the definition can be converted to remove a comparison against zero.
static MachineInstr * fuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII, int PtrOffset=0)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode)
static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static MachineBasicBlock * getFallThroughMBB(MachineBasicBlock *MBB, MachineBasicBlock *TBB)
static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineInstr &UserMI, const MachineFunction &MF)
Check if LoadMI is a partial register load that we can't fold into MI because the latter uses content...
static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI)
static bool isHReg(Register Reg)
Test if the given register is a physical h register.
static cl::opt< bool > PrintFailedFusing("print-failed-fuse-candidates", cl::desc("Print instructions that the allocator wants to" " fuse, but the X86 backend currently can't"), cl::Hidden)
static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx)
static void genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
#define CASE_ND(OP)
static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
This determines which of three possible cases of a three source commute the source indexes correspond...
static bool isFrameStoreOpcode(int Opcode, TypeSize &MemBytes)
static unsigned getTruncatedShiftCount(const MachineInstr &MI, unsigned ShiftAmtOperandIdx)
Check whether the shift count for a machine operand is non-zero.
static SmallVector< MachineMemOperand *, 2 > extractStoreMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static unsigned getBroadcastOpcode(const X86FoldTableEntry *I, const TargetRegisterClass *RC, const X86Subtarget &STI)
static unsigned convertALUrr2ALUri(unsigned Opc)
Convert an ALUrr opcode to corresponding ALUri opcode.
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI)
Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
static bool isCommutableVPERMV3Instruction(unsigned Opcode)
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:206
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:209
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:219
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
iterator end() const
Definition ArrayRef.h:136
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:681
@ ICMP_SLT
signed less than
Definition InstrTypes.h:707
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:708
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:684
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:693
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:682
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:683
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:702
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:701
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:705
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:692
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:686
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:689
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:703
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:690
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:685
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:687
@ ICMP_NE
not equal
Definition InstrTypes.h:700
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:706
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:694
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:704
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:691
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:688
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DWARF expression.
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
static LLVM_ABI DIExpression * appendExt(const DIExpression *Expr, unsigned FromSize, unsigned ToSize, bool Signed)
Append a zero- or sign-extension to Expr.
A debug info location.
Definition DebugLoc.h:124
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:222
DomTreeNodeBase< NodeT > * getRootNode()
getRootNode - This returns the entry node for the CFG of the function.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
LiveInterval - This class represents the liveness of a register, or stack slot.
SlotIndex InsertMachineInstrInMaps(MachineInstr &MI)
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
A set of physical registers with utility functions to track liveness when walking backward/forward th...
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
static LocationSize precise(uint64_t Value)
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int64_t Adjustment, SMLoc Loc={})
.cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but Offset is a relative value that is added/subt...
Definition MCDwarf.h:608
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
void setOpcode(unsigned Op)
Definition MCInst.h:201
Describe properties that are true of each instruction in the target description file.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:87
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
Set of metadata that should be preserved when using BuildMI().
SimpleValueType SimpleTy
MachineInstrBundleIterator< const MachineInstr > const_iterator
void push_back(MachineInstr *MI)
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
This class is a data container for one entry in a MachineConstantPool.
union llvm::MachineConstantPoolEntry::@004270020304201266316354007027341142157160323045 Val
The constant itself.
bool isMachineConstantPoolEntry() const
isMachineConstantPoolEntry - Return true if the MachineConstantPoolEntry is indeed a target specific ...
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MCSymbol * getPICBaseSymbol() const
getPICBaseSymbol - Return a function-local symbol to represent the PIC base.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
bool needsFrameMoves() const
True if this function needs frame moves for debug or exceptions.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineBasicBlock & front() const
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
mop_iterator operands_begin()
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isImplicitDef() const
const MachineBasicBlock * getParent() const
void dropDebugNumber()
Drop any variable location debugging information associated with this instruction.
LLVM_ABI void setPreInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just prior to the instruction itself.
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Retuns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI unsigned getNumExplicitDefs() const
Returns the number of non-implicit definitions.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
void setFlag(MIFlag Flag)
Set a MI flag.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
unsigned getNumDefs() const
Returns the total number of definitions.
void setDebugLoc(DebugLoc DL)
Replace current source information with new such.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImplicit(bool Val=true)
void setImm(int64_t immVal)
int64_t getImm() const
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
bool isCPI() const
isCPI - Tests if this is a MO_ConstantPoolIndex operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
bool isJTI() const
isJTI - Tests if this is a MO_JumpTableIndex operand.
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateCPI(unsigned Idx, int Offset, unsigned TargetFlags=0)
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isValid() const
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:74
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
MachineFunction & getMachineFunction() const
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getBaseIndex() const
Returns the base index for associated with this index.
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
virtual bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const
Returns true iff the routine could find two commutable operands in the given machine instruction.
virtual const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum, const TargetRegisterInfo *TRI) const
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
virtual bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const
Return true when \P Inst has reassociable operands in the same \P MBB.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
bool isPositionIndependent() const
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
virtual const TargetFrameLowering * getFrameLowering() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getZero()
Definition TypeSize.h:349
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI Type * getFP128Ty(LLVMContext &C)
Definition Type.cpp:290
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:283
SlotIndex def
The index of the defining instruction.
LLVM Value Representation.
Definition Value.h:75
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst, MachineInstr::MIFlag Flag=MachineInstr::NoFlags) const
Wraps up getting a CFI index and building a MachineInstr for it.
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void getFrameIndexOperands(SmallVectorImpl< MachineOperand > &Ops, int FI) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
Check if there exists an earlier instruction that operates on the same source operands and sets eflag...
bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
Overrides the isSchedulingBoundary from Codegen/TargetInstrInfo.cpp to make it capable of identifying...
MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, outliner::Candidate &C) const override
void replaceBranchWithTailCall(MachineBasicBlock &MBB, SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex=nullptr) const override
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override
Returns true iff the routine could find two commutable operands in the given machine instruction.
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override
X86InstrInfo(const X86Subtarget &STI)
static bool isDataInvariantLoad(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value l...
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override
bool isFunctionSafeToOutlineFrom(MachineFunction &MF, bool OutlineFromLinkOnceODRs) const override
const X86RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
bool hasCommutePreference(MachineInstr &MI, bool &Commute) const override
Returns true if we have preference on the operands order in MI, the commute decision is returned in C...
bool hasLiveCondCodeDef(MachineInstr &MI) const
True if MI has a condition code def, e.g.
std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const override
bool canMakeTailCallConditional(SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl< MachineInstr * > &NewMIs) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
convertToThreeAddress - This method must be implemented by targets that set the M_CONVERTIBLE_TO_3_AD...
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool expandPostRAPseudo(MachineInstr &MI) const override
bool isAssociativeAndCommutative(const MachineInstr &Inst, bool Invert) const override
MCInst getNop() const override
Return the noop instruction to use for a noop.
outliner::InstrType getOutliningTypeImpl(const MachineModuleInfo &MMI, MachineBasicBlock::iterator &MIT, unsigned Flags) const override
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override
This is a used by the pre-regalloc scheduler to determine (in conjunction with areLoadsFromSameBasePt...
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
Fold a load or store of the specified stack slot into the specified machine instruction for the speci...
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isStoreToStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
bool isUnconditionalTailCall(const MachineInstr &MI) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
std::optional< std::unique_ptr< outliner::OutlinedFunction > > getOutliningCandidateInfo(const MachineModuleInfo &MMI, std::vector< outliner::Candidate > &RepeatedSequenceLocs, unsigned MinRepeats) const override
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, Register &NewSrc, unsigned &NewSrcSubReg, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV, LiveIntervals *LIS) const
Given an operand within a MachineInstr, insert preceding code to put it into the right format for a p...
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isLoadFromStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const
int getSPAdjust(const MachineInstr &MI) const override
getSPAdjust - This returns the stack pointer adjustment made by this instruction.
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isReMaterializableImpl(const MachineInstr &MI) const override
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the the global base register value.
int getJumpTableIndex(const MachineInstr &MI) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const override
This is an architecture-specific helper function of reassociateOps.
std::pair< uint16_t, uint16_t > getExecutionDomain(const MachineInstr &MI) const override
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
isCoalescableExtInstr - Return true if the instruction is a "coalescable" extension instruction.
void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Opc, Register Reg, int FrameIdx, bool isKill=false) const
void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg) const override
When getMachineCombinerPatterns() finds potential patterns, this function generates the instructions ...
bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, TargetInstrInfo::MachineBranchPredicate &MBP, bool AllowModify=false) const override
static bool isDataInvariant(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value o...
unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before certain undef register...
void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
void buildClearRegister(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator Iter, DebugLoc &DL, bool AllowSideEffects=true) const override
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
int64_t getFrameAdjustment(const MachineInstr &I) const
Returns the stack pointer adjustment that happens inside the frame setup..destroy sequence (e....
bool hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr &DefMI, unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const override
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override
uint16_t getExecutionDomainCustom(const MachineInstr &MI) const
bool isHighLatencyDef(int opc) const override
void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const override
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const override
foldImmediate - 'Reg' is known to be defined by a move immediate instruction, try to fold the immedia...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, const X86InstrFMA3Group &FMA3Group) const
Returns an adjusted FMA opcode that must be used in FMA instruction that performs the same computatio...
bool preservesZeroValueInReg(const MachineInstr *MI, const Register NullValueReg, const TargetRegisterInfo *TRI) const override
unsigned getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before a partial register upd...
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
const TargetRegisterClass * constrainRegClassToNonRex2(const TargetRegisterClass *RC) const
bool isPICStyleGOT() const
const X86InstrInfo * getInstrInfo() const override
bool hasAVX512() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
const X86FrameLowering * getFrameLowering() const override
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
X86II - This namespace holds all of the target specific flags that instruction info tracks.
bool isKMergeMasked(uint64_t TSFlags)
bool hasNewDataDest(uint64_t TSFlags)
@ MO_GOT_ABSOLUTE_ADDRESS
MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a relocation of: SYMBOL_LABEL + [.
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_PIC_BASE_OFFSET
MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the immediate should get the value of th...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ EVEX
EVEX - Specifies that this instruction use EVEX form which provides syntax support up to 32 512-bit r...
@ SSEDomainShift
Execution domain for SSE instructions.
bool canUseApxExtendedReg(const MCInstrDesc &Desc)
bool isPseudo(uint64_t TSFlags)
bool isKMasked(uint64_t TSFlags)
int getMemoryOperandNo(uint64_t TSFlags)
unsigned getOperandBias(const MCInstrDesc &Desc)
Compute whether all of the def operands are repeated in the uses and therefore should be skipped.
Define some predicates that are used for node matching.
CondCode getCondFromBranch(const MachineInstr &MI)
CondCode getCondFromCFCMov(const MachineInstr &MI)
@ LAST_VALID_COND
Definition X86BaseInfo.h:94
CondCode getCondFromMI(const MachineInstr &MI)
Return the condition code of the instruction.
int getFirstAddrOperandIdx(const MachineInstr &MI)
Return the index of the instruction's first address operand, if it has a memory reference,...
@ AddrNumOperands
Definition X86BaseInfo.h:36
unsigned getSwappedVCMPImm(unsigned Imm)
Get the VCMP immediate if the opcodes are swapped.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
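A minimal sketch of how the two condition-code helpers above are typically combined, assuming compilation inside the X86 backend where X86InstrInfo.h resolves; the helper name below is illustrative and not part of this file:

#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineInstr.h"

// Return the inverse of MI's condition code, or COND_INVALID if MI does not
// carry one. Hypothetical helper, shown for illustration only.
static llvm::X86::CondCode getInvertedCond(const llvm::MachineInstr &MI) {
  llvm::X86::CondCode CC = llvm::X86::getCondFromMI(MI);
  if (CC == llvm::X86::COND_INVALID)
    return llvm::X86::COND_INVALID;
  return llvm::X86::GetOppositeBranchCondition(CC);
}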
unsigned getSwappedVPCOMImm(unsigned Imm)
Get the VPCOM immediate if the opcodes are swapped.
bool isX87Instruction(MachineInstr &MI)
Check if the instruction is an X87 instruction.
unsigned getNonNDVariant(unsigned Opc)
unsigned getVPCMPImmForCond(ISD::CondCode CC)
Get the VPCMP immediate for the given condition.
std::pair< CondCode, bool > getX86ConditionCode(CmpInst::Predicate Predicate)
Return a pair of condition code for the given predicate and whether the instruction operands should b...
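A small sketch of the predicate-to-condition-code mapping, again assuming the backend headers are visible; the function and variable names are illustrative:

#include "X86InstrInfo.h"
#include "llvm/IR/InstrTypes.h"

// Map a signed less-than IR predicate to an X86 condition code. The second
// member of the returned pair asks the caller to swap the compare operands.
static llvm::X86::CondCode condCodeForSignedLT(bool &NeedSwap) {
  auto [CC, Swap] = llvm::X86::getX86ConditionCode(llvm::CmpInst::ICMP_SLT);
  NeedSwap = Swap; // expected to be false for ICMP_SLT
  return CC;       // expected to be X86::COND_L
}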
CondCode getCondFromSETCC(const MachineInstr &MI)
unsigned getSwappedVPCMPImm(unsigned Imm)
Get the VPCMP immediate if the opcodes are swapped.
CondCode getCondFromCCMP(const MachineInstr &MI)
int getCCMPCondFlagsFromCondCode(CondCode CC)
int getCondSrcNoFromDesc(const MCInstrDesc &MCID)
Return the source operand # for condition code by MCID.
const Constant * getConstantFromPool(const MachineInstr &MI, unsigned OpNo)
Find any constant pool entry associated with a specific instruction operand.
unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand=false, bool HasNDD=false)
Return a cmov opcode for the given register size in bytes, and operand type.
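For example, selecting the register-register CMOV form for a 4-byte value might look like the sketch below; the opcode named in the comment is the expected result, not something verified here:

#include "X86InstrInfo.h"

// Select a CMOV opcode for a 32-bit register operand; passing true for
// HasMemoryOperand would select the load form instead.
static unsigned select32BitCMov() {
  return llvm::X86::getCMovOpcode(/*RegBytes=*/4,
                                  /*HasMemoryOperand=*/false,
                                  /*HasNDD=*/false); // typically X86::CMOV32rr
}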
unsigned getNFVariant(unsigned Opc)
unsigned getVectorRegisterWidth(const MCOperandInfo &Info)
Get the width of the vector register operand.
CondCode getCondFromCMov(const MachineInstr &MI)
initializer< Ty > init(const Ty &Val)
InstrType
Represents how an instruction should be mapped by the outliner.
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
static bool isAddMemInstrWithRelocation(const MachineInstr &MI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
static bool isMem(const MachineInstr &MI, unsigned Op)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition Alignment.h:134
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
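A brief sketch of the sub/super-register query; Size is the register width in bits, and the include set is an assumption about where the declaration and the register enum are visible:

#include "X86RegisterInfo.h"              // declaration assumed visible from here
#include "MCTargetDesc/X86MCTargetDesc.h" // register enum (X86::EAX, ...)

static void subSuperRegSketch() {
  llvm::MCRegister AX  = llvm::getX86SubSuperRegister(llvm::X86::EAX, 16); // X86::AX
  llvm::MCRegister RAX = llvm::getX86SubSuperRegister(llvm::X86::EAX, 64); // X86::RAX
  (void)AX;
  (void)RAX;
}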
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
FunctionPass * createX86GlobalBaseRegPass()
This pass initializes a global base register for PIC on x86-32.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
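The STLExtras range helpers listed in this index follow one pattern: take a range where the standard library takes an iterator pair. A minimal, self-contained sketch with arbitrary values:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

static void rangeHelpersSketch() {
  llvm::SmallVector<int, 8> Vals = {1, 2, 3, 4};

  // all_of takes a range plus predicate instead of begin/end iterators.
  bool AllPositive = llvm::all_of(Vals, [](int V) { return V > 0; });

  // drop_begin skips the first N elements (N defaults to 1).
  int SumOfTail = 0;
  for (int V : llvm::drop_begin(Vals))
    SumOfTail += V; // 2 + 3 + 4

  // append_range copies another range onto the back of a container.
  llvm::SmallVector<int, 8> Copy;
  llvm::append_range(Copy, Vals);

  (void)AllPositive;
  (void)SumOfTail;
}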
static const MachineInstrBuilder & addRegReg(const MachineInstrBuilder &MIB, Register Reg1, bool isKill1, unsigned SubReg1, Register Reg2, bool isKill2, unsigned SubReg2)
addRegReg - This function is used to add a memory reference of the form: [Reg + Reg].
unsigned getDeadRegState(bool B)
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
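The builder helpers above are usually chained onto BuildMI. Below is a sketch of reloading a 32-bit value from a stack slot; MBB, InsertPt, DL, TII, DestReg and FI are assumed to be supplied by the caller, and the include set is an assumption about the backend's header layout:

#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h" // opcode enum (X86::MOV32rm)
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

// Emit: DestReg = load32 [frame-index FI]. Illustrative helper only.
static void emitReloadFromSlot(llvm::MachineBasicBlock &MBB,
                               llvm::MachineBasicBlock::iterator InsertPt,
                               const llvm::DebugLoc &DL,
                               const llvm::X86InstrInfo &TII,
                               llvm::Register DestReg, int FI) {
  llvm::addFrameReference(
      llvm::BuildMI(MBB, InsertPt, DL, TII.get(llvm::X86::MOV32rm), DestReg),
      FI);
}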
FunctionPass * createCleanupLocalDynamicTLSPass()
This pass combines multiple accesses to local-dynamic TLS variables so that the TLS base address for ...
Op::Description Desc
const X86FoldTableEntry * lookupBroadcastFoldTable(unsigned RegOp, unsigned OpNum)
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
const X86InstrFMA3Group * getFMA3Group(unsigned Opcode, uint64_t TSFlags)
Returns a reference to the group of FMA3 opcodes that includes the given Opcode.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
const X86FoldTableEntry * lookupTwoAddrFoldTable(unsigned RegOp)
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1900
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
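These bit-width predicates are the usual way immediate ranges are checked; a trivial sketch with arbitrary values:

#include "llvm/Support/MathExtras.h"

static void immediateRangeChecks() {
  bool FitsDisp8  = llvm::isInt<8>(-100);    // true, within [-128, 127]
  bool FitsDisp8b = llvm::isInt<8>(200);     // false, exceeds [-128, 127]
  bool FitsU16    = llvm::isUInt<16>(65535); // true
  (void)FitsDisp8;
  (void)FitsDisp8b;
  (void)FitsU16;
}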
DomTreeNodeBase< MachineBasicBlock > MachineDomTreeNode
static bool isMemInstrWithGOTPCREL(const MachineInstr &MI)
static const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset)
unsigned getUndefRegState(bool B)
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
unsigned getDefRegState(bool B)
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1974
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
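A short sketch of the alignment helpers, with arbitrary values:

#include "llvm/Support/Alignment.h"

static void alignmentSketch() {
  llvm::Align A(16);
  uint64_t Rounded = llvm::alignTo(40, A); // 48, next multiple of 16
  bool Ok = llvm::isAligned(A, Rounded);   // true
  (void)Rounded;
  (void)Ok;
}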
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
const X86FoldTableEntry * lookupUnfoldTable(unsigned MemOp)
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
bool matchBroadcastSize(const X86FoldTableEntry &Entry, unsigned BroadcastBits)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const X86FoldTableEntry * lookupFoldTable(unsigned RegOp, unsigned OpNum)
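A hedged sketch of a fold-table query: ask whether ADD32rr can have its second source operand folded from memory. The opcode is only an example, the include set is an assumption, and a null result simply means the fold is not in the table:

#include "X86InstrFoldTables.h"
#include "MCTargetDesc/X86MCTargetDesc.h" // opcode enum (X86::ADD32rr)

static bool canFoldAdd32rrOperand2() {
  const llvm::X86FoldTableEntry *Entry =
      llvm::lookupFoldTable(llvm::X86::ADD32rr, /*OpNum=*/2);
  return Entry != nullptr; // non-null: a memory form exists for this operand
}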
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
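countr_zero and popcount often appear together when decoding masks; a tiny sketch:

#include "llvm/ADT/bit.h"

static void bitUtilsSketch() {
  unsigned Mask = 0b101000u;
  int TrailingZeros = llvm::countr_zero(Mask); // 3
  int SetBits = llvm::popcount(Mask);          // 2
  (void)TrailingZeros;
  (void)SetBits;
}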
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
Used to describe an addressing mode, similar to ExtAddrMode in CodeGenPrepare.
This represents a simple continuous liveness interval for a value.
std::vector< MachineInstr * > Kills
Kills - List of MachineInstruction's which are the last use of this virtual register (kill it) in the...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
X86AddressMode - This struct holds a generalized full x86 address mode.
enum llvm::X86AddressMode::BaseType (unnamed enum)
This class is used to group {132, 213, 231} forms of FMA opcodes together.
unsigned get213Opcode() const
Returns the 213 form of FMA opcode.
unsigned get231Opcode() const
Returns the 231 form of FMA opcode.
bool isIntrinsic() const
Returns true iff the group of FMA opcodes holds intrinsic opcodes.
unsigned get132Opcode() const
Returns the 132 form of FMA opcode.
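Putting the FMA3 group API together: a sketch that maps a known FMA3 opcode to its 231 form, assuming the caller supplies the opcode and its TSFlags (normally both taken from the instruction descriptor); the helper name is illustrative:

#include "X86InstrFMA3Info.h"

static unsigned to231Form(unsigned Opcode, uint64_t TSFlags) {
  const llvm::X86InstrFMA3Group *Group = llvm::getFMA3Group(Opcode, TSFlags);
  if (!Group)
    return Opcode;              // not an FMA3 opcode the tables know about
  return Group->get231Opcode(); // same computation, 231 operand order
}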
An individual sequence of instructions to be replaced with a call to an outlined function.
The information necessary to create an outlined function for some class of candidate.