LLVM 22.0.0git
X86InstrInfo.cpp
Go to the documentation of this file.
1//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the X86 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "X86InstrInfo.h"
14#include "X86.h"
15#include "X86InstrBuilder.h"
16#include "X86InstrFoldTables.h"
18#include "X86Subtarget.h"
19#include "X86TargetMachine.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/Sequence.h"
36#include "llvm/IR/Function.h"
37#include "llvm/IR/InstrTypes.h"
38#include "llvm/IR/Module.h"
39#include "llvm/MC/MCAsmInfo.h"
40#include "llvm/MC/MCExpr.h"
41#include "llvm/MC/MCInst.h"
43#include "llvm/Support/Debug.h"
47#include <atomic>
48#include <optional>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "x86-instr-info"
53
54#define GET_INSTRINFO_CTOR_DTOR
55#include "X86GenInstrInfo.inc"
56
58
59static cl::opt<bool>
60 NoFusing("disable-spill-fusing",
61 cl::desc("Disable fusing of spill code into instructions"),
63static cl::opt<bool>
64 PrintFailedFusing("print-failed-fuse-candidates",
65 cl::desc("Print instructions that the allocator wants to"
66 " fuse, but the X86 backend currently can't"),
68static cl::opt<bool>
69 ReMatPICStubLoad("remat-pic-stub-load",
70 cl::desc("Re-materialize load from stub in PIC mode"),
71 cl::init(false), cl::Hidden);
73 PartialRegUpdateClearance("partial-reg-update-clearance",
74 cl::desc("Clearance between two register writes "
75 "for inserting XOR to avoid partial "
76 "register update"),
77 cl::init(64), cl::Hidden);
79 "undef-reg-clearance",
80 cl::desc("How many idle instructions we would like before "
81 "certain undef register reads"),
82 cl::init(128), cl::Hidden);
83
84// Pin the vtable to this file.
85void X86InstrInfo::anchor() {}
86
88 : X86GenInstrInfo(STI, RI,
89 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
90 : X86::ADJCALLSTACKDOWN32),
91 (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
92 : X86::ADJCALLSTACKUP32),
93 X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
94 Subtarget(STI), RI(STI.getTargetTriple()) {}
95
97 unsigned OpNum) const {
98 auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum);
99 // If the target does not have egpr, then r16-r31 will be resereved for all
100 // instructions.
101 if (!RC || !Subtarget.hasEGPR())
102 return RC;
103
105 return RC;
106
107 const X86RegisterInfo *RI = Subtarget.getRegisterInfo();
108 return RI->constrainRegClassToNonRex2(RC);
109}
110
112 Register &SrcReg, Register &DstReg,
113 unsigned &SubIdx) const {
114 switch (MI.getOpcode()) {
115 default:
116 break;
117 case X86::MOVSX16rr8:
118 case X86::MOVZX16rr8:
119 case X86::MOVSX32rr8:
120 case X86::MOVZX32rr8:
121 case X86::MOVSX64rr8:
122 if (!Subtarget.is64Bit())
123 // It's not always legal to reference the low 8-bit of the larger
124 // register in 32-bit mode.
125 return false;
126 [[fallthrough]];
127 case X86::MOVSX32rr16:
128 case X86::MOVZX32rr16:
129 case X86::MOVSX64rr16:
130 case X86::MOVSX64rr32: {
131 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
132 // Be conservative.
133 return false;
134 SrcReg = MI.getOperand(1).getReg();
135 DstReg = MI.getOperand(0).getReg();
136 switch (MI.getOpcode()) {
137 default:
138 llvm_unreachable("Unreachable!");
139 case X86::MOVSX16rr8:
140 case X86::MOVZX16rr8:
141 case X86::MOVSX32rr8:
142 case X86::MOVZX32rr8:
143 case X86::MOVSX64rr8:
144 SubIdx = X86::sub_8bit;
145 break;
146 case X86::MOVSX32rr16:
147 case X86::MOVZX32rr16:
148 case X86::MOVSX64rr16:
149 SubIdx = X86::sub_16bit;
150 break;
151 case X86::MOVSX64rr32:
152 SubIdx = X86::sub_32bit;
153 break;
154 }
155 return true;
156 }
157 }
158 return false;
159}
160
162 if (MI.mayLoad() || MI.mayStore())
163 return false;
164
165 // Some target-independent operations that trivially lower to data-invariant
166 // instructions.
167 if (MI.isCopyLike() || MI.isInsertSubreg())
168 return true;
169
170 unsigned Opcode = MI.getOpcode();
171 using namespace X86;
172 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
173 // However, they set flags and are perhaps the most surprisingly constant
174 // time operations so we call them out here separately.
175 if (isIMUL(Opcode))
176 return true;
177 // Bit scanning and counting instructions that are somewhat surprisingly
178 // constant time as they scan across bits and do other fairly complex
179 // operations like popcnt, but are believed to be constant time on x86.
180 // However, these set flags.
181 if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
182 isTZCNT(Opcode))
183 return true;
184 // Bit manipulation instructions are effectively combinations of basic
185 // arithmetic ops, and should still execute in constant time. These also
186 // set flags.
187 if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
188 isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
189 isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
190 isTZMSK(Opcode))
191 return true;
192 // Bit extracting and clearing instructions should execute in constant time,
193 // and set flags.
194 if (isBEXTR(Opcode) || isBZHI(Opcode))
195 return true;
196 // Shift and rotate.
197 if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
198 isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
199 return true;
200 // Basic arithmetic is constant time on the input but does set flags.
201 if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
202 isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
203 return true;
204 // Arithmetic with just 32-bit and 64-bit variants and no immediates.
205 if (isANDN(Opcode))
206 return true;
207 // Unary arithmetic operations.
208 if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
209 return true;
210 // Unlike other arithmetic, NOT doesn't set EFLAGS.
211 if (isNOT(Opcode))
212 return true;
213 // Various move instructions used to zero or sign extend things. Note that we
214 // intentionally don't support the _NOREX variants as we can't handle that
215 // register constraint anyways.
216 if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
217 return true;
218 // Arithmetic instructions that are both constant time and don't set flags.
219 if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
220 return true;
221 // LEA doesn't actually access memory, and its arithmetic is constant time.
222 if (isLEA(Opcode))
223 return true;
224 // By default, assume that the instruction is not data invariant.
225 return false;
226}
227
229 switch (MI.getOpcode()) {
230 default:
231 // By default, assume that the load will immediately leak.
232 return false;
233
234 // On x86 it is believed that imul is constant time w.r.t. the loaded data.
235 // However, they set flags and are perhaps the most surprisingly constant
236 // time operations so we call them out here separately.
237 case X86::IMUL16rm:
238 case X86::IMUL16rmi:
239 case X86::IMUL32rm:
240 case X86::IMUL32rmi:
241 case X86::IMUL64rm:
242 case X86::IMUL64rmi32:
243
244 // Bit scanning and counting instructions that are somewhat surprisingly
245 // constant time as they scan across bits and do other fairly complex
246 // operations like popcnt, but are believed to be constant time on x86.
247 // However, these set flags.
248 case X86::BSF16rm:
249 case X86::BSF32rm:
250 case X86::BSF64rm:
251 case X86::BSR16rm:
252 case X86::BSR32rm:
253 case X86::BSR64rm:
254 case X86::LZCNT16rm:
255 case X86::LZCNT32rm:
256 case X86::LZCNT64rm:
257 case X86::POPCNT16rm:
258 case X86::POPCNT32rm:
259 case X86::POPCNT64rm:
260 case X86::TZCNT16rm:
261 case X86::TZCNT32rm:
262 case X86::TZCNT64rm:
263
264 // Bit manipulation instructions are effectively combinations of basic
265 // arithmetic ops, and should still execute in constant time. These also
266 // set flags.
267 case X86::BLCFILL32rm:
268 case X86::BLCFILL64rm:
269 case X86::BLCI32rm:
270 case X86::BLCI64rm:
271 case X86::BLCIC32rm:
272 case X86::BLCIC64rm:
273 case X86::BLCMSK32rm:
274 case X86::BLCMSK64rm:
275 case X86::BLCS32rm:
276 case X86::BLCS64rm:
277 case X86::BLSFILL32rm:
278 case X86::BLSFILL64rm:
279 case X86::BLSI32rm:
280 case X86::BLSI64rm:
281 case X86::BLSIC32rm:
282 case X86::BLSIC64rm:
283 case X86::BLSMSK32rm:
284 case X86::BLSMSK64rm:
285 case X86::BLSR32rm:
286 case X86::BLSR64rm:
287 case X86::TZMSK32rm:
288 case X86::TZMSK64rm:
289
290 // Bit extracting and clearing instructions should execute in constant time,
291 // and set flags.
292 case X86::BEXTR32rm:
293 case X86::BEXTR64rm:
294 case X86::BEXTRI32mi:
295 case X86::BEXTRI64mi:
296 case X86::BZHI32rm:
297 case X86::BZHI64rm:
298
299 // Basic arithmetic is constant time on the input but does set flags.
300 case X86::ADC8rm:
301 case X86::ADC16rm:
302 case X86::ADC32rm:
303 case X86::ADC64rm:
304 case X86::ADD8rm:
305 case X86::ADD16rm:
306 case X86::ADD32rm:
307 case X86::ADD64rm:
308 case X86::AND8rm:
309 case X86::AND16rm:
310 case X86::AND32rm:
311 case X86::AND64rm:
312 case X86::ANDN32rm:
313 case X86::ANDN64rm:
314 case X86::OR8rm:
315 case X86::OR16rm:
316 case X86::OR32rm:
317 case X86::OR64rm:
318 case X86::SBB8rm:
319 case X86::SBB16rm:
320 case X86::SBB32rm:
321 case X86::SBB64rm:
322 case X86::SUB8rm:
323 case X86::SUB16rm:
324 case X86::SUB32rm:
325 case X86::SUB64rm:
326 case X86::XOR8rm:
327 case X86::XOR16rm:
328 case X86::XOR32rm:
329 case X86::XOR64rm:
330
331 // Integer multiply w/o affecting flags is still believed to be constant
332 // time on x86. Called out separately as this is among the most surprising
333 // instructions to exhibit that behavior.
334 case X86::MULX32rm:
335 case X86::MULX64rm:
336
337 // Arithmetic instructions that are both constant time and don't set flags.
338 case X86::RORX32mi:
339 case X86::RORX64mi:
340 case X86::SARX32rm:
341 case X86::SARX64rm:
342 case X86::SHLX32rm:
343 case X86::SHLX64rm:
344 case X86::SHRX32rm:
345 case X86::SHRX64rm:
346
347 // Conversions are believed to be constant time and don't set flags.
348 case X86::CVTTSD2SI64rm:
349 case X86::VCVTTSD2SI64rm:
350 case X86::VCVTTSD2SI64Zrm:
351 case X86::CVTTSD2SIrm:
352 case X86::VCVTTSD2SIrm:
353 case X86::VCVTTSD2SIZrm:
354 case X86::CVTTSS2SI64rm:
355 case X86::VCVTTSS2SI64rm:
356 case X86::VCVTTSS2SI64Zrm:
357 case X86::CVTTSS2SIrm:
358 case X86::VCVTTSS2SIrm:
359 case X86::VCVTTSS2SIZrm:
360 case X86::CVTSI2SDrm:
361 case X86::VCVTSI2SDrm:
362 case X86::VCVTSI2SDZrm:
363 case X86::CVTSI2SSrm:
364 case X86::VCVTSI2SSrm:
365 case X86::VCVTSI2SSZrm:
366 case X86::CVTSI642SDrm:
367 case X86::VCVTSI642SDrm:
368 case X86::VCVTSI642SDZrm:
369 case X86::CVTSI642SSrm:
370 case X86::VCVTSI642SSrm:
371 case X86::VCVTSI642SSZrm:
372 case X86::CVTSS2SDrm:
373 case X86::VCVTSS2SDrm:
374 case X86::VCVTSS2SDZrm:
375 case X86::CVTSD2SSrm:
376 case X86::VCVTSD2SSrm:
377 case X86::VCVTSD2SSZrm:
378 // AVX512 added unsigned integer conversions.
379 case X86::VCVTTSD2USI64Zrm:
380 case X86::VCVTTSD2USIZrm:
381 case X86::VCVTTSS2USI64Zrm:
382 case X86::VCVTTSS2USIZrm:
383 case X86::VCVTUSI2SDZrm:
384 case X86::VCVTUSI642SDZrm:
385 case X86::VCVTUSI2SSZrm:
386 case X86::VCVTUSI642SSZrm:
387
388 // Loads to register don't set flags.
389 case X86::MOV8rm:
390 case X86::MOV8rm_NOREX:
391 case X86::MOV16rm:
392 case X86::MOV32rm:
393 case X86::MOV64rm:
394 case X86::MOVSX16rm8:
395 case X86::MOVSX32rm16:
396 case X86::MOVSX32rm8:
397 case X86::MOVSX32rm8_NOREX:
398 case X86::MOVSX64rm16:
399 case X86::MOVSX64rm32:
400 case X86::MOVSX64rm8:
401 case X86::MOVZX16rm8:
402 case X86::MOVZX32rm16:
403 case X86::MOVZX32rm8:
404 case X86::MOVZX32rm8_NOREX:
405 case X86::MOVZX64rm16:
406 case X86::MOVZX64rm8:
407 return true;
408 }
409}
410
412 const MachineFunction *MF = MI.getParent()->getParent();
414
415 if (isFrameInstr(MI)) {
416 int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
417 SPAdj -= getFrameAdjustment(MI);
418 if (!isFrameSetup(MI))
419 SPAdj = -SPAdj;
420 return SPAdj;
421 }
422
423 // To know whether a call adjusts the stack, we need information
424 // that is bound to the following ADJCALLSTACKUP pseudo.
425 // Look for the next ADJCALLSTACKUP that follows the call.
426 if (MI.isCall()) {
427 const MachineBasicBlock *MBB = MI.getParent();
429 for (auto E = MBB->end(); I != E; ++I) {
430 if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
431 break;
432 }
433
434 // If we could not find a frame destroy opcode, then it has already
435 // been simplified, so we don't care.
436 if (I->getOpcode() != getCallFrameDestroyOpcode())
437 return 0;
438
439 return -(I->getOperand(1).getImm());
440 }
441
442 // Currently handle only PUSHes we can reasonably expect to see
443 // in call sequences
444 switch (MI.getOpcode()) {
445 default:
446 return 0;
447 case X86::PUSH32r:
448 case X86::PUSH32rmm:
449 case X86::PUSH32rmr:
450 case X86::PUSH32i:
451 return 4;
452 case X86::PUSH64r:
453 case X86::PUSH64rmm:
454 case X86::PUSH64rmr:
455 case X86::PUSH64i32:
456 return 8;
457 }
458}
459
460/// Return true and the FrameIndex if the specified
461/// operand and follow operands form a reference to the stack frame.
462bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
463 int &FrameIndex) const {
464 if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
465 MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
466 MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
467 MI.getOperand(Op + X86::AddrDisp).isImm() &&
468 MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
469 MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
470 MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
471 FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
472 return true;
473 }
474 return false;
475}
476
477static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
478 switch (Opcode) {
479 default:
480 return false;
481 case X86::MOV8rm:
482 case X86::KMOVBkm:
483 case X86::KMOVBkm_EVEX:
484 MemBytes = TypeSize::getFixed(1);
485 return true;
486 case X86::MOV16rm:
487 case X86::KMOVWkm:
488 case X86::KMOVWkm_EVEX:
489 case X86::VMOVSHZrm:
490 case X86::VMOVSHZrm_alt:
491 MemBytes = TypeSize::getFixed(2);
492 return true;
493 case X86::MOV32rm:
494 case X86::MOVSSrm:
495 case X86::MOVSSrm_alt:
496 case X86::VMOVSSrm:
497 case X86::VMOVSSrm_alt:
498 case X86::VMOVSSZrm:
499 case X86::VMOVSSZrm_alt:
500 case X86::KMOVDkm:
501 case X86::KMOVDkm_EVEX:
502 MemBytes = TypeSize::getFixed(4);
503 return true;
504 case X86::MOV64rm:
505 case X86::LD_Fp64m:
506 case X86::MOVSDrm:
507 case X86::MOVSDrm_alt:
508 case X86::VMOVSDrm:
509 case X86::VMOVSDrm_alt:
510 case X86::VMOVSDZrm:
511 case X86::VMOVSDZrm_alt:
512 case X86::MMX_MOVD64rm:
513 case X86::MMX_MOVQ64rm:
514 case X86::KMOVQkm:
515 case X86::KMOVQkm_EVEX:
516 MemBytes = TypeSize::getFixed(8);
517 return true;
518 case X86::MOVAPSrm:
519 case X86::MOVUPSrm:
520 case X86::MOVAPDrm:
521 case X86::MOVUPDrm:
522 case X86::MOVDQArm:
523 case X86::MOVDQUrm:
524 case X86::VMOVAPSrm:
525 case X86::VMOVUPSrm:
526 case X86::VMOVAPDrm:
527 case X86::VMOVUPDrm:
528 case X86::VMOVDQArm:
529 case X86::VMOVDQUrm:
530 case X86::VMOVAPSZ128rm:
531 case X86::VMOVUPSZ128rm:
532 case X86::VMOVAPSZ128rm_NOVLX:
533 case X86::VMOVUPSZ128rm_NOVLX:
534 case X86::VMOVAPDZ128rm:
535 case X86::VMOVUPDZ128rm:
536 case X86::VMOVDQU8Z128rm:
537 case X86::VMOVDQU16Z128rm:
538 case X86::VMOVDQA32Z128rm:
539 case X86::VMOVDQU32Z128rm:
540 case X86::VMOVDQA64Z128rm:
541 case X86::VMOVDQU64Z128rm:
542 MemBytes = TypeSize::getFixed(16);
543 return true;
544 case X86::VMOVAPSYrm:
545 case X86::VMOVUPSYrm:
546 case X86::VMOVAPDYrm:
547 case X86::VMOVUPDYrm:
548 case X86::VMOVDQAYrm:
549 case X86::VMOVDQUYrm:
550 case X86::VMOVAPSZ256rm:
551 case X86::VMOVUPSZ256rm:
552 case X86::VMOVAPSZ256rm_NOVLX:
553 case X86::VMOVUPSZ256rm_NOVLX:
554 case X86::VMOVAPDZ256rm:
555 case X86::VMOVUPDZ256rm:
556 case X86::VMOVDQU8Z256rm:
557 case X86::VMOVDQU16Z256rm:
558 case X86::VMOVDQA32Z256rm:
559 case X86::VMOVDQU32Z256rm:
560 case X86::VMOVDQA64Z256rm:
561 case X86::VMOVDQU64Z256rm:
562 MemBytes = TypeSize::getFixed(32);
563 return true;
564 case X86::VMOVAPSZrm:
565 case X86::VMOVUPSZrm:
566 case X86::VMOVAPDZrm:
567 case X86::VMOVUPDZrm:
568 case X86::VMOVDQU8Zrm:
569 case X86::VMOVDQU16Zrm:
570 case X86::VMOVDQA32Zrm:
571 case X86::VMOVDQU32Zrm:
572 case X86::VMOVDQA64Zrm:
573 case X86::VMOVDQU64Zrm:
574 MemBytes = TypeSize::getFixed(64);
575 return true;
576 }
577}
578
579static bool isFrameStoreOpcode(int Opcode, TypeSize &MemBytes) {
580 switch (Opcode) {
581 default:
582 return false;
583 case X86::MOV8mr:
584 case X86::KMOVBmk:
585 case X86::KMOVBmk_EVEX:
586 MemBytes = TypeSize::getFixed(1);
587 return true;
588 case X86::MOV16mr:
589 case X86::KMOVWmk:
590 case X86::KMOVWmk_EVEX:
591 case X86::VMOVSHZmr:
592 MemBytes = TypeSize::getFixed(2);
593 return true;
594 case X86::MOV32mr:
595 case X86::MOVSSmr:
596 case X86::VMOVSSmr:
597 case X86::VMOVSSZmr:
598 case X86::KMOVDmk:
599 case X86::KMOVDmk_EVEX:
600 MemBytes = TypeSize::getFixed(4);
601 return true;
602 case X86::MOV64mr:
603 case X86::ST_FpP64m:
604 case X86::MOVSDmr:
605 case X86::VMOVSDmr:
606 case X86::VMOVSDZmr:
607 case X86::MMX_MOVD64mr:
608 case X86::MMX_MOVQ64mr:
609 case X86::MMX_MOVNTQmr:
610 case X86::KMOVQmk:
611 case X86::KMOVQmk_EVEX:
612 MemBytes = TypeSize::getFixed(8);
613 return true;
614 case X86::MOVAPSmr:
615 case X86::MOVUPSmr:
616 case X86::MOVAPDmr:
617 case X86::MOVUPDmr:
618 case X86::MOVDQAmr:
619 case X86::MOVDQUmr:
620 case X86::VMOVAPSmr:
621 case X86::VMOVUPSmr:
622 case X86::VMOVAPDmr:
623 case X86::VMOVUPDmr:
624 case X86::VMOVDQAmr:
625 case X86::VMOVDQUmr:
626 case X86::VMOVUPSZ128mr:
627 case X86::VMOVAPSZ128mr:
628 case X86::VMOVUPSZ128mr_NOVLX:
629 case X86::VMOVAPSZ128mr_NOVLX:
630 case X86::VMOVUPDZ128mr:
631 case X86::VMOVAPDZ128mr:
632 case X86::VMOVDQA32Z128mr:
633 case X86::VMOVDQU32Z128mr:
634 case X86::VMOVDQA64Z128mr:
635 case X86::VMOVDQU64Z128mr:
636 case X86::VMOVDQU8Z128mr:
637 case X86::VMOVDQU16Z128mr:
638 MemBytes = TypeSize::getFixed(16);
639 return true;
640 case X86::VMOVUPSYmr:
641 case X86::VMOVAPSYmr:
642 case X86::VMOVUPDYmr:
643 case X86::VMOVAPDYmr:
644 case X86::VMOVDQUYmr:
645 case X86::VMOVDQAYmr:
646 case X86::VMOVUPSZ256mr:
647 case X86::VMOVAPSZ256mr:
648 case X86::VMOVUPSZ256mr_NOVLX:
649 case X86::VMOVAPSZ256mr_NOVLX:
650 case X86::VMOVUPDZ256mr:
651 case X86::VMOVAPDZ256mr:
652 case X86::VMOVDQU8Z256mr:
653 case X86::VMOVDQU16Z256mr:
654 case X86::VMOVDQA32Z256mr:
655 case X86::VMOVDQU32Z256mr:
656 case X86::VMOVDQA64Z256mr:
657 case X86::VMOVDQU64Z256mr:
658 MemBytes = TypeSize::getFixed(32);
659 return true;
660 case X86::VMOVUPSZmr:
661 case X86::VMOVAPSZmr:
662 case X86::VMOVUPDZmr:
663 case X86::VMOVAPDZmr:
664 case X86::VMOVDQU8Zmr:
665 case X86::VMOVDQU16Zmr:
666 case X86::VMOVDQA32Zmr:
667 case X86::VMOVDQU32Zmr:
668 case X86::VMOVDQA64Zmr:
669 case X86::VMOVDQU64Zmr:
670 MemBytes = TypeSize::getFixed(64);
671 return true;
672 }
673 return false;
674}
675
677 int &FrameIndex) const {
678 TypeSize Dummy = TypeSize::getZero();
679 return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
680}
681
683 int &FrameIndex,
684 TypeSize &MemBytes) const {
685 if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
686 if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
687 return MI.getOperand(0).getReg();
688 return Register();
689}
690
692 int &FrameIndex) const {
693 TypeSize Dummy = TypeSize::getZero();
694 if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
695 if (Register Reg = isLoadFromStackSlot(MI, FrameIndex))
696 return Reg;
697 // Check for post-frame index elimination operations
699 if (hasLoadFromStackSlot(MI, Accesses)) {
700 FrameIndex =
701 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
702 ->getFrameIndex();
703 return MI.getOperand(0).getReg();
704 }
705 }
706 return Register();
707}
708
710 int &FrameIndex) const {
711 TypeSize Dummy = TypeSize::getZero();
712 return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
713}
714
716 int &FrameIndex,
717 TypeSize &MemBytes) const {
718 if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
719 if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
720 isFrameOperand(MI, 0, FrameIndex))
721 return MI.getOperand(X86::AddrNumOperands).getReg();
722 return Register();
723}
724
726 int &FrameIndex) const {
727 TypeSize Dummy = TypeSize::getZero();
728 if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
729 if (Register Reg = isStoreToStackSlot(MI, FrameIndex))
730 return Reg;
731 // Check for post-frame index elimination operations
733 if (hasStoreToStackSlot(MI, Accesses)) {
734 FrameIndex =
735 cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
736 ->getFrameIndex();
737 return MI.getOperand(X86::AddrNumOperands).getReg();
738 }
739 }
740 return Register();
741}
742
743/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
744static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
745 // Don't waste compile time scanning use-def chains of physregs.
746 if (!BaseReg.isVirtual())
747 return false;
748 bool isPICBase = false;
749 for (const MachineInstr &DefMI : MRI.def_instructions(BaseReg)) {
750 if (DefMI.getOpcode() != X86::MOVPC32r)
751 return false;
752 assert(!isPICBase && "More than one PIC base?");
753 isPICBase = true;
754 }
755 return isPICBase;
756}
757
759 const MachineInstr &MI) const {
760 switch (MI.getOpcode()) {
761 default:
762 // This function should only be called for opcodes with the ReMaterializable
763 // flag set.
764 llvm_unreachable("Unknown rematerializable operation!");
765 break;
766 case X86::IMPLICIT_DEF:
767 // Defer to generic logic.
768 break;
769 case X86::LOAD_STACK_GUARD:
770 case X86::LD_Fp032:
771 case X86::LD_Fp064:
772 case X86::LD_Fp080:
773 case X86::LD_Fp132:
774 case X86::LD_Fp164:
775 case X86::LD_Fp180:
776 case X86::AVX1_SETALLONES:
777 case X86::AVX2_SETALLONES:
778 case X86::AVX512_128_SET0:
779 case X86::AVX512_256_SET0:
780 case X86::AVX512_512_SET0:
781 case X86::AVX512_128_SETALLONES:
782 case X86::AVX512_256_SETALLONES:
783 case X86::AVX512_512_SETALLONES:
784 case X86::AVX512_FsFLD0SD:
785 case X86::AVX512_FsFLD0SH:
786 case X86::AVX512_FsFLD0SS:
787 case X86::AVX512_FsFLD0F128:
788 case X86::AVX_SET0:
789 case X86::FsFLD0SD:
790 case X86::FsFLD0SS:
791 case X86::FsFLD0SH:
792 case X86::FsFLD0F128:
793 case X86::KSET0B:
794 case X86::KSET0D:
795 case X86::KSET0Q:
796 case X86::KSET0W:
797 case X86::KSET1B:
798 case X86::KSET1D:
799 case X86::KSET1Q:
800 case X86::KSET1W:
801 case X86::MMX_SET0:
802 case X86::MOV32ImmSExti8:
803 case X86::MOV32r0:
804 case X86::MOV32r1:
805 case X86::MOV32r_1:
806 case X86::MOV32ri64:
807 case X86::MOV64ImmSExti8:
808 case X86::V_SET0:
809 case X86::V_SETALLONES:
810 case X86::MOV16ri:
811 case X86::MOV32ri:
812 case X86::MOV64ri:
813 case X86::MOV64ri32:
814 case X86::MOV8ri:
815 case X86::PTILEZEROV:
816 return true;
817
818 case X86::MOV8rm:
819 case X86::MOV8rm_NOREX:
820 case X86::MOV16rm:
821 case X86::MOV32rm:
822 case X86::MOV64rm:
823 case X86::MOVSSrm:
824 case X86::MOVSSrm_alt:
825 case X86::MOVSDrm:
826 case X86::MOVSDrm_alt:
827 case X86::MOVAPSrm:
828 case X86::MOVUPSrm:
829 case X86::MOVAPDrm:
830 case X86::MOVUPDrm:
831 case X86::MOVDQArm:
832 case X86::MOVDQUrm:
833 case X86::VMOVSSrm:
834 case X86::VMOVSSrm_alt:
835 case X86::VMOVSDrm:
836 case X86::VMOVSDrm_alt:
837 case X86::VMOVAPSrm:
838 case X86::VMOVUPSrm:
839 case X86::VMOVAPDrm:
840 case X86::VMOVUPDrm:
841 case X86::VMOVDQArm:
842 case X86::VMOVDQUrm:
843 case X86::VMOVAPSYrm:
844 case X86::VMOVUPSYrm:
845 case X86::VMOVAPDYrm:
846 case X86::VMOVUPDYrm:
847 case X86::VMOVDQAYrm:
848 case X86::VMOVDQUYrm:
849 case X86::MMX_MOVD64rm:
850 case X86::MMX_MOVQ64rm:
851 case X86::VBROADCASTSSrm:
852 case X86::VBROADCASTSSYrm:
853 case X86::VBROADCASTSDYrm:
854 // AVX-512
855 case X86::VPBROADCASTBZ128rm:
856 case X86::VPBROADCASTBZ256rm:
857 case X86::VPBROADCASTBZrm:
858 case X86::VBROADCASTF32X2Z256rm:
859 case X86::VBROADCASTF32X2Zrm:
860 case X86::VBROADCASTI32X2Z128rm:
861 case X86::VBROADCASTI32X2Z256rm:
862 case X86::VBROADCASTI32X2Zrm:
863 case X86::VPBROADCASTWZ128rm:
864 case X86::VPBROADCASTWZ256rm:
865 case X86::VPBROADCASTWZrm:
866 case X86::VPBROADCASTDZ128rm:
867 case X86::VPBROADCASTDZ256rm:
868 case X86::VPBROADCASTDZrm:
869 case X86::VBROADCASTSSZ128rm:
870 case X86::VBROADCASTSSZ256rm:
871 case X86::VBROADCASTSSZrm:
872 case X86::VPBROADCASTQZ128rm:
873 case X86::VPBROADCASTQZ256rm:
874 case X86::VPBROADCASTQZrm:
875 case X86::VBROADCASTSDZ256rm:
876 case X86::VBROADCASTSDZrm:
877 case X86::VMOVSSZrm:
878 case X86::VMOVSSZrm_alt:
879 case X86::VMOVSDZrm:
880 case X86::VMOVSDZrm_alt:
881 case X86::VMOVSHZrm:
882 case X86::VMOVSHZrm_alt:
883 case X86::VMOVAPDZ128rm:
884 case X86::VMOVAPDZ256rm:
885 case X86::VMOVAPDZrm:
886 case X86::VMOVAPSZ128rm:
887 case X86::VMOVAPSZ256rm:
888 case X86::VMOVAPSZ128rm_NOVLX:
889 case X86::VMOVAPSZ256rm_NOVLX:
890 case X86::VMOVAPSZrm:
891 case X86::VMOVDQA32Z128rm:
892 case X86::VMOVDQA32Z256rm:
893 case X86::VMOVDQA32Zrm:
894 case X86::VMOVDQA64Z128rm:
895 case X86::VMOVDQA64Z256rm:
896 case X86::VMOVDQA64Zrm:
897 case X86::VMOVDQU16Z128rm:
898 case X86::VMOVDQU16Z256rm:
899 case X86::VMOVDQU16Zrm:
900 case X86::VMOVDQU32Z128rm:
901 case X86::VMOVDQU32Z256rm:
902 case X86::VMOVDQU32Zrm:
903 case X86::VMOVDQU64Z128rm:
904 case X86::VMOVDQU64Z256rm:
905 case X86::VMOVDQU64Zrm:
906 case X86::VMOVDQU8Z128rm:
907 case X86::VMOVDQU8Z256rm:
908 case X86::VMOVDQU8Zrm:
909 case X86::VMOVUPDZ128rm:
910 case X86::VMOVUPDZ256rm:
911 case X86::VMOVUPDZrm:
912 case X86::VMOVUPSZ128rm:
913 case X86::VMOVUPSZ256rm:
914 case X86::VMOVUPSZ128rm_NOVLX:
915 case X86::VMOVUPSZ256rm_NOVLX:
916 case X86::VMOVUPSZrm: {
917 // Loads from constant pools are trivially rematerializable.
918 if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
919 MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
920 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
921 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
922 MI.isDereferenceableInvariantLoad()) {
923 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
924 if (BaseReg == 0 || BaseReg == X86::RIP)
925 return true;
926 // Allow re-materialization of PIC load.
927 if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) {
928 const MachineFunction &MF = *MI.getParent()->getParent();
929 const MachineRegisterInfo &MRI = MF.getRegInfo();
930 if (regIsPICBase(BaseReg, MRI))
931 return true;
932 }
933 }
934 break;
935 }
936
937 case X86::LEA32r:
938 case X86::LEA64r: {
939 if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
940 MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
941 MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
942 !MI.getOperand(1 + X86::AddrDisp).isReg()) {
943 // lea fi#, lea GV, etc. are all rematerializable.
944 if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
945 return true;
946 Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
947 if (BaseReg == 0)
948 return true;
949 // Allow re-materialization of lea PICBase + x.
950 const MachineFunction &MF = *MI.getParent()->getParent();
951 const MachineRegisterInfo &MRI = MF.getRegInfo();
952 if (regIsPICBase(BaseReg, MRI))
953 return true;
954 }
955 break;
956 }
957 }
959}
960
963 Register DestReg, unsigned SubIdx,
964 const MachineInstr &Orig) const {
965 bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
966 if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
968 // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
969 // effects.
970 int Value;
971 switch (Orig.getOpcode()) {
972 case X86::MOV32r0:
973 Value = 0;
974 break;
975 case X86::MOV32r1:
976 Value = 1;
977 break;
978 case X86::MOV32r_1:
979 Value = -1;
980 break;
981 default:
982 llvm_unreachable("Unexpected instruction!");
983 }
984
985 const DebugLoc &DL = Orig.getDebugLoc();
986 BuildMI(MBB, I, DL, get(X86::MOV32ri))
987 .add(Orig.getOperand(0))
988 .addImm(Value);
989 } else {
990 MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
991 MBB.insert(I, MI);
992 }
993
994 MachineInstr &NewMI = *std::prev(I);
995 NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
996}
997
998/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
1000 for (const MachineOperand &MO : MI.operands()) {
1001 if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
1002 !MO.isDead()) {
1003 return true;
1004 }
1005 }
1006 return false;
1007}
1008
1009/// Check whether the shift count for a machine operand is non-zero.
1010inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
1011 unsigned ShiftAmtOperandIdx) {
1012 // The shift count is six bits with the REX.W prefix and five bits without.
1013 unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
1014 unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
1015 return Imm & ShiftCountMask;
1016}
1017
1018/// Check whether the given shift count is appropriate
1019/// can be represented by a LEA instruction.
/// Return true if a left shift by \p ShAmt can be encoded as an LEA scale.
/// The SIB.scale field is two bits wide, so only scales 1/2/4/8 — i.e. shift
/// amounts 1 through 3 — are representable; a zero shift is excluded.
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
  return 0 < ShAmt && ShAmt < 4;
}
1028
// Try to prove that `CmpInstr` -- a self-compare TEST64rr fed by a
// SUBREG_TO_REG, or a self-compare TEST16rr fed by a sub-register COPY
// (`CmpValDefInstr`) -- is made redundant by an earlier AND that already set
// EFLAGS. On success returns true, stores that AND in *AndInstr, and reports
// which flags the caller may rely on: NoSignFlag is set to true to "poison"
// SF (see below), and ClearsOverflowFlag records that AND leaves OF == 0.
static bool
findRedundantFlagInstr(MachineInstr &CmpInstr, MachineInstr &CmpValDefInstr,
                       const MachineRegisterInfo *MRI, MachineInstr **AndInstr,
                       const TargetRegisterInfo *TRI, const X86Subtarget &ST,
                       bool &NoSignFlag, bool &ClearsOverflowFlag) {
  // Only two instruction shapes are handled:
  //   SUBREG_TO_REG + TEST64rr, or COPY (sub-register) + TEST16rr.
  if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG &&
        CmpInstr.getOpcode() == X86::TEST64rr) &&
      !(CmpValDefInstr.getOpcode() == X86::COPY &&
        CmpInstr.getOpcode() == X86::TEST16rr))
    return false;

  // CmpInstr is a TEST16rr/TEST64rr instruction, and
  // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if two
  // registers are identical.
  assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
         "CmpInstr is an analyzable TEST16rr/TEST64rr, and "
         "`X86InstrInfo::analyzeCompare` requires two reg operands are the"
         "same.");

  // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
  // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
  // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
  // redundant.
  assert(
      (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
      "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr "
      "is a user of COPY sub16bit.");
  // The instruction that actually produced the value being tested.
  MachineInstr *VregDefInstr = nullptr;
  if (CmpInstr.getOpcode() == X86::TEST16rr) {
    if (!CmpValDefInstr.getOperand(1).getReg().isVirtual())
      return false;
    VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
    if (!VregDefInstr)
      return false;
    // We can only remove test when AND32ri or AND64ri32 whose imm can fit 16bit
    // size, others 32/64 bit ops would test higher bits which test16rr don't
    // want to.
    if (!((VregDefInstr->getOpcode() == X86::AND32ri ||
           VregDefInstr->getOpcode() == X86::AND64ri32) &&
          isUInt<16>(VregDefInstr->getOperand(2).getImm())))
      return false;
  }

  if (CmpInstr.getOpcode() == X86::TEST64rr) {
    // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is
    // typically 0.
    if (CmpValDefInstr.getOperand(1).getImm() != 0)
      return false;

    // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
    // sub_32bit or sub_xmm.
    if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
      return false;

    VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
  }

  assert(VregDefInstr && "Must have a definition (SSA)");

  // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
  // to simplify the subsequent analysis.
  //
  // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
  // `CmpValDefInstr.getParent()`, this could be handled.
  if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
    return false;

  // On subtargets with NF (no-flags) instruction forms, an AND opcode may
  // exist that does not define EFLAGS; require an explicit EFLAGS def then.
  if (X86::isAND(VregDefInstr->getOpcode()) &&
      (!ST.hasNF() || VregDefInstr->modifiesRegister(X86::EFLAGS, TRI))) {
    // Get a sequence of instructions like
    //   %reg = and* ...                    // Set EFLAGS
    //   ...                                // EFLAGS not changed
    //   %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
    //   test64rr %extended_reg, %extended_reg, implicit-def $eflags
    // or
    //   %reg = and32* ...
    //   ...                                // EFLAGS not changed.
    //   %src_reg = copy %reg.sub_16bit:gr32
    //   test16rr %src_reg, %src_reg, implicit-def $eflags
    //
    // If subsequent readers use a subset of bits that don't change
    // after `and*` instructions, it's likely that the test64rr could
    // be optimized away.
    for (const MachineInstr &Instr :
         make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
                    MachineBasicBlock::iterator(CmpValDefInstr))) {
      // There are instructions between 'VregDefInstr' and
      // 'CmpValDefInstr' that modifies EFLAGS.
      if (Instr.modifiesRegister(X86::EFLAGS, TRI))
        return false;
    }

    *AndInstr = VregDefInstr;

    // AND instruction will essentially update SF and clear OF, so
    // NoSignFlag should be false in the sense that SF is modified by `AND`.
    //
    // However, the implementation artifically sets `NoSignFlag` to true
    // to poison the SF bit; that is to say, if SF is looked at later, the
    // optimization (to erase TEST64rr) will be disabled.
    //
    // The reason to poison SF bit is that SF bit value could be different
    // in the `AND` and `TEST` operation; signed bit is not known for `AND`,
    // and is known to be 0 as a result of `TEST64rr`.
    //
    // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
    // the AND instruction and using the static information to guide peephole
    // optimization if possible. For example, it's possible to fold a
    // conditional move into a copy if the relevant EFLAG bits could be deduced
    // from an immediate operand of and operation.
    //
    NoSignFlag = true;
    // ClearsOverflowFlag is true for AND operation (no surprise).
    ClearsOverflowFlag = true;
    return true;
  }
  return false;
}
1147
1149 unsigned Opc, bool AllowSP, Register &NewSrc,
1150 unsigned &NewSrcSubReg, bool &isKill,
1151 MachineOperand &ImplicitOp, LiveVariables *LV,
1152 LiveIntervals *LIS) const {
1153 MachineFunction &MF = *MI.getParent()->getParent();
1154 const TargetRegisterClass *RC;
1155 if (AllowSP) {
1156 RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
1157 } else {
1158 RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
1159 }
1160 Register SrcReg = Src.getReg();
1161 unsigned SubReg = Src.getSubReg();
1162 isKill = MI.killsRegister(SrcReg, /*TRI=*/nullptr);
1163
1164 NewSrcSubReg = X86::NoSubRegister;
1165
1166 // For both LEA64 and LEA32 the register already has essentially the right
1167 // type (32-bit or 64-bit) we may just need to forbid SP.
1168 if (Opc != X86::LEA64_32r) {
1169 NewSrc = SrcReg;
1170 NewSrcSubReg = SubReg;
1171 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1172
1173 if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
1174 return false;
1175
1176 return true;
1177 }
1178
1179 // This is for an LEA64_32r and incoming registers are 32-bit. One way or
1180 // another we need to add 64-bit registers to the final MI.
1181 if (SrcReg.isPhysical()) {
1182 ImplicitOp = Src;
1183 ImplicitOp.setImplicit();
1184
1185 NewSrc = getX86SubSuperRegister(SrcReg, 64);
1186 assert(!SubReg && "no superregister for source");
1187 assert(NewSrc.isValid() && "Invalid Operand");
1188 assert(!Src.isUndef() && "Undef op doesn't need optimization");
1189 } else {
1190 // Virtual register of the wrong class, we have to create a temporary 64-bit
1191 // vreg to feed into the LEA.
1192 NewSrc = MF.getRegInfo().createVirtualRegister(RC);
1193 NewSrcSubReg = X86::NoSubRegister;
1194 MachineInstr *Copy =
1195 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
1196 .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
1197 .addReg(SrcReg, getKillRegState(isKill), SubReg);
1198
1199 // Which is obviously going to be dead after we're done with it.
1200 isKill = true;
1201
1202 if (LV)
1203 LV->replaceKillInstruction(SrcReg, MI, *Copy);
1204
1205 if (LIS) {
1206 SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
1207 SlotIndex Idx = LIS->getInstructionIndex(MI);
1208 LiveInterval &LI = LIS->getInterval(SrcReg);
1210 if (S->end.getBaseIndex() == Idx)
1211 S->end = CopyIdx.getRegSlot();
1212 }
1213 }
1214
1215 // We've set all the parameters without issue.
1216 return true;
1217}
1218
/// Convert an 8/16-bit two-address arithmetic instruction (shift-by-imm, inc,
/// dec, add) into an LEA64_32r by widening the operand(s) into fresh 64-bit
/// vregs (IMPLICIT_DEF + sub-register COPY), performing the LEA, and copying
/// the low 8/16 bits back into the original destination. Returns the trailing
/// COPY (the instruction now defining Dest), or nullptr on 32-bit targets
/// (see TODO below). LV/LIS, when non-null, are updated for every instruction
/// created or replaced here.
MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
                                                         MachineInstr &MI,
                                                         LiveVariables *LV,
                                                         LiveIntervals *LIS,
                                                         bool Is8BitOp) const {
  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
  assert((Is8BitOp ||
          RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
              *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
         "Unexpected type for LEA transform");

  // TODO: For a 32-bit target, we need to adjust the LEA variables with
  // something like this:
  // Opcode = X86::LEA32r;
  // InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
  // OutRegLEA =
  //     Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
  //              : RegInfo.createVirtualRegister(&X86::GR32RegClass);
  if (!Subtarget.is64Bit())
    return nullptr;

  unsigned Opcode = X86::LEA64_32r;
  Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
  Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
  Register InRegLEA2;

  // Build and insert into an implicit UNDEF value. This is OK because
  // we will be shifting and then extracting the lower 8/16-bits.
  // This has the potential to cause partial register stall. e.g.
  //   movw    (%rbp,%rcx,2), %dx
  //   leal    -65(%rdx), %esi
  // But testing has shown this *does* help performance in 64-bit mode (at
  // least on modern x86 machines).
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  Register Dest = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned SrcSubReg = MI.getOperand(1).getSubReg();
  Register Src2;
  unsigned Src2SubReg;
  bool IsDead = MI.getOperand(0).isDead();
  bool IsKill = MI.getOperand(1).isKill();
  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
  MachineInstr *ImpDef =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
  MachineInstr *InsMI =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
          .addReg(InRegLEA, RegState::Define, SubReg)
          .addReg(Src, getKillRegState(IsKill), SrcSubReg);
  MachineInstr *ImpDef2 = nullptr;
  MachineInstr *InsMI2 = nullptr;

  MachineInstrBuilder MIB =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
// CASE_NF expands to both the flag-setting and the NF (no-flags) form of an
// opcode; both are converted identically here.
#define CASE_NF(OP)                                                            \
  case X86::OP:                                                                \
  case X86::OP##_NF:
  switch (MIOpc) {
  default:
    llvm_unreachable("Unreachable!");
    CASE_NF(SHL8ri)
    CASE_NF(SHL16ri) {
      // shl $imm  ->  lea (, %in, 1 << imm)
      unsigned ShAmt = MI.getOperand(2).getImm();
      MIB.addReg(0)
          .addImm(1LL << ShAmt)
          .addReg(InRegLEA, RegState::Kill)
          .addImm(0)
          .addReg(0);
      break;
    }
    CASE_NF(INC8r)
    CASE_NF(INC16r)
    addRegOffset(MIB, InRegLEA, true, 1);
    break;
    CASE_NF(DEC8r)
    CASE_NF(DEC16r)
    addRegOffset(MIB, InRegLEA, true, -1);
    break;
    CASE_NF(ADD8ri)
    CASE_NF(ADD16ri)
  case X86::ADD8ri_DB:
  case X86::ADD16ri_DB:
    addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
    break;
    CASE_NF(ADD8rr)
    CASE_NF(ADD16rr)
  case X86::ADD8rr_DB:
  case X86::ADD16rr_DB: {
    Src2 = MI.getOperand(2).getReg();
    Src2SubReg = MI.getOperand(2).getSubReg();
    bool IsKill2 = MI.getOperand(2).isKill();
    assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
    if (Src == Src2) {
      // ADD8rr/ADD16rr killed %reg1028, %reg1028
      // just a single insert_subreg.
      addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA, false,
                X86::NoSubRegister);
    } else {
      if (Subtarget.is64Bit())
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
      else
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
      // Build and insert into an implicit UNDEF value. This is OK because
      // we will be shifting and then extracting the lower 8/16-bits.
      ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
                        InRegLEA2);
      InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
                   .addReg(InRegLEA2, RegState::Define, SubReg)
                   .addReg(Src2, getKillRegState(IsKill2), Src2SubReg);
      addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA2, true,
                X86::NoSubRegister);
    }
    if (LV && IsKill2 && InsMI2)
      LV->replaceKillInstruction(Src2, MI, *InsMI2);
    break;
  }
  }

  MachineInstr *NewMI = MIB;
  // Copy the low 8/16 bits of the LEA result back into the original
  // destination register.
  MachineInstr *ExtMI =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
          .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
          .addReg(OutRegLEA, RegState::Kill, SubReg);

  if (LV) {
    // Update live variables.
    LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
    if (InRegLEA2)
      LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
    LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
    if (IsKill)
      LV->replaceKillInstruction(Src, MI, *InsMI);
    if (IsDead)
      LV->replaceKillInstruction(Dest, MI, *ExtMI);
  }

  if (LIS) {
    LIS->InsertMachineInstrInMaps(*ImpDef);
    SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
    if (ImpDef2)
      LIS->InsertMachineInstrInMaps(*ImpDef2);
    SlotIndex Ins2Idx;
    if (InsMI2)
      Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
    SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
    SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
    // Query the intervals of the freshly created vregs -- presumably to force
    // their on-demand computation now that all defs/uses are in place (TODO:
    // confirm against LiveIntervals::getInterval semantics).
    LIS->getInterval(InRegLEA);
    LIS->getInterval(OutRegLEA);
    if (InRegLEA2)
      LIS->getInterval(InRegLEA2);

    // Move the use of Src up to InsMI.
    LiveInterval &SrcLI = LIS->getInterval(Src);
    LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
    if (SrcSeg->end == NewIdx.getRegSlot())
      SrcSeg->end = InsIdx.getRegSlot();

    if (InsMI2) {
      // Move the use of Src2 up to InsMI2.
      LiveInterval &Src2LI = LIS->getInterval(Src2);
      LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
      if (Src2Seg->end == NewIdx.getRegSlot())
        Src2Seg->end = Ins2Idx.getRegSlot();
    }

    // Move the definition of Dest down to ExtMI.
    LiveInterval &DestLI = LIS->getInterval(Dest);
    LiveRange::Segment *DestSeg =
        DestLI.getSegmentContaining(NewIdx.getRegSlot());
    assert(DestSeg->start == NewIdx.getRegSlot() &&
           DestSeg->valno->def == NewIdx.getRegSlot());
    DestSeg->start = ExtIdx.getRegSlot();
    DestSeg->valno->def = ExtIdx.getRegSlot();
  }

  return ExtMI;
}
1398
/// This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
/// may be able to convert a two-address instruction into a true
/// three-address instruction on demand. This allows the X86 target (for
/// example) to convert ADD and SHL instructions into LEA instructions if they
/// would require register copies due to two-addressness.
///
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction. The new instruction
/// is inserted before MI; LV and LIS, when non-null, are updated to account
/// for the replacement.
///
MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
                                                  LiveVariables *LV,
                                                  LiveIntervals *LIS) const {
  // The following opcodes also sets the condition code register(s). Only
  // convert them to equivalent lea if the condition code register def's
  // are dead!
  if (hasLiveCondCodeDef(MI))
    return nullptr;

  MachineFunction &MF = *MI.getParent()->getParent();
  // All instructions input are two-addr instructions. Get the known operands.
  const MachineOperand &Dest = MI.getOperand(0);
  const MachineOperand &Src = MI.getOperand(1);

  // Ideally, operations with undef should be folded before we get here, but we
  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
  // Without this, we have to forward undef state to new register operands to
  // avoid machine verifier errors.
  if (Src.isUndef())
    return nullptr;
  if (MI.getNumOperands() > 2)
    if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
      return nullptr;

  MachineInstr *NewMI = nullptr;
  Register SrcReg, SrcReg2;
  unsigned SrcSubReg, SrcSubReg2;
  bool Is64Bit = Subtarget.is64Bit();

  bool Is8BitOp = false;
  // Number of leading operands of NewMI that are registers whose kill/dead
  // state must be transferred from MI (see the LV fixup loop below).
  unsigned NumRegOperands = 2;
  unsigned MIOpc = MI.getOpcode();
  switch (MIOpc) {
  default:
    llvm_unreachable("Unreachable!");
    CASE_NF(SHL64ri) {
      assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
      unsigned ShAmt = getTruncatedShiftCount(MI, 2);
      if (!isTruncatedShiftCountForLEA(ShAmt))
        return nullptr;

      // LEA can't handle RSP.
      if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
                                          Src.getReg(), &X86::GR64_NOSPRegClass))
        return nullptr;

      // shl $imm  ->  lea (, %src, 1 << imm)
      NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
                  .add(Dest)
                  .addReg(0)
                  .addImm(1LL << ShAmt)
                  .add(Src)
                  .addImm(0)
                  .addReg(0);
      break;
    }
    CASE_NF(SHL32ri) {
      assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
      unsigned ShAmt = getTruncatedShiftCount(MI, 2);
      if (!isTruncatedShiftCountForLEA(ShAmt))
        return nullptr;

      unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

      // LEA can't handle ESP.
      bool isKill;
      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
                          isKill, ImplicitOp, LV, LIS))
        return nullptr;

      MachineInstrBuilder MIB =
          BuildMI(MF, MI.getDebugLoc(), get(Opc))
              .add(Dest)
              .addReg(0)
              .addImm(1LL << ShAmt)
              .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
              .addImm(0)
              .addReg(0);
      if (ImplicitOp.getReg() != 0)
        MIB.add(ImplicitOp);
      NewMI = MIB;

      // Add kills if classifyLEAReg created a new register.
      if (LV && SrcReg != Src.getReg())
        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
      break;
    }
    CASE_NF(SHL8ri)
    Is8BitOp = true;
    [[fallthrough]];
    CASE_NF(SHL16ri) {
      assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
      unsigned ShAmt = getTruncatedShiftCount(MI, 2);
      if (!isTruncatedShiftCountForLEA(ShAmt))
        return nullptr;
      return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
    }
    CASE_NF(INC64r)
    CASE_NF(INC32r) {
      assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
      unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
                         ? X86::LEA64r
                         : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
      bool isKill;
      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
                          isKill, ImplicitOp, LV, LIS))
        return nullptr;

      MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                                    .add(Dest)
                                    .addReg(SrcReg, getKillRegState(isKill));
      if (ImplicitOp.getReg() != 0)
        MIB.add(ImplicitOp);

      // inc  ->  lea 1(%src)
      NewMI = addOffset(MIB, 1);

      // Add kills if classifyLEAReg created a new register.
      if (LV && SrcReg != Src.getReg())
        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
      break;
    }
    CASE_NF(DEC64r)
    CASE_NF(DEC32r) {
      assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
      unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
                         ? X86::LEA64r
                         : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);

      bool isKill;
      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
                          isKill, ImplicitOp, LV, LIS))
        return nullptr;

      MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                                    .add(Dest)
                                    .addReg(SrcReg, getKillRegState(isKill));
      if (ImplicitOp.getReg() != 0)
        MIB.add(ImplicitOp);

      // dec  ->  lea -1(%src)
      NewMI = addOffset(MIB, -1);

      // Add kills if classifyLEAReg created a new register.
      if (LV && SrcReg != Src.getReg())
        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
      break;
    }
    CASE_NF(DEC8r)
    CASE_NF(INC8r)
    Is8BitOp = true;
    [[fallthrough]];
    CASE_NF(DEC16r)
    CASE_NF(INC16r)
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
    CASE_NF(ADD64rr)
    CASE_NF(ADD32rr)
  case X86::ADD64rr_DB:
  case X86::ADD32rr_DB: {
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc;
    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_NF ||
        MIOpc == X86::ADD64rr_DB)
      Opc = X86::LEA64r;
    else
      Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    const MachineOperand &Src2 = MI.getOperand(2);
    bool isKill2;
    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, SrcSubReg2,
                        isKill2, ImplicitOp2, LV, LIS))
      return nullptr;

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (Src.getReg() == Src2.getReg()) {
      // Don't call classify LEAReg a second time on the same register, in case
      // the first call inserted a COPY from Src2 and marked it as killed.
      isKill = isKill2;
      SrcReg = SrcReg2;
      SrcSubReg = SrcSubReg2;
    } else {
      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
                          isKill, ImplicitOp, LV, LIS))
        return nullptr;
    }

    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);
    if (ImplicitOp2.getReg() != 0)
      MIB.add(ImplicitOp2);

    // add %a, %b  ->  lea (%a, %b)
    NewMI =
        addRegReg(MIB, SrcReg, isKill, SrcSubReg, SrcReg2, isKill2, SrcSubReg2);

    // Add kills if classifyLEAReg created a new register.
    if (LV) {
      if (SrcReg2 != Src2.getReg())
        LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
      if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
    }
    NumRegOperands = 3;
    break;
  }
    CASE_NF(ADD8rr)
  case X86::ADD8rr_DB:
    Is8BitOp = true;
    [[fallthrough]];
    CASE_NF(ADD16rr)
  case X86::ADD16rr_DB:
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
    CASE_NF(ADD64ri32)
  case X86::ADD64ri32_DB:
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    // add $imm  ->  lea imm(%src)
    NewMI = addOffset(
        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
        MI.getOperand(2));
    break;
    CASE_NF(ADD32ri)
  case X86::ADD32ri_DB: {
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

    bool isKill;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
                        isKill, ImplicitOp, LV, LIS))
      return nullptr;

    MachineInstrBuilder MIB =
        BuildMI(MF, MI.getDebugLoc(), get(Opc))
            .add(Dest)
            .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
    if (ImplicitOp.getReg() != 0)
      MIB.add(ImplicitOp);

    NewMI = addOffset(MIB, MI.getOperand(2));

    // Add kills if classifyLEAReg created a new register.
    if (LV && SrcReg != Src.getReg())
      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
    break;
  }
    CASE_NF(ADD8ri)
  case X86::ADD8ri_DB:
    Is8BitOp = true;
    [[fallthrough]];
    CASE_NF(ADD16ri)
  case X86::ADD16ri_DB:
    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
    CASE_NF(SUB8ri)
    CASE_NF(SUB16ri)
    /// FIXME: Support these similar to ADD8ri/ADD16ri*.
    return nullptr;
    CASE_NF(SUB32ri) {
      // sub $imm  ->  lea -imm(%src), provided -imm does not overflow.
      if (!MI.getOperand(2).isImm())
        return nullptr;
      int64_t Imm = MI.getOperand(2).getImm();
      if (!isInt<32>(-Imm))
        return nullptr;

      assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
      unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;

      bool isKill;
      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
                          isKill, ImplicitOp, LV, LIS))
        return nullptr;

      MachineInstrBuilder MIB =
          BuildMI(MF, MI.getDebugLoc(), get(Opc))
              .add(Dest)
              .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
      if (ImplicitOp.getReg() != 0)
        MIB.add(ImplicitOp);

      NewMI = addOffset(MIB, -Imm);

      // Add kills if classifyLEAReg created a new register.
      if (LV && SrcReg != Src.getReg())
        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
      break;
    }

    CASE_NF(SUB64ri32) {
      // sub $imm  ->  lea -imm(%src), provided -imm does not overflow.
      if (!MI.getOperand(2).isImm())
        return nullptr;
      int64_t Imm = MI.getOperand(2).getImm();
      if (!isInt<32>(-Imm))
        return nullptr;

      assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");

      MachineInstrBuilder MIB =
          BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
      NewMI = addOffset(MIB, -Imm);
      break;
    }

  // AVX-512 masked moves (memory form). These are two-address because the
  // passthru operand is tied to the destination; the corresponding
  // blend-with-mask instruction computes the same result with the passthru as
  // a separate source, making it a true three-address instruction.
  case X86::VMOVDQU8Z128rmk:
  case X86::VMOVDQU8Z256rmk:
  case X86::VMOVDQU8Zrmk:
  case X86::VMOVDQU16Z128rmk:
  case X86::VMOVDQU16Z256rmk:
  case X86::VMOVDQU16Zrmk:
  case X86::VMOVDQU32Z128rmk:
  case X86::VMOVDQA32Z128rmk:
  case X86::VMOVDQU32Z256rmk:
  case X86::VMOVDQA32Z256rmk:
  case X86::VMOVDQU32Zrmk:
  case X86::VMOVDQA32Zrmk:
  case X86::VMOVDQU64Z128rmk:
  case X86::VMOVDQA64Z128rmk:
  case X86::VMOVDQU64Z256rmk:
  case X86::VMOVDQA64Z256rmk:
  case X86::VMOVDQU64Zrmk:
  case X86::VMOVDQA64Zrmk:
  case X86::VMOVUPDZ128rmk:
  case X86::VMOVAPDZ128rmk:
  case X86::VMOVUPDZ256rmk:
  case X86::VMOVAPDZ256rmk:
  case X86::VMOVUPDZrmk:
  case X86::VMOVAPDZrmk:
  case X86::VMOVUPSZ128rmk:
  case X86::VMOVAPSZ128rmk:
  case X86::VMOVUPSZ256rmk:
  case X86::VMOVAPSZ256rmk:
  case X86::VMOVUPSZrmk:
  case X86::VMOVAPSZrmk:
  case X86::VBROADCASTSDZ256rmk:
  case X86::VBROADCASTSDZrmk:
  case X86::VBROADCASTSSZ128rmk:
  case X86::VBROADCASTSSZ256rmk:
  case X86::VBROADCASTSSZrmk:
  case X86::VPBROADCASTDZ128rmk:
  case X86::VPBROADCASTDZ256rmk:
  case X86::VPBROADCASTDZrmk:
  case X86::VPBROADCASTQZ128rmk:
  case X86::VPBROADCASTQZ256rmk:
  case X86::VPBROADCASTQZrmk: {
    unsigned Opc;
    switch (MIOpc) {
    default:
      llvm_unreachable("Unreachable!");
    case X86::VMOVDQU8Z128rmk:
      Opc = X86::VPBLENDMBZ128rmk;
      break;
    case X86::VMOVDQU8Z256rmk:
      Opc = X86::VPBLENDMBZ256rmk;
      break;
    case X86::VMOVDQU8Zrmk:
      Opc = X86::VPBLENDMBZrmk;
      break;
    case X86::VMOVDQU16Z128rmk:
      Opc = X86::VPBLENDMWZ128rmk;
      break;
    case X86::VMOVDQU16Z256rmk:
      Opc = X86::VPBLENDMWZ256rmk;
      break;
    case X86::VMOVDQU16Zrmk:
      Opc = X86::VPBLENDMWZrmk;
      break;
    case X86::VMOVDQU32Z128rmk:
      Opc = X86::VPBLENDMDZ128rmk;
      break;
    case X86::VMOVDQU32Z256rmk:
      Opc = X86::VPBLENDMDZ256rmk;
      break;
    case X86::VMOVDQU32Zrmk:
      Opc = X86::VPBLENDMDZrmk;
      break;
    case X86::VMOVDQU64Z128rmk:
      Opc = X86::VPBLENDMQZ128rmk;
      break;
    case X86::VMOVDQU64Z256rmk:
      Opc = X86::VPBLENDMQZ256rmk;
      break;
    case X86::VMOVDQU64Zrmk:
      Opc = X86::VPBLENDMQZrmk;
      break;
    case X86::VMOVUPDZ128rmk:
      Opc = X86::VBLENDMPDZ128rmk;
      break;
    case X86::VMOVUPDZ256rmk:
      Opc = X86::VBLENDMPDZ256rmk;
      break;
    case X86::VMOVUPDZrmk:
      Opc = X86::VBLENDMPDZrmk;
      break;
    case X86::VMOVUPSZ128rmk:
      Opc = X86::VBLENDMPSZ128rmk;
      break;
    case X86::VMOVUPSZ256rmk:
      Opc = X86::VBLENDMPSZ256rmk;
      break;
    case X86::VMOVUPSZrmk:
      Opc = X86::VBLENDMPSZrmk;
      break;
    case X86::VMOVDQA32Z128rmk:
      Opc = X86::VPBLENDMDZ128rmk;
      break;
    case X86::VMOVDQA32Z256rmk:
      Opc = X86::VPBLENDMDZ256rmk;
      break;
    case X86::VMOVDQA32Zrmk:
      Opc = X86::VPBLENDMDZrmk;
      break;
    case X86::VMOVDQA64Z128rmk:
      Opc = X86::VPBLENDMQZ128rmk;
      break;
    case X86::VMOVDQA64Z256rmk:
      Opc = X86::VPBLENDMQZ256rmk;
      break;
    case X86::VMOVDQA64Zrmk:
      Opc = X86::VPBLENDMQZrmk;
      break;
    case X86::VMOVAPDZ128rmk:
      Opc = X86::VBLENDMPDZ128rmk;
      break;
    case X86::VMOVAPDZ256rmk:
      Opc = X86::VBLENDMPDZ256rmk;
      break;
    case X86::VMOVAPDZrmk:
      Opc = X86::VBLENDMPDZrmk;
      break;
    case X86::VMOVAPSZ128rmk:
      Opc = X86::VBLENDMPSZ128rmk;
      break;
    case X86::VMOVAPSZ256rmk:
      Opc = X86::VBLENDMPSZ256rmk;
      break;
    case X86::VMOVAPSZrmk:
      Opc = X86::VBLENDMPSZrmk;
      break;
    case X86::VBROADCASTSDZ256rmk:
      Opc = X86::VBLENDMPDZ256rmbk;
      break;
    case X86::VBROADCASTSDZrmk:
      Opc = X86::VBLENDMPDZrmbk;
      break;
    case X86::VBROADCASTSSZ128rmk:
      Opc = X86::VBLENDMPSZ128rmbk;
      break;
    case X86::VBROADCASTSSZ256rmk:
      Opc = X86::VBLENDMPSZ256rmbk;
      break;
    case X86::VBROADCASTSSZrmk:
      Opc = X86::VBLENDMPSZrmbk;
      break;
    case X86::VPBROADCASTDZ128rmk:
      Opc = X86::VPBLENDMDZ128rmbk;
      break;
    case X86::VPBROADCASTDZ256rmk:
      Opc = X86::VPBLENDMDZ256rmbk;
      break;
    case X86::VPBROADCASTDZrmk:
      Opc = X86::VPBLENDMDZrmbk;
      break;
    case X86::VPBROADCASTQZ128rmk:
      Opc = X86::VPBLENDMQZ128rmbk;
      break;
    case X86::VPBROADCASTQZ256rmk:
      Opc = X86::VPBLENDMQZ256rmbk;
      break;
    case X86::VPBROADCASTQZrmk:
      Opc = X86::VPBLENDMQZrmbk;
      break;
    }

    // Operand order for the blend: dest, mask, passthru (old tied src), then
    // the five memory-address operands.
    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                .add(Dest)
                .add(MI.getOperand(2))
                .add(Src)
                .add(MI.getOperand(3))
                .add(MI.getOperand(4))
                .add(MI.getOperand(5))
                .add(MI.getOperand(6))
                .add(MI.getOperand(7));
    NumRegOperands = 4;
    break;
  }

  // AVX-512 masked moves (register form); converted to blends exactly as the
  // memory forms above.
  case X86::VMOVDQU8Z128rrk:
  case X86::VMOVDQU8Z256rrk:
  case X86::VMOVDQU8Zrrk:
  case X86::VMOVDQU16Z128rrk:
  case X86::VMOVDQU16Z256rrk:
  case X86::VMOVDQU16Zrrk:
  case X86::VMOVDQU32Z128rrk:
  case X86::VMOVDQA32Z128rrk:
  case X86::VMOVDQU32Z256rrk:
  case X86::VMOVDQA32Z256rrk:
  case X86::VMOVDQU32Zrrk:
  case X86::VMOVDQA32Zrrk:
  case X86::VMOVDQU64Z128rrk:
  case X86::VMOVDQA64Z128rrk:
  case X86::VMOVDQU64Z256rrk:
  case X86::VMOVDQA64Z256rrk:
  case X86::VMOVDQU64Zrrk:
  case X86::VMOVDQA64Zrrk:
  case X86::VMOVUPDZ128rrk:
  case X86::VMOVAPDZ128rrk:
  case X86::VMOVUPDZ256rrk:
  case X86::VMOVAPDZ256rrk:
  case X86::VMOVUPDZrrk:
  case X86::VMOVAPDZrrk:
  case X86::VMOVUPSZ128rrk:
  case X86::VMOVAPSZ128rrk:
  case X86::VMOVUPSZ256rrk:
  case X86::VMOVAPSZ256rrk:
  case X86::VMOVUPSZrrk:
  case X86::VMOVAPSZrrk: {
    unsigned Opc;
    switch (MIOpc) {
    default:
      llvm_unreachable("Unreachable!");
    case X86::VMOVDQU8Z128rrk:
      Opc = X86::VPBLENDMBZ128rrk;
      break;
    case X86::VMOVDQU8Z256rrk:
      Opc = X86::VPBLENDMBZ256rrk;
      break;
    case X86::VMOVDQU8Zrrk:
      Opc = X86::VPBLENDMBZrrk;
      break;
    case X86::VMOVDQU16Z128rrk:
      Opc = X86::VPBLENDMWZ128rrk;
      break;
    case X86::VMOVDQU16Z256rrk:
      Opc = X86::VPBLENDMWZ256rrk;
      break;
    case X86::VMOVDQU16Zrrk:
      Opc = X86::VPBLENDMWZrrk;
      break;
    case X86::VMOVDQU32Z128rrk:
      Opc = X86::VPBLENDMDZ128rrk;
      break;
    case X86::VMOVDQU32Z256rrk:
      Opc = X86::VPBLENDMDZ256rrk;
      break;
    case X86::VMOVDQU32Zrrk:
      Opc = X86::VPBLENDMDZrrk;
      break;
    case X86::VMOVDQU64Z128rrk:
      Opc = X86::VPBLENDMQZ128rrk;
      break;
    case X86::VMOVDQU64Z256rrk:
      Opc = X86::VPBLENDMQZ256rrk;
      break;
    case X86::VMOVDQU64Zrrk:
      Opc = X86::VPBLENDMQZrrk;
      break;
    case X86::VMOVUPDZ128rrk:
      Opc = X86::VBLENDMPDZ128rrk;
      break;
    case X86::VMOVUPDZ256rrk:
      Opc = X86::VBLENDMPDZ256rrk;
      break;
    case X86::VMOVUPDZrrk:
      Opc = X86::VBLENDMPDZrrk;
      break;
    case X86::VMOVUPSZ128rrk:
      Opc = X86::VBLENDMPSZ128rrk;
      break;
    case X86::VMOVUPSZ256rrk:
      Opc = X86::VBLENDMPSZ256rrk;
      break;
    case X86::VMOVUPSZrrk:
      Opc = X86::VBLENDMPSZrrk;
      break;
    case X86::VMOVDQA32Z128rrk:
      Opc = X86::VPBLENDMDZ128rrk;
      break;
    case X86::VMOVDQA32Z256rrk:
      Opc = X86::VPBLENDMDZ256rrk;
      break;
    case X86::VMOVDQA32Zrrk:
      Opc = X86::VPBLENDMDZrrk;
      break;
    case X86::VMOVDQA64Z128rrk:
      Opc = X86::VPBLENDMQZ128rrk;
      break;
    case X86::VMOVDQA64Z256rrk:
      Opc = X86::VPBLENDMQZ256rrk;
      break;
    case X86::VMOVDQA64Zrrk:
      Opc = X86::VPBLENDMQZrrk;
      break;
    case X86::VMOVAPDZ128rrk:
      Opc = X86::VBLENDMPDZ128rrk;
      break;
    case X86::VMOVAPDZ256rrk:
      Opc = X86::VBLENDMPDZ256rrk;
      break;
    case X86::VMOVAPDZrrk:
      Opc = X86::VBLENDMPDZrrk;
      break;
    case X86::VMOVAPSZ128rrk:
      Opc = X86::VBLENDMPSZ128rrk;
      break;
    case X86::VMOVAPSZ256rrk:
      Opc = X86::VBLENDMPSZ256rrk;
      break;
    case X86::VMOVAPSZrrk:
      Opc = X86::VBLENDMPSZrrk;
      break;
    }

    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
                .add(Dest)
                .add(MI.getOperand(2))
                .add(Src)
                .add(MI.getOperand(3));
    NumRegOperands = 4;
    break;
  }
  }
#undef CASE_NF

  if (!NewMI)
    return nullptr;

  if (LV) { // Update live variables
    for (unsigned I = 0; I < NumRegOperands; ++I) {
      MachineOperand &Op = MI.getOperand(I);
      if (Op.isReg() && (Op.isDead() || Op.isKill()))
        LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
    }
  }

  MachineBasicBlock &MBB = *MI.getParent();
  MBB.insert(MI.getIterator(), NewMI); // Insert the new inst

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
    // Query intervals for registers classifyLEAReg may have created --
    // presumably to force their on-demand computation (TODO: confirm).
    if (SrcReg)
      LIS->getInterval(SrcReg);
    if (SrcReg2)
      LIS->getInterval(SrcReg2);
  }

  return NewMI;
}
2056
2057/// This determines which of three possible cases of a three source commute
2058/// the source indexes correspond to taking into account any mask operands.
2059/// All prevents commuting a passthru operand. Returns -1 if the commute isn't
2060/// possible.
2061/// Case 0 - Possible to commute the first and second operands.
2062/// Case 1 - Possible to commute the first and third operands.
2063/// Case 2 - Possible to commute the second and third operands.
2064static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
2065 unsigned SrcOpIdx2) {
2066 // Put the lowest index to SrcOpIdx1 to simplify the checks below.
2067 if (SrcOpIdx1 > SrcOpIdx2)
2068 std::swap(SrcOpIdx1, SrcOpIdx2);
2069
2070 unsigned Op1 = 1, Op2 = 2, Op3 = 3;
2071 if (X86II::isKMasked(TSFlags)) {
2072 Op2++;
2073 Op3++;
2074 }
2075
2076 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
2077 return 0;
2078 if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
2079 return 1;
2080 if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
2081 return 2;
2082 llvm_unreachable("Unknown three src commute case.");
2083}
2084
2086 const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
2087 const X86InstrFMA3Group &FMA3Group) const {
2088
2089 unsigned Opc = MI.getOpcode();
2090
2091 // TODO: Commuting the 1st operand of FMA*_Int requires some additional
2092 // analysis. The commute optimization is legal only if all users of FMA*_Int
2093 // use only the lowest element of the FMA*_Int instruction. Such analysis are
2094 // not implemented yet. So, just return 0 in that case.
2095 // When such analysis are available this place will be the right place for
2096 // calling it.
2097 assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
2098 "Intrinsic instructions can't commute operand 1");
2099
2100 // Determine which case this commute is or if it can't be done.
2101 unsigned Case =
2102 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2103 assert(Case < 3 && "Unexpected case number!");
2104
2105 // Define the FMA forms mapping array that helps to map input FMA form
2106 // to output FMA form to preserve the operation semantics after
2107 // commuting the operands.
2108 const unsigned Form132Index = 0;
2109 const unsigned Form213Index = 1;
2110 const unsigned Form231Index = 2;
2111 static const unsigned FormMapping[][3] = {
2112 // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
2113 // FMA132 A, C, b; ==> FMA231 C, A, b;
2114 // FMA213 B, A, c; ==> FMA213 A, B, c;
2115 // FMA231 C, A, b; ==> FMA132 A, C, b;
2116 {Form231Index, Form213Index, Form132Index},
2117 // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
2118 // FMA132 A, c, B; ==> FMA132 B, c, A;
2119 // FMA213 B, a, C; ==> FMA231 C, a, B;
2120 // FMA231 C, a, B; ==> FMA213 B, a, C;
2121 {Form132Index, Form231Index, Form213Index},
2122 // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
2123 // FMA132 a, C, B; ==> FMA213 a, B, C;
2124 // FMA213 b, A, C; ==> FMA132 b, C, A;
2125 // FMA231 c, A, B; ==> FMA231 c, B, A;
2126 {Form213Index, Form132Index, Form231Index}};
2127
2128 unsigned FMAForms[3];
2129 FMAForms[0] = FMA3Group.get132Opcode();
2130 FMAForms[1] = FMA3Group.get213Opcode();
2131 FMAForms[2] = FMA3Group.get231Opcode();
2132
2133 // Everything is ready, just adjust the FMA opcode and return it.
2134 for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
2135 if (Opc == FMAForms[FormIndex])
2136 return FMAForms[FormMapping[Case][FormIndex]];
2137
2138 llvm_unreachable("Illegal FMA3 format");
2139}
2140
2141static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
2142 unsigned SrcOpIdx2) {
2143 // Determine which case this commute is or if it can't be done.
2144 unsigned Case =
2145 getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2146 assert(Case < 3 && "Unexpected case value!");
2147
2148 // For each case we need to swap two pairs of bits in the final immediate.
2149 static const uint8_t SwapMasks[3][4] = {
2150 {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
2151 {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
2152 {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
2153 };
2154
2155 uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2156 // Clear out the bits we are swapping.
2157 uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
2158 SwapMasks[Case][2] | SwapMasks[Case][3]);
2159 // If the immediate had a bit of the pair set, then set the opposite bit.
2160 if (Imm & SwapMasks[Case][0])
2161 NewImm |= SwapMasks[Case][1];
2162 if (Imm & SwapMasks[Case][1])
2163 NewImm |= SwapMasks[Case][0];
2164 if (Imm & SwapMasks[Case][2])
2165 NewImm |= SwapMasks[Case][3];
2166 if (Imm & SwapMasks[Case][3])
2167 NewImm |= SwapMasks[Case][2];
2168 MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
2169}
2170
2171// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
2172// commuted.
2173static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
2174#define VPERM_CASES(Suffix) \
2175 case X86::VPERMI2##Suffix##Z128rr: \
2176 case X86::VPERMT2##Suffix##Z128rr: \
2177 case X86::VPERMI2##Suffix##Z256rr: \
2178 case X86::VPERMT2##Suffix##Z256rr: \
2179 case X86::VPERMI2##Suffix##Zrr: \
2180 case X86::VPERMT2##Suffix##Zrr: \
2181 case X86::VPERMI2##Suffix##Z128rm: \
2182 case X86::VPERMT2##Suffix##Z128rm: \
2183 case X86::VPERMI2##Suffix##Z256rm: \
2184 case X86::VPERMT2##Suffix##Z256rm: \
2185 case X86::VPERMI2##Suffix##Zrm: \
2186 case X86::VPERMT2##Suffix##Zrm: \
2187 case X86::VPERMI2##Suffix##Z128rrkz: \
2188 case X86::VPERMT2##Suffix##Z128rrkz: \
2189 case X86::VPERMI2##Suffix##Z256rrkz: \
2190 case X86::VPERMT2##Suffix##Z256rrkz: \
2191 case X86::VPERMI2##Suffix##Zrrkz: \
2192 case X86::VPERMT2##Suffix##Zrrkz: \
2193 case X86::VPERMI2##Suffix##Z128rmkz: \
2194 case X86::VPERMT2##Suffix##Z128rmkz: \
2195 case X86::VPERMI2##Suffix##Z256rmkz: \
2196 case X86::VPERMT2##Suffix##Z256rmkz: \
2197 case X86::VPERMI2##Suffix##Zrmkz: \
2198 case X86::VPERMT2##Suffix##Zrmkz:
2199
2200#define VPERM_CASES_BROADCAST(Suffix) \
2201 VPERM_CASES(Suffix) \
2202 case X86::VPERMI2##Suffix##Z128rmb: \
2203 case X86::VPERMT2##Suffix##Z128rmb: \
2204 case X86::VPERMI2##Suffix##Z256rmb: \
2205 case X86::VPERMT2##Suffix##Z256rmb: \
2206 case X86::VPERMI2##Suffix##Zrmb: \
2207 case X86::VPERMT2##Suffix##Zrmb: \
2208 case X86::VPERMI2##Suffix##Z128rmbkz: \
2209 case X86::VPERMT2##Suffix##Z128rmbkz: \
2210 case X86::VPERMI2##Suffix##Z256rmbkz: \
2211 case X86::VPERMT2##Suffix##Z256rmbkz: \
2212 case X86::VPERMI2##Suffix##Zrmbkz: \
2213 case X86::VPERMT2##Suffix##Zrmbkz:
2214
2215 switch (Opcode) {
2216 default:
2217 return false;
2218 VPERM_CASES(B)
2223 VPERM_CASES(W)
2224 return true;
2225 }
2226#undef VPERM_CASES_BROADCAST
2227#undef VPERM_CASES
2228}
2229
2230// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
2231// from the I opcode to the T opcode and vice versa.
2232static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
2233#define VPERM_CASES(Orig, New) \
2234 case X86::Orig##Z128rr: \
2235 return X86::New##Z128rr; \
2236 case X86::Orig##Z128rrkz: \
2237 return X86::New##Z128rrkz; \
2238 case X86::Orig##Z128rm: \
2239 return X86::New##Z128rm; \
2240 case X86::Orig##Z128rmkz: \
2241 return X86::New##Z128rmkz; \
2242 case X86::Orig##Z256rr: \
2243 return X86::New##Z256rr; \
2244 case X86::Orig##Z256rrkz: \
2245 return X86::New##Z256rrkz; \
2246 case X86::Orig##Z256rm: \
2247 return X86::New##Z256rm; \
2248 case X86::Orig##Z256rmkz: \
2249 return X86::New##Z256rmkz; \
2250 case X86::Orig##Zrr: \
2251 return X86::New##Zrr; \
2252 case X86::Orig##Zrrkz: \
2253 return X86::New##Zrrkz; \
2254 case X86::Orig##Zrm: \
2255 return X86::New##Zrm; \
2256 case X86::Orig##Zrmkz: \
2257 return X86::New##Zrmkz;
2258
2259#define VPERM_CASES_BROADCAST(Orig, New) \
2260 VPERM_CASES(Orig, New) \
2261 case X86::Orig##Z128rmb: \
2262 return X86::New##Z128rmb; \
2263 case X86::Orig##Z128rmbkz: \
2264 return X86::New##Z128rmbkz; \
2265 case X86::Orig##Z256rmb: \
2266 return X86::New##Z256rmb; \
2267 case X86::Orig##Z256rmbkz: \
2268 return X86::New##Z256rmbkz; \
2269 case X86::Orig##Zrmb: \
2270 return X86::New##Zrmb; \
2271 case X86::Orig##Zrmbkz: \
2272 return X86::New##Zrmbkz;
2273
2274 switch (Opcode) {
2275 VPERM_CASES(VPERMI2B, VPERMT2B)
2276 VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2277 VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2278 VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2279 VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2280 VPERM_CASES(VPERMI2W, VPERMT2W)
2281 VPERM_CASES(VPERMT2B, VPERMI2B)
2282 VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2283 VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2284 VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2285 VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2286 VPERM_CASES(VPERMT2W, VPERMI2W)
2287 }
2288
2289 llvm_unreachable("Unreachable!");
2290#undef VPERM_CASES_BROADCAST
2291#undef VPERM_CASES
2292}
2293
2295 unsigned OpIdx1,
2296 unsigned OpIdx2) const {
2297 auto CloneIfNew = [&](MachineInstr &MI) {
2298 return std::exchange(NewMI, false)
2299 ? MI.getParent()->getParent()->CloneMachineInstr(&MI)
2300 : &MI;
2301 };
2302 MachineInstr *WorkingMI = nullptr;
2303 unsigned Opc = MI.getOpcode();
2304
2305#define CASE_ND(OP) \
2306 case X86::OP: \
2307 case X86::OP##_ND:
2308
2309 switch (Opc) {
2310 // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
2311 CASE_ND(SHRD16rri8)
2312 CASE_ND(SHLD16rri8)
2313 CASE_ND(SHRD32rri8)
2314 CASE_ND(SHLD32rri8)
2315 CASE_ND(SHRD64rri8)
2316 CASE_ND(SHLD64rri8) {
2317 unsigned Size;
2318 switch (Opc) {
2319 default:
2320 llvm_unreachable("Unreachable!");
2321#define FROM_TO_SIZE(A, B, S) \
2322 case X86::A: \
2323 Opc = X86::B; \
2324 Size = S; \
2325 break; \
2326 case X86::A##_ND: \
2327 Opc = X86::B##_ND; \
2328 Size = S; \
2329 break; \
2330 case X86::B: \
2331 Opc = X86::A; \
2332 Size = S; \
2333 break; \
2334 case X86::B##_ND: \
2335 Opc = X86::A##_ND; \
2336 Size = S; \
2337 break;
2338
2339 FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
2340 FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
2341 FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
2342#undef FROM_TO_SIZE
2343 }
2344 WorkingMI = CloneIfNew(MI);
2345 WorkingMI->setDesc(get(Opc));
2346 WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
2347 break;
2348 }
2349 case X86::PFSUBrr:
2350 case X86::PFSUBRrr:
2351 // PFSUB x, y: x = x - y
2352 // PFSUBR x, y: x = y - x
2353 WorkingMI = CloneIfNew(MI);
2354 WorkingMI->setDesc(
2355 get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr));
2356 break;
2357 case X86::BLENDPDrri:
2358 case X86::BLENDPSrri:
2359 case X86::PBLENDWrri:
2360 case X86::VBLENDPDrri:
2361 case X86::VBLENDPSrri:
2362 case X86::VBLENDPDYrri:
2363 case X86::VBLENDPSYrri:
2364 case X86::VPBLENDDrri:
2365 case X86::VPBLENDWrri:
2366 case X86::VPBLENDDYrri:
2367 case X86::VPBLENDWYrri: {
2368 int8_t Mask;
2369 switch (Opc) {
2370 default:
2371 llvm_unreachable("Unreachable!");
2372 case X86::BLENDPDrri:
2373 Mask = (int8_t)0x03;
2374 break;
2375 case X86::BLENDPSrri:
2376 Mask = (int8_t)0x0F;
2377 break;
2378 case X86::PBLENDWrri:
2379 Mask = (int8_t)0xFF;
2380 break;
2381 case X86::VBLENDPDrri:
2382 Mask = (int8_t)0x03;
2383 break;
2384 case X86::VBLENDPSrri:
2385 Mask = (int8_t)0x0F;
2386 break;
2387 case X86::VBLENDPDYrri:
2388 Mask = (int8_t)0x0F;
2389 break;
2390 case X86::VBLENDPSYrri:
2391 Mask = (int8_t)0xFF;
2392 break;
2393 case X86::VPBLENDDrri:
2394 Mask = (int8_t)0x0F;
2395 break;
2396 case X86::VPBLENDWrri:
2397 Mask = (int8_t)0xFF;
2398 break;
2399 case X86::VPBLENDDYrri:
2400 Mask = (int8_t)0xFF;
2401 break;
2402 case X86::VPBLENDWYrri:
2403 Mask = (int8_t)0xFF;
2404 break;
2405 }
2406 // Only the least significant bits of Imm are used.
2407 // Using int8_t to ensure it will be sign extended to the int64_t that
2408 // setImm takes in order to match isel behavior.
2409 int8_t Imm = MI.getOperand(3).getImm() & Mask;
2410 WorkingMI = CloneIfNew(MI);
2411 WorkingMI->getOperand(3).setImm(Mask ^ Imm);
2412 break;
2413 }
2414 case X86::INSERTPSrri:
2415 case X86::VINSERTPSrri:
2416 case X86::VINSERTPSZrri: {
2417 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2418 unsigned ZMask = Imm & 15;
2419 unsigned DstIdx = (Imm >> 4) & 3;
2420 unsigned SrcIdx = (Imm >> 6) & 3;
2421
2422 // We can commute insertps if we zero 2 of the elements, the insertion is
2423 // "inline" and we don't override the insertion with a zero.
2424 if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2425 llvm::popcount(ZMask) == 2) {
2426 unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
2427 assert(AltIdx < 4 && "Illegal insertion index");
2428 unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2429 WorkingMI = CloneIfNew(MI);
2430 WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2431 break;
2432 }
2433 return nullptr;
2434 }
2435 case X86::MOVSDrr:
2436 case X86::MOVSSrr:
2437 case X86::VMOVSDrr:
2438 case X86::VMOVSSrr: {
2439 // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2440 if (Subtarget.hasSSE41()) {
2441 unsigned Mask;
2442 switch (Opc) {
2443 default:
2444 llvm_unreachable("Unreachable!");
2445 case X86::MOVSDrr:
2446 Opc = X86::BLENDPDrri;
2447 Mask = 0x02;
2448 break;
2449 case X86::MOVSSrr:
2450 Opc = X86::BLENDPSrri;
2451 Mask = 0x0E;
2452 break;
2453 case X86::VMOVSDrr:
2454 Opc = X86::VBLENDPDrri;
2455 Mask = 0x02;
2456 break;
2457 case X86::VMOVSSrr:
2458 Opc = X86::VBLENDPSrri;
2459 Mask = 0x0E;
2460 break;
2461 }
2462
2463 WorkingMI = CloneIfNew(MI);
2464 WorkingMI->setDesc(get(Opc));
2465 WorkingMI->addOperand(MachineOperand::CreateImm(Mask));
2466 break;
2467 }
2468
2469 assert(Opc == X86::MOVSDrr && "Only MOVSD can commute to SHUFPD");
2470 WorkingMI = CloneIfNew(MI);
2471 WorkingMI->setDesc(get(X86::SHUFPDrri));
2472 WorkingMI->addOperand(MachineOperand::CreateImm(0x02));
2473 break;
2474 }
2475 case X86::SHUFPDrri: {
2476 // Commute to MOVSD.
2477 assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2478 WorkingMI = CloneIfNew(MI);
2479 WorkingMI->setDesc(get(X86::MOVSDrr));
2480 WorkingMI->removeOperand(3);
2481 break;
2482 }
2483 case X86::PCLMULQDQrri:
2484 case X86::VPCLMULQDQrri:
2485 case X86::VPCLMULQDQYrri:
2486 case X86::VPCLMULQDQZrri:
2487 case X86::VPCLMULQDQZ128rri:
2488 case X86::VPCLMULQDQZ256rri: {
2489 // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2490 // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2491 unsigned Imm = MI.getOperand(3).getImm();
2492 unsigned Src1Hi = Imm & 0x01;
2493 unsigned Src2Hi = Imm & 0x10;
2494 WorkingMI = CloneIfNew(MI);
2495 WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2496 break;
2497 }
2498 case X86::VPCMPBZ128rri:
2499 case X86::VPCMPUBZ128rri:
2500 case X86::VPCMPBZ256rri:
2501 case X86::VPCMPUBZ256rri:
2502 case X86::VPCMPBZrri:
2503 case X86::VPCMPUBZrri:
2504 case X86::VPCMPDZ128rri:
2505 case X86::VPCMPUDZ128rri:
2506 case X86::VPCMPDZ256rri:
2507 case X86::VPCMPUDZ256rri:
2508 case X86::VPCMPDZrri:
2509 case X86::VPCMPUDZrri:
2510 case X86::VPCMPQZ128rri:
2511 case X86::VPCMPUQZ128rri:
2512 case X86::VPCMPQZ256rri:
2513 case X86::VPCMPUQZ256rri:
2514 case X86::VPCMPQZrri:
2515 case X86::VPCMPUQZrri:
2516 case X86::VPCMPWZ128rri:
2517 case X86::VPCMPUWZ128rri:
2518 case X86::VPCMPWZ256rri:
2519 case X86::VPCMPUWZ256rri:
2520 case X86::VPCMPWZrri:
2521 case X86::VPCMPUWZrri:
2522 case X86::VPCMPBZ128rrik:
2523 case X86::VPCMPUBZ128rrik:
2524 case X86::VPCMPBZ256rrik:
2525 case X86::VPCMPUBZ256rrik:
2526 case X86::VPCMPBZrrik:
2527 case X86::VPCMPUBZrrik:
2528 case X86::VPCMPDZ128rrik:
2529 case X86::VPCMPUDZ128rrik:
2530 case X86::VPCMPDZ256rrik:
2531 case X86::VPCMPUDZ256rrik:
2532 case X86::VPCMPDZrrik:
2533 case X86::VPCMPUDZrrik:
2534 case X86::VPCMPQZ128rrik:
2535 case X86::VPCMPUQZ128rrik:
2536 case X86::VPCMPQZ256rrik:
2537 case X86::VPCMPUQZ256rrik:
2538 case X86::VPCMPQZrrik:
2539 case X86::VPCMPUQZrrik:
2540 case X86::VPCMPWZ128rrik:
2541 case X86::VPCMPUWZ128rrik:
2542 case X86::VPCMPWZ256rrik:
2543 case X86::VPCMPUWZ256rrik:
2544 case X86::VPCMPWZrrik:
2545 case X86::VPCMPUWZrrik:
2546 WorkingMI = CloneIfNew(MI);
2547 // Flip comparison mode immediate (if necessary).
2548 WorkingMI->getOperand(MI.getNumOperands() - 1)
2550 MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7));
2551 break;
2552 case X86::VPCOMBri:
2553 case X86::VPCOMUBri:
2554 case X86::VPCOMDri:
2555 case X86::VPCOMUDri:
2556 case X86::VPCOMQri:
2557 case X86::VPCOMUQri:
2558 case X86::VPCOMWri:
2559 case X86::VPCOMUWri:
2560 WorkingMI = CloneIfNew(MI);
2561 // Flip comparison mode immediate (if necessary).
2562 WorkingMI->getOperand(3).setImm(
2563 X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7));
2564 break;
2565 case X86::VCMPSDZrri:
2566 case X86::VCMPSSZrri:
2567 case X86::VCMPPDZrri:
2568 case X86::VCMPPSZrri:
2569 case X86::VCMPSHZrri:
2570 case X86::VCMPPHZrri:
2571 case X86::VCMPPHZ128rri:
2572 case X86::VCMPPHZ256rri:
2573 case X86::VCMPPDZ128rri:
2574 case X86::VCMPPSZ128rri:
2575 case X86::VCMPPDZ256rri:
2576 case X86::VCMPPSZ256rri:
2577 case X86::VCMPPDZrrik:
2578 case X86::VCMPPSZrrik:
2579 case X86::VCMPPHZrrik:
2580 case X86::VCMPPDZ128rrik:
2581 case X86::VCMPPSZ128rrik:
2582 case X86::VCMPPHZ128rrik:
2583 case X86::VCMPPDZ256rrik:
2584 case X86::VCMPPSZ256rrik:
2585 case X86::VCMPPHZ256rrik:
2586 WorkingMI = CloneIfNew(MI);
2587 WorkingMI->getOperand(MI.getNumExplicitOperands() - 1)
2589 MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f));
2590 break;
2591 case X86::VPERM2F128rri:
2592 case X86::VPERM2I128rri:
2593 // Flip permute source immediate.
2594 // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2595 // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
2596 WorkingMI = CloneIfNew(MI);
2597 WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22);
2598 break;
2599 case X86::MOVHLPSrr:
2600 case X86::UNPCKHPDrr:
2601 case X86::VMOVHLPSrr:
2602 case X86::VUNPCKHPDrr:
2603 case X86::VMOVHLPSZrr:
2604 case X86::VUNPCKHPDZ128rr:
2605 assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2606
2607 switch (Opc) {
2608 default:
2609 llvm_unreachable("Unreachable!");
2610 case X86::MOVHLPSrr:
2611 Opc = X86::UNPCKHPDrr;
2612 break;
2613 case X86::UNPCKHPDrr:
2614 Opc = X86::MOVHLPSrr;
2615 break;
2616 case X86::VMOVHLPSrr:
2617 Opc = X86::VUNPCKHPDrr;
2618 break;
2619 case X86::VUNPCKHPDrr:
2620 Opc = X86::VMOVHLPSrr;
2621 break;
2622 case X86::VMOVHLPSZrr:
2623 Opc = X86::VUNPCKHPDZ128rr;
2624 break;
2625 case X86::VUNPCKHPDZ128rr:
2626 Opc = X86::VMOVHLPSZrr;
2627 break;
2628 }
2629 WorkingMI = CloneIfNew(MI);
2630 WorkingMI->setDesc(get(Opc));
2631 break;
2632 CASE_ND(CMOV16rr)
2633 CASE_ND(CMOV32rr)
2634 CASE_ND(CMOV64rr) {
2635 WorkingMI = CloneIfNew(MI);
2636 unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2637 X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2639 break;
2640 }
2641 case X86::VPTERNLOGDZrri:
2642 case X86::VPTERNLOGDZrmi:
2643 case X86::VPTERNLOGDZ128rri:
2644 case X86::VPTERNLOGDZ128rmi:
2645 case X86::VPTERNLOGDZ256rri:
2646 case X86::VPTERNLOGDZ256rmi:
2647 case X86::VPTERNLOGQZrri:
2648 case X86::VPTERNLOGQZrmi:
2649 case X86::VPTERNLOGQZ128rri:
2650 case X86::VPTERNLOGQZ128rmi:
2651 case X86::VPTERNLOGQZ256rri:
2652 case X86::VPTERNLOGQZ256rmi:
2653 case X86::VPTERNLOGDZrrik:
2654 case X86::VPTERNLOGDZ128rrik:
2655 case X86::VPTERNLOGDZ256rrik:
2656 case X86::VPTERNLOGQZrrik:
2657 case X86::VPTERNLOGQZ128rrik:
2658 case X86::VPTERNLOGQZ256rrik:
2659 case X86::VPTERNLOGDZrrikz:
2660 case X86::VPTERNLOGDZrmikz:
2661 case X86::VPTERNLOGDZ128rrikz:
2662 case X86::VPTERNLOGDZ128rmikz:
2663 case X86::VPTERNLOGDZ256rrikz:
2664 case X86::VPTERNLOGDZ256rmikz:
2665 case X86::VPTERNLOGQZrrikz:
2666 case X86::VPTERNLOGQZrmikz:
2667 case X86::VPTERNLOGQZ128rrikz:
2668 case X86::VPTERNLOGQZ128rmikz:
2669 case X86::VPTERNLOGQZ256rrikz:
2670 case X86::VPTERNLOGQZ256rmikz:
2671 case X86::VPTERNLOGDZ128rmbi:
2672 case X86::VPTERNLOGDZ256rmbi:
2673 case X86::VPTERNLOGDZrmbi:
2674 case X86::VPTERNLOGQZ128rmbi:
2675 case X86::VPTERNLOGQZ256rmbi:
2676 case X86::VPTERNLOGQZrmbi:
2677 case X86::VPTERNLOGDZ128rmbikz:
2678 case X86::VPTERNLOGDZ256rmbikz:
2679 case X86::VPTERNLOGDZrmbikz:
2680 case X86::VPTERNLOGQZ128rmbikz:
2681 case X86::VPTERNLOGQZ256rmbikz:
2682 case X86::VPTERNLOGQZrmbikz: {
2683 WorkingMI = CloneIfNew(MI);
2684 commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2);
2685 break;
2686 }
2687 default:
2689 WorkingMI = CloneIfNew(MI);
2691 break;
2692 }
2693
2694 if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) {
2695 WorkingMI = CloneIfNew(MI);
2696 WorkingMI->setDesc(
2697 get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group)));
2698 break;
2699 }
2700 }
2701 return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2702}
2703
/// Find a pair of commutable operand indices for a three-source instruction
/// (FMA3, VPTERNLOG, VPERMI2/T2, ...), honoring an optional k-mask operand
/// and an optional folded memory operand. Either of SrcOpIdx1/SrcOpIdx2 may
/// come in as CommuteAnyOperandIndex, meaning this method is free to choose
/// it; on success both are filled in via fixCommutedOpIndices. Returns false
/// if no legal commute exists for the requested indices.
bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
                                                 unsigned &SrcOpIdx1,
                                                 unsigned &SrcOpIdx2,
                                                 bool IsIntrinsic) const {
  uint64_t TSFlags = MI.getDesc().TSFlags;

  unsigned FirstCommutableVecOp = 1;
  unsigned LastCommutableVecOp = 3;
  // Sentinel: no real operand index can equal -1U when there is no k-mask.
  unsigned KMaskOp = -1U;
  if (X86II::isKMasked(TSFlags)) {
    // For k-zero-masked operations it is Ok to commute the first vector
    // operand. Unless this is an intrinsic instruction.
    // For regular k-masked operations a conservative choice is done as the
    // elements of the first vector operand, for which the corresponding bit
    // in the k-mask operand is set to 0, are copied to the result of the
    // instruction.
    // TODO/FIXME: The commute still may be legal if it is known that the
    // k-mask operand is set to either all ones or all zeroes.
    // It is also Ok to commute the 1st operand if all users of MI use only
    // the elements enabled by the k-mask operand. For example,
    //   v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
    //                                      //         : v1[i];
    //   VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
    //                                  // Ok, to commute v1 in FMADD213PSZrk.

    // The k-mask operand has index = 2 for masked and zero-masked operations.
    KMaskOp = 2;

    // The operand with index = 1 is used as a source for those elements for
    // which the corresponding bit in the k-mask is set to 0.
    if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
      FirstCommutableVecOp = 3;

    // The k-mask shifts the remaining sources up by one operand slot.
    LastCommutableVecOp++;
  } else if (IsIntrinsic) {
    // Commuting the first operand of an intrinsic instruction isn't possible
    // unless we can prove that only the lowest element of the result is used.
    FirstCommutableVecOp = 2;
  }

  // A folded memory operand (always the last source) cannot be commuted.
  if (isMem(MI, LastCommutableVecOp))
    LastCommutableVecOp--;

  // Only the first RegOpsNum operands are commutable.
  // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
  // that the operand is not specified/fixed.
  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
      (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
       SrcOpIdx1 == KMaskOp))
    return false;
  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
      (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
       SrcOpIdx2 == KMaskOp))
    return false;

  // Look for two different register operands assumed to be commutable
  // regardless of the FMA opcode. The FMA opcode is adjusted later.
  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
      SrcOpIdx2 == CommuteAnyOperandIndex) {
    unsigned CommutableOpIdx2 = SrcOpIdx2;

    // At least one of operands to be commuted is not specified and
    // this method is free to choose appropriate commutable operands.
    if (SrcOpIdx1 == SrcOpIdx2)
      // Both of operands are not fixed. By default set one of commutable
      // operands to the last register operand of the instruction.
      CommutableOpIdx2 = LastCommutableVecOp;
    else if (SrcOpIdx2 == CommuteAnyOperandIndex)
      // Only one of operands is not fixed.
      CommutableOpIdx2 = SrcOpIdx1;

    // CommutableOpIdx2 is well defined now. Let's choose another commutable
    // operand and assign its index to CommutableOpIdx1.
    Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();

    // Search downward from the last commutable source; FirstCommutableVecOp
    // is always >= 1, so this unsigned loop cannot wrap around.
    unsigned CommutableOpIdx1;
    for (CommutableOpIdx1 = LastCommutableVecOp;
         CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
      // Just ignore and skip the k-mask operand.
      if (CommutableOpIdx1 == KMaskOp)
        continue;

      // The commuted operands must have different registers.
      // Otherwise, the commute transformation does not change anything and
      // is useless then.
      if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
        break;
    }

    // No appropriate commutable operands were found.
    if (CommutableOpIdx1 < FirstCommutableVecOp)
      return false;

    // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
    // to return those values.
    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
                              CommutableOpIdx2))
      return false;
  }

  return true;
}
2806
2808 unsigned &SrcOpIdx1,
2809 unsigned &SrcOpIdx2) const {
2810 const MCInstrDesc &Desc = MI.getDesc();
2811 if (!Desc.isCommutable())
2812 return false;
2813
2814 switch (MI.getOpcode()) {
2815 case X86::CMPSDrri:
2816 case X86::CMPSSrri:
2817 case X86::CMPPDrri:
2818 case X86::CMPPSrri:
2819 case X86::VCMPSDrri:
2820 case X86::VCMPSSrri:
2821 case X86::VCMPPDrri:
2822 case X86::VCMPPSrri:
2823 case X86::VCMPPDYrri:
2824 case X86::VCMPPSYrri:
2825 case X86::VCMPSDZrri:
2826 case X86::VCMPSSZrri:
2827 case X86::VCMPPDZrri:
2828 case X86::VCMPPSZrri:
2829 case X86::VCMPSHZrri:
2830 case X86::VCMPPHZrri:
2831 case X86::VCMPPHZ128rri:
2832 case X86::VCMPPHZ256rri:
2833 case X86::VCMPPDZ128rri:
2834 case X86::VCMPPSZ128rri:
2835 case X86::VCMPPDZ256rri:
2836 case X86::VCMPPSZ256rri:
2837 case X86::VCMPPDZrrik:
2838 case X86::VCMPPSZrrik:
2839 case X86::VCMPPHZrrik:
2840 case X86::VCMPPDZ128rrik:
2841 case X86::VCMPPSZ128rrik:
2842 case X86::VCMPPHZ128rrik:
2843 case X86::VCMPPDZ256rrik:
2844 case X86::VCMPPSZ256rrik:
2845 case X86::VCMPPHZ256rrik: {
2846 unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2847
2848 // Float comparison can be safely commuted for
2849 // Ordered/Unordered/Equal/NotEqual tests
2850 unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2851 switch (Imm) {
2852 default:
2853 // EVEX versions can be commuted.
2854 if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2855 break;
2856 return false;
2857 case 0x00: // EQUAL
2858 case 0x03: // UNORDERED
2859 case 0x04: // NOT EQUAL
2860 case 0x07: // ORDERED
2861 break;
2862 }
2863
2864 // The indices of the commutable operands are 1 and 2 (or 2 and 3
2865 // when masked).
2866 // Assign them to the returned operand indices here.
2867 return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2868 2 + OpOffset);
2869 }
2870 case X86::MOVSSrr:
2871 // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
2872 // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
2873 // AVX implies sse4.1.
2874 if (Subtarget.hasSSE41())
2875 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2876 return false;
2877 case X86::SHUFPDrri:
2878 // We can commute this to MOVSD.
2879 if (MI.getOperand(3).getImm() == 0x02)
2880 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2881 return false;
2882 case X86::MOVHLPSrr:
2883 case X86::UNPCKHPDrr:
2884 case X86::VMOVHLPSrr:
2885 case X86::VUNPCKHPDrr:
2886 case X86::VMOVHLPSZrr:
2887 case X86::VUNPCKHPDZ128rr:
2888 if (Subtarget.hasSSE2())
2889 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2890 return false;
2891 case X86::VPTERNLOGDZrri:
2892 case X86::VPTERNLOGDZrmi:
2893 case X86::VPTERNLOGDZ128rri:
2894 case X86::VPTERNLOGDZ128rmi:
2895 case X86::VPTERNLOGDZ256rri:
2896 case X86::VPTERNLOGDZ256rmi:
2897 case X86::VPTERNLOGQZrri:
2898 case X86::VPTERNLOGQZrmi:
2899 case X86::VPTERNLOGQZ128rri:
2900 case X86::VPTERNLOGQZ128rmi:
2901 case X86::VPTERNLOGQZ256rri:
2902 case X86::VPTERNLOGQZ256rmi:
2903 case X86::VPTERNLOGDZrrik:
2904 case X86::VPTERNLOGDZ128rrik:
2905 case X86::VPTERNLOGDZ256rrik:
2906 case X86::VPTERNLOGQZrrik:
2907 case X86::VPTERNLOGQZ128rrik:
2908 case X86::VPTERNLOGQZ256rrik:
2909 case X86::VPTERNLOGDZrrikz:
2910 case X86::VPTERNLOGDZrmikz:
2911 case X86::VPTERNLOGDZ128rrikz:
2912 case X86::VPTERNLOGDZ128rmikz:
2913 case X86::VPTERNLOGDZ256rrikz:
2914 case X86::VPTERNLOGDZ256rmikz:
2915 case X86::VPTERNLOGQZrrikz:
2916 case X86::VPTERNLOGQZrmikz:
2917 case X86::VPTERNLOGQZ128rrikz:
2918 case X86::VPTERNLOGQZ128rmikz:
2919 case X86::VPTERNLOGQZ256rrikz:
2920 case X86::VPTERNLOGQZ256rmikz:
2921 case X86::VPTERNLOGDZ128rmbi:
2922 case X86::VPTERNLOGDZ256rmbi:
2923 case X86::VPTERNLOGDZrmbi:
2924 case X86::VPTERNLOGQZ128rmbi:
2925 case X86::VPTERNLOGQZ256rmbi:
2926 case X86::VPTERNLOGQZrmbi:
2927 case X86::VPTERNLOGDZ128rmbikz:
2928 case X86::VPTERNLOGDZ256rmbikz:
2929 case X86::VPTERNLOGDZrmbikz:
2930 case X86::VPTERNLOGQZ128rmbikz:
2931 case X86::VPTERNLOGQZ256rmbikz:
2932 case X86::VPTERNLOGQZrmbikz:
2933 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2934 case X86::VPDPWSSDYrr:
2935 case X86::VPDPWSSDrr:
2936 case X86::VPDPWSSDSYrr:
2937 case X86::VPDPWSSDSrr:
2938 case X86::VPDPWUUDrr:
2939 case X86::VPDPWUUDYrr:
2940 case X86::VPDPWUUDSrr:
2941 case X86::VPDPWUUDSYrr:
2942 case X86::VPDPBSSDSrr:
2943 case X86::VPDPBSSDSYrr:
2944 case X86::VPDPBSSDrr:
2945 case X86::VPDPBSSDYrr:
2946 case X86::VPDPBUUDSrr:
2947 case X86::VPDPBUUDSYrr:
2948 case X86::VPDPBUUDrr:
2949 case X86::VPDPBUUDYrr:
2950 case X86::VPDPBSSDSZ128rr:
2951 case X86::VPDPBSSDSZ128rrk:
2952 case X86::VPDPBSSDSZ128rrkz:
2953 case X86::VPDPBSSDSZ256rr:
2954 case X86::VPDPBSSDSZ256rrk:
2955 case X86::VPDPBSSDSZ256rrkz:
2956 case X86::VPDPBSSDSZrr:
2957 case X86::VPDPBSSDSZrrk:
2958 case X86::VPDPBSSDSZrrkz:
2959 case X86::VPDPBSSDZ128rr:
2960 case X86::VPDPBSSDZ128rrk:
2961 case X86::VPDPBSSDZ128rrkz:
2962 case X86::VPDPBSSDZ256rr:
2963 case X86::VPDPBSSDZ256rrk:
2964 case X86::VPDPBSSDZ256rrkz:
2965 case X86::VPDPBSSDZrr:
2966 case X86::VPDPBSSDZrrk:
2967 case X86::VPDPBSSDZrrkz:
2968 case X86::VPDPBUUDSZ128rr:
2969 case X86::VPDPBUUDSZ128rrk:
2970 case X86::VPDPBUUDSZ128rrkz:
2971 case X86::VPDPBUUDSZ256rr:
2972 case X86::VPDPBUUDSZ256rrk:
2973 case X86::VPDPBUUDSZ256rrkz:
2974 case X86::VPDPBUUDSZrr:
2975 case X86::VPDPBUUDSZrrk:
2976 case X86::VPDPBUUDSZrrkz:
2977 case X86::VPDPBUUDZ128rr:
2978 case X86::VPDPBUUDZ128rrk:
2979 case X86::VPDPBUUDZ128rrkz:
2980 case X86::VPDPBUUDZ256rr:
2981 case X86::VPDPBUUDZ256rrk:
2982 case X86::VPDPBUUDZ256rrkz:
2983 case X86::VPDPBUUDZrr:
2984 case X86::VPDPBUUDZrrk:
2985 case X86::VPDPBUUDZrrkz:
2986 case X86::VPDPWSSDZ128rr:
2987 case X86::VPDPWSSDZ128rrk:
2988 case X86::VPDPWSSDZ128rrkz:
2989 case X86::VPDPWSSDZ256rr:
2990 case X86::VPDPWSSDZ256rrk:
2991 case X86::VPDPWSSDZ256rrkz:
2992 case X86::VPDPWSSDZrr:
2993 case X86::VPDPWSSDZrrk:
2994 case X86::VPDPWSSDZrrkz:
2995 case X86::VPDPWSSDSZ128rr:
2996 case X86::VPDPWSSDSZ128rrk:
2997 case X86::VPDPWSSDSZ128rrkz:
2998 case X86::VPDPWSSDSZ256rr:
2999 case X86::VPDPWSSDSZ256rrk:
3000 case X86::VPDPWSSDSZ256rrkz:
3001 case X86::VPDPWSSDSZrr:
3002 case X86::VPDPWSSDSZrrk:
3003 case X86::VPDPWSSDSZrrkz:
3004 case X86::VPDPWUUDZ128rr:
3005 case X86::VPDPWUUDZ128rrk:
3006 case X86::VPDPWUUDZ128rrkz:
3007 case X86::VPDPWUUDZ256rr:
3008 case X86::VPDPWUUDZ256rrk:
3009 case X86::VPDPWUUDZ256rrkz:
3010 case X86::VPDPWUUDZrr:
3011 case X86::VPDPWUUDZrrk:
3012 case X86::VPDPWUUDZrrkz:
3013 case X86::VPDPWUUDSZ128rr:
3014 case X86::VPDPWUUDSZ128rrk:
3015 case X86::VPDPWUUDSZ128rrkz:
3016 case X86::VPDPWUUDSZ256rr:
3017 case X86::VPDPWUUDSZ256rrk:
3018 case X86::VPDPWUUDSZ256rrkz:
3019 case X86::VPDPWUUDSZrr:
3020 case X86::VPDPWUUDSZrrk:
3021 case X86::VPDPWUUDSZrrkz:
3022 case X86::VPMADD52HUQrr:
3023 case X86::VPMADD52HUQYrr:
3024 case X86::VPMADD52HUQZ128r:
3025 case X86::VPMADD52HUQZ128rk:
3026 case X86::VPMADD52HUQZ128rkz:
3027 case X86::VPMADD52HUQZ256r:
3028 case X86::VPMADD52HUQZ256rk:
3029 case X86::VPMADD52HUQZ256rkz:
3030 case X86::VPMADD52HUQZr:
3031 case X86::VPMADD52HUQZrk:
3032 case X86::VPMADD52HUQZrkz:
3033 case X86::VPMADD52LUQrr:
3034 case X86::VPMADD52LUQYrr:
3035 case X86::VPMADD52LUQZ128r:
3036 case X86::VPMADD52LUQZ128rk:
3037 case X86::VPMADD52LUQZ128rkz:
3038 case X86::VPMADD52LUQZ256r:
3039 case X86::VPMADD52LUQZ256rk:
3040 case X86::VPMADD52LUQZ256rkz:
3041 case X86::VPMADD52LUQZr:
3042 case X86::VPMADD52LUQZrk:
3043 case X86::VPMADD52LUQZrkz:
3044 case X86::VFMADDCPHZr:
3045 case X86::VFMADDCPHZrk:
3046 case X86::VFMADDCPHZrkz:
3047 case X86::VFMADDCPHZ128r:
3048 case X86::VFMADDCPHZ128rk:
3049 case X86::VFMADDCPHZ128rkz:
3050 case X86::VFMADDCPHZ256r:
3051 case X86::VFMADDCPHZ256rk:
3052 case X86::VFMADDCPHZ256rkz:
3053 case X86::VFMADDCSHZr:
3054 case X86::VFMADDCSHZrk:
3055 case X86::VFMADDCSHZrkz: {
3056 unsigned CommutableOpIdx1 = 2;
3057 unsigned CommutableOpIdx2 = 3;
3058 if (X86II::isKMasked(Desc.TSFlags)) {
3059 // Skip the mask register.
3060 ++CommutableOpIdx1;
3061 ++CommutableOpIdx2;
3062 }
3063 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3064 CommutableOpIdx2))
3065 return false;
3066 if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
3067 // No idea.
3068 return false;
3069 return true;
3070 }
3071
3072 default:
3073 const X86InstrFMA3Group *FMA3Group =
3074 getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
3075 if (FMA3Group)
3076 return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
3077 FMA3Group->isIntrinsic());
3078
3079 // Handled masked instructions since we need to skip over the mask input
3080 // and the preserved input.
3081 if (X86II::isKMasked(Desc.TSFlags)) {
3082 // First assume that the first input is the mask operand and skip past it.
3083 unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
3084 unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
3085 // Check if the first input is tied. If there isn't one then we only
3086 // need to skip the mask operand which we did above.
3087 if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
3088 MCOI::TIED_TO) != -1)) {
3089 // If this is zero masking instruction with a tied operand, we need to
3090 // move the first index back to the first input since this must
3091 // be a 3 input instruction and we want the first two non-mask inputs.
3092 // Otherwise this is a 2 input instruction with a preserved input and
3093 // mask, so we need to move the indices to skip one more input.
3094 if (X86II::isKMergeMasked(Desc.TSFlags)) {
3095 ++CommutableOpIdx1;
3096 ++CommutableOpIdx2;
3097 } else {
3098 --CommutableOpIdx1;
3099 }
3100 }
3101
3102 if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3103 CommutableOpIdx2))
3104 return false;
3105
3106 if (!MI.getOperand(SrcOpIdx1).isReg() ||
3107 !MI.getOperand(SrcOpIdx2).isReg())
3108 // No idea.
3109 return false;
3110 return true;
3111 }
3112
3113 return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
3114 }
3115 return false;
3116}
3117
3119 unsigned Opcode = MI->getOpcode();
3120 if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
3121 Opcode != X86::LEA64_32r)
3122 return false;
3123
3124 const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
3125 const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
3126 const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
3127
3128 if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
3129 Scale.getImm() > 1)
3130 return false;
3131
3132 return true;
3133}
3134
3136 // Currently we're interested in following sequence only.
3137 // r3 = lea r1, r2
3138 // r5 = add r3, r4
3139 // Both r3 and r4 are killed in add, we hope the add instruction has the
3140 // operand order
3141 // r5 = add r4, r3
3142 // So later in X86FixupLEAs the lea instruction can be rewritten as add.
3143 unsigned Opcode = MI.getOpcode();
3144 if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
3145 return false;
3146
3147 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3148 Register Reg1 = MI.getOperand(1).getReg();
3149 Register Reg2 = MI.getOperand(2).getReg();
3150
3151 // Check if Reg1 comes from LEA in the same MBB.
3152 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
3153 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3154 Commute = true;
3155 return true;
3156 }
3157 }
3158
3159 // Check if Reg2 comes from LEA in the same MBB.
3160 if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
3161 if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3162 Commute = false;
3163 return true;
3164 }
3165 }
3166
3167 return false;
3168}
3169
3171 unsigned Opcode = MCID.getOpcode();
3172 if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isSETZUCC(Opcode) ||
3173 X86::isCMOVCC(Opcode) || X86::isCFCMOVCC(Opcode) ||
3174 X86::isCCMPCC(Opcode) || X86::isCTESTCC(Opcode)))
3175 return -1;
3176 // Assume that condition code is always the last use operand.
3177 unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
3178 return NumUses - 1;
3179}
3180
3182 const MCInstrDesc &MCID = MI.getDesc();
3183 int CondNo = getCondSrcNoFromDesc(MCID);
3184 if (CondNo < 0)
3185 return X86::COND_INVALID;
3186 CondNo += MCID.getNumDefs();
3187 return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
3188}
3189
3191 return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3193}
3194
3196 return X86::isSETCC(MI.getOpcode()) || X86::isSETZUCC(MI.getOpcode())
3199}
3200
3202 return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3204}
3205
3207 return X86::isCFCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3209}
3210
3212 return X86::isCCMPCC(MI.getOpcode()) || X86::isCTESTCC(MI.getOpcode())
3215}
3216
3218 // CCMP/CTEST has two conditional operands:
3219 // - SCC: source conditonal code (same as CMOV)
3220 // - DCF: destination conditional flags, which has 4 valid bits
3221 //
3222 // +----+----+----+----+
3223 // | OF | SF | ZF | CF |
3224 // +----+----+----+----+
3225 //
3226 // If SCC(source conditional code) evaluates to false, CCMP/CTEST will updates
3227 // the conditional flags by as follows:
3228 //
3229 // OF = DCF.OF
3230 // SF = DCF.SF
3231 // ZF = DCF.ZF
3232 // CF = DCF.CF
3233 // PF = DCF.CF
3234 // AF = 0 (Auxiliary Carry Flag)
3235 //
3236 // Otherwise, the CMP or TEST is executed and it updates the
3237 // CSPAZO flags normally.
3238 //
3239 // NOTE:
3240 // If SCC = P, then SCC evaluates to true regardless of the CSPAZO value.
3241 // If SCC = NP, then SCC evaluates to false regardless of the CSPAZO value.
3242
3243 enum { CF = 1, ZF = 2, SF = 4, OF = 8, PF = CF };
3244
3245 switch (CC) {
3246 default:
3247 llvm_unreachable("Illegal condition code!");
3248 case X86::COND_NO:
3249 case X86::COND_NE:
3250 case X86::COND_GE:
3251 case X86::COND_G:
3252 case X86::COND_AE:
3253 case X86::COND_A:
3254 case X86::COND_NS:
3255 case X86::COND_NP:
3256 return 0;
3257 case X86::COND_O:
3258 return OF;
3259 case X86::COND_B:
3260 case X86::COND_BE:
3261 return CF;
3262 break;
3263 case X86::COND_E:
3264 case X86::COND_LE:
3265 return ZF;
3266 case X86::COND_S:
3267 case X86::COND_L:
3268 return SF;
3269 case X86::COND_P:
3270 return PF;
3271 }
3272}
3273
3274#define GET_X86_NF_TRANSFORM_TABLE
3275#define GET_X86_ND2NONND_TABLE
3276#include "X86GenInstrMapping.inc"
3277
3279 unsigned Opc) {
3280 const auto I = llvm::lower_bound(Table, Opc);
3281 return (I == Table.end() || I->OldOpc != Opc) ? 0U : I->NewOpc;
3282}
3283unsigned X86::getNFVariant(unsigned Opc) {
3284#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3285 // Make sure the tables are sorted.
3286 static std::atomic<bool> NFTableChecked(false);
3287 if (!NFTableChecked.load(std::memory_order_relaxed)) {
3288 assert(llvm::is_sorted(X86NFTransformTable) &&
3289 "X86NFTransformTable is not sorted!");
3290 NFTableChecked.store(true, std::memory_order_relaxed);
3291 }
3292#endif
3293 return getNewOpcFromTable(X86NFTransformTable, Opc);
3294}
3295
3296unsigned X86::getNonNDVariant(unsigned Opc) {
3297#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
3298 // Make sure the tables are sorted.
3299 static std::atomic<bool> NDTableChecked(false);
3300 if (!NDTableChecked.load(std::memory_order_relaxed)) {
3301 assert(llvm::is_sorted(X86ND2NonNDTable) &&
3302 "X86ND2NonNDTableis not sorted!");
3303 NDTableChecked.store(true, std::memory_order_relaxed);
3304 }
3305#endif
3306 return getNewOpcFromTable(X86ND2NonNDTable, Opc);
3307}
3308
3309/// Return the inverse of the specified condition,
3310/// e.g. turning COND_E to COND_NE.
3312 switch (CC) {
3313 default:
3314 llvm_unreachable("Illegal condition code!");
3315 case X86::COND_E:
3316 return X86::COND_NE;
3317 case X86::COND_NE:
3318 return X86::COND_E;
3319 case X86::COND_L:
3320 return X86::COND_GE;
3321 case X86::COND_LE:
3322 return X86::COND_G;
3323 case X86::COND_G:
3324 return X86::COND_LE;
3325 case X86::COND_GE:
3326 return X86::COND_L;
3327 case X86::COND_B:
3328 return X86::COND_AE;
3329 case X86::COND_BE:
3330 return X86::COND_A;
3331 case X86::COND_A:
3332 return X86::COND_BE;
3333 case X86::COND_AE:
3334 return X86::COND_B;
3335 case X86::COND_S:
3336 return X86::COND_NS;
3337 case X86::COND_NS:
3338 return X86::COND_S;
3339 case X86::COND_P:
3340 return X86::COND_NP;
3341 case X86::COND_NP:
3342 return X86::COND_P;
3343 case X86::COND_O:
3344 return X86::COND_NO;
3345 case X86::COND_NO:
3346 return X86::COND_O;
3347 case X86::COND_NE_OR_P:
3348 return X86::COND_E_AND_NP;
3349 case X86::COND_E_AND_NP:
3350 return X86::COND_NE_OR_P;
3351 }
3352}
3353
3354/// Assuming the flags are set by MI(a,b), return the condition code if we
3355/// modify the instructions such that flags are set by MI(b,a).
3357 switch (CC) {
3358 default:
3359 return X86::COND_INVALID;
3360 case X86::COND_E:
3361 return X86::COND_E;
3362 case X86::COND_NE:
3363 return X86::COND_NE;
3364 case X86::COND_L:
3365 return X86::COND_G;
3366 case X86::COND_LE:
3367 return X86::COND_GE;
3368 case X86::COND_G:
3369 return X86::COND_L;
3370 case X86::COND_GE:
3371 return X86::COND_LE;
3372 case X86::COND_B:
3373 return X86::COND_A;
3374 case X86::COND_BE:
3375 return X86::COND_AE;
3376 case X86::COND_A:
3377 return X86::COND_B;
3378 case X86::COND_AE:
3379 return X86::COND_BE;
3380 }
3381}
3382
3383std::pair<X86::CondCode, bool>
3386 bool NeedSwap = false;
3387 switch (Predicate) {
3388 default:
3389 break;
3390 // Floating-point Predicates
3391 case CmpInst::FCMP_UEQ:
3392 CC = X86::COND_E;
3393 break;
3394 case CmpInst::FCMP_OLT:
3395 NeedSwap = true;
3396 [[fallthrough]];
3397 case CmpInst::FCMP_OGT:
3398 CC = X86::COND_A;
3399 break;
3400 case CmpInst::FCMP_OLE:
3401 NeedSwap = true;
3402 [[fallthrough]];
3403 case CmpInst::FCMP_OGE:
3404 CC = X86::COND_AE;
3405 break;
3406 case CmpInst::FCMP_UGT:
3407 NeedSwap = true;
3408 [[fallthrough]];
3409 case CmpInst::FCMP_ULT:
3410 CC = X86::COND_B;
3411 break;
3412 case CmpInst::FCMP_UGE:
3413 NeedSwap = true;
3414 [[fallthrough]];
3415 case CmpInst::FCMP_ULE:
3416 CC = X86::COND_BE;
3417 break;
3418 case CmpInst::FCMP_ONE:
3419 CC = X86::COND_NE;
3420 break;
3421 case CmpInst::FCMP_UNO:
3422 CC = X86::COND_P;
3423 break;
3424 case CmpInst::FCMP_ORD:
3425 CC = X86::COND_NP;
3426 break;
3427 case CmpInst::FCMP_OEQ:
3428 [[fallthrough]];
3429 case CmpInst::FCMP_UNE:
3430 CC = X86::COND_INVALID;
3431 break;
3432
3433 // Integer Predicates
3434 case CmpInst::ICMP_EQ:
3435 CC = X86::COND_E;
3436 break;
3437 case CmpInst::ICMP_NE:
3438 CC = X86::COND_NE;
3439 break;
3440 case CmpInst::ICMP_UGT:
3441 CC = X86::COND_A;
3442 break;
3443 case CmpInst::ICMP_UGE:
3444 CC = X86::COND_AE;
3445 break;
3446 case CmpInst::ICMP_ULT:
3447 CC = X86::COND_B;
3448 break;
3449 case CmpInst::ICMP_ULE:
3450 CC = X86::COND_BE;
3451 break;
3452 case CmpInst::ICMP_SGT:
3453 CC = X86::COND_G;
3454 break;
3455 case CmpInst::ICMP_SGE:
3456 CC = X86::COND_GE;
3457 break;
3458 case CmpInst::ICMP_SLT:
3459 CC = X86::COND_L;
3460 break;
3461 case CmpInst::ICMP_SLE:
3462 CC = X86::COND_LE;
3463 break;
3464 }
3465
3466 return std::make_pair(CC, NeedSwap);
3467}
3468
3469/// Return a cmov opcode for the given register size in bytes, and operand type.
3470unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand,
3471 bool HasNDD) {
3472 switch (RegBytes) {
3473 default:
3474 llvm_unreachable("Illegal register size!");
3475#define GET_ND_IF_ENABLED(OPC) (HasNDD ? OPC##_ND : OPC)
3476 case 2:
3477 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV16rm)
3478 : GET_ND_IF_ENABLED(X86::CMOV16rr);
3479 case 4:
3480 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV32rm)
3481 : GET_ND_IF_ENABLED(X86::CMOV32rr);
3482 case 8:
3483 return HasMemoryOperand ? GET_ND_IF_ENABLED(X86::CMOV64rm)
3484 : GET_ND_IF_ENABLED(X86::CMOV64rr);
3485 }
3486}
3487
3488/// Get the VPCMP immediate for the given condition.
3490 switch (CC) {
3491 default:
3492 llvm_unreachable("Unexpected SETCC condition");
3493 case ISD::SETNE:
3494 return 4;
3495 case ISD::SETEQ:
3496 return 0;
3497 case ISD::SETULT:
3498 case ISD::SETLT:
3499 return 1;
3500 case ISD::SETUGT:
3501 case ISD::SETGT:
3502 return 6;
3503 case ISD::SETUGE:
3504 case ISD::SETGE:
3505 return 5;
3506 case ISD::SETULE:
3507 case ISD::SETLE:
3508 return 2;
3509 }
3510}
3511
3512/// Get the VPCMP immediate if the operands are swapped.
3513unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
3514 switch (Imm) {
3515 default:
3516 llvm_unreachable("Unreachable!");
3517 case 0x01:
3518 Imm = 0x06;
3519 break; // LT -> NLE
3520 case 0x02:
3521 Imm = 0x05;
3522 break; // LE -> NLT
3523 case 0x05:
3524 Imm = 0x02;
3525 break; // NLT -> LE
3526 case 0x06:
3527 Imm = 0x01;
3528 break; // NLE -> LT
3529 case 0x00: // EQ
3530 case 0x03: // FALSE
3531 case 0x04: // NE
3532 case 0x07: // TRUE
3533 break;
3534 }
3535
3536 return Imm;
3537}
3538
3539/// Get the VPCOM immediate if the operands are swapped.
3540unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
3541 switch (Imm) {
3542 default:
3543 llvm_unreachable("Unreachable!");
3544 case 0x00:
3545 Imm = 0x02;
3546 break; // LT -> GT
3547 case 0x01:
3548 Imm = 0x03;
3549 break; // LE -> GE
3550 case 0x02:
3551 Imm = 0x00;
3552 break; // GT -> LT
3553 case 0x03:
3554 Imm = 0x01;
3555 break; // GE -> LE
3556 case 0x04: // EQ
3557 case 0x05: // NE
3558 case 0x06: // FALSE
3559 case 0x07: // TRUE
3560 break;
3561 }
3562
3563 return Imm;
3564}
3565
3566/// Get the VCMP immediate if the operands are swapped.
3567unsigned X86::getSwappedVCMPImm(unsigned Imm) {
3568 // Only need the lower 2 bits to distinquish.
3569 switch (Imm & 0x3) {
3570 default:
3571 llvm_unreachable("Unreachable!");
3572 case 0x00:
3573 case 0x03:
3574 // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
3575 break;
3576 case 0x01:
3577 case 0x02:
3578 // Need to toggle bits 3:0. Bit 4 stays the same.
3579 Imm ^= 0xf;
3580 break;
3581 }
3582
3583 return Imm;
3584}
3585
3587 if (Info.RegClass == X86::VR128RegClassID ||
3588 Info.RegClass == X86::VR128XRegClassID)
3589 return 128;
3590 if (Info.RegClass == X86::VR256RegClassID ||
3591 Info.RegClass == X86::VR256XRegClassID)
3592 return 256;
3593 if (Info.RegClass == X86::VR512RegClassID)
3594 return 512;
3595 llvm_unreachable("Unknown register class!");
3596}
3597
3598/// Return true if the Reg is X87 register.
3599static bool isX87Reg(Register Reg) {
3600 return (Reg == X86::FPCW || Reg == X86::FPSW ||
3601 (Reg >= X86::ST0 && Reg <= X86::ST7));
3602}
3603
3604/// check if the instruction is X87 instruction
3606 // Call and inlineasm defs X87 register, so we special case it here because
3607 // otherwise calls are incorrectly flagged as x87 instructions
3608 // as a result.
3609 if (MI.isCall() || MI.isInlineAsm())
3610 return false;
3611 for (const MachineOperand &MO : MI.operands()) {
3612 if (!MO.isReg())
3613 continue;
3614 if (isX87Reg(MO.getReg()))
3615 return true;
3616 }
3617 return false;
3618}
3619
3621 auto IsMemOp = [](const MCOperandInfo &OpInfo) {
3622 return OpInfo.OperandType == MCOI::OPERAND_MEMORY;
3623 };
3624
3625 const MCInstrDesc &Desc = MI.getDesc();
3626
3627 // Directly invoke the MC-layer routine for real (i.e., non-pseudo)
3628 // instructions (fast case).
3629 if (!X86II::isPseudo(Desc.TSFlags)) {
3630 int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
3631 if (MemRefIdx >= 0)
3632 return MemRefIdx + X86II::getOperandBias(Desc);
3633#ifdef EXPENSIVE_CHECKS
3634 assert(none_of(Desc.operands(), IsMemOp) &&
3635 "Got false negative from X86II::getMemoryOperandNo()!");
3636#endif
3637 return -1;
3638 }
3639
3640 // Otherwise, handle pseudo instructions by examining the type of their
3641 // operands (slow case). An instruction cannot have a memory reference if it
3642 // has fewer than AddrNumOperands (= 5) explicit operands.
3643 unsigned NumOps = Desc.getNumOperands();
3645#ifdef EXPENSIVE_CHECKS
3646 assert(none_of(Desc.operands(), IsMemOp) &&
3647 "Expected no operands to have OPERAND_MEMORY type!");
3648#endif
3649 return -1;
3650 }
3651
3652 // The first operand with type OPERAND_MEMORY indicates the start of a memory
3653 // reference. We expect the following AddrNumOperand-1 operands to also have
3654 // OPERAND_MEMORY type.
3655 for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) {
3656 if (IsMemOp(Desc.operands()[I])) {
3657#ifdef EXPENSIVE_CHECKS
3658 assert(std::all_of(Desc.operands().begin() + I,
3659 Desc.operands().begin() + I + X86::AddrNumOperands,
3660 IsMemOp) &&
3661 "Expected all five operands in the memory reference to have "
3662 "OPERAND_MEMORY type!");
3663#endif
3664 return I;
3665 }
3666 }
3667
3668 return -1;
3669}
3670
3672 unsigned OpNo) {
3673 assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) &&
3674 "Unexpected number of operands!");
3675
3676 const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg);
3677 if (!Index.isReg() || Index.getReg() != X86::NoRegister)
3678 return nullptr;
3679
3680 const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp);
3681 if (!Disp.isCPI() || Disp.getOffset() != 0)
3682 return nullptr;
3683
3685 MI.getParent()->getParent()->getConstantPool()->getConstants();
3686 const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()];
3687
3688 // Bail if this is a machine constant pool entry, we won't be able to dig out
3689 // anything useful.
3690 if (ConstantEntry.isMachineConstantPoolEntry())
3691 return nullptr;
3692
3693 return ConstantEntry.Val.ConstVal;
3694}
3695
3697 switch (MI.getOpcode()) {
3698 case X86::TCRETURNdi:
3699 case X86::TCRETURNri:
3700 case X86::TCRETURNmi:
3701 case X86::TCRETURNdi64:
3702 case X86::TCRETURNri64:
3703 case X86::TCRETURNri64_ImpCall:
3704 case X86::TCRETURNmi64:
3705 return true;
3706 default:
3707 return false;
3708 }
3709}
3710
3713 const MachineInstr &TailCall) const {
3714
3715 const MachineFunction *MF = TailCall.getMF();
3716
3717 if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3718 // Kernel patches thunk calls in runtime, these should never be conditional.
3719 const MachineOperand &Target = TailCall.getOperand(0);
3720 if (Target.isSymbol()) {
3721 StringRef Symbol(Target.getSymbolName());
3722 // this is currently only relevant to r11/kernel indirect thunk.
3723 if (Symbol == "__x86_indirect_thunk_r11")
3724 return false;
3725 }
3726 }
3727
3728 if (TailCall.getOpcode() != X86::TCRETURNdi &&
3729 TailCall.getOpcode() != X86::TCRETURNdi64) {
3730 // Only direct calls can be done with a conditional branch.
3731 return false;
3732 }
3733
3734 if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3735 // Conditional tail calls confuse the Win64 unwinder.
3736 return false;
3737 }
3738
3739 assert(BranchCond.size() == 1);
3740 if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3741 // Can't make a conditional tail call with this condition.
3742 return false;
3743 }
3744
3746 if (X86FI->getTCReturnAddrDelta() != 0 ||
3747 TailCall.getOperand(1).getImm() != 0) {
3748 // A conditional tail call cannot do any stack adjustment.
3749 return false;
3750 }
3751
3752 return true;
3753}
3754
3757 const MachineInstr &TailCall) const {
3758 assert(canMakeTailCallConditional(BranchCond, TailCall));
3759
3761 while (I != MBB.begin()) {
3762 --I;
3763 if (I->isDebugInstr())
3764 continue;
3765 if (!I->isBranch())
3766 assert(0 && "Can't find the branch to replace!");
3767
3769 assert(BranchCond.size() == 1);
3770 if (CC != BranchCond[0].getImm())
3771 continue;
3772
3773 break;
3774 }
3775
3776 unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3777 : X86::TCRETURNdi64cc;
3778
3779 auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3780 MIB->addOperand(TailCall.getOperand(0)); // Destination.
3781 MIB.addImm(0); // Stack offset (not used).
3782 MIB->addOperand(BranchCond[0]); // Condition.
3783 MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
3784
3785 // Add implicit uses and defs of all live regs potentially clobbered by the
3786 // call. This way they still appear live across the call.
3788 LiveRegs.addLiveOuts(MBB);
3790 LiveRegs.stepForward(*MIB, Clobbers);
3791 for (const auto &C : Clobbers) {
3792 MIB.addReg(C.first, RegState::Implicit);
3794 }
3795
3796 I->eraseFromParent();
3797}
3798
3799// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3800// not be a fallthrough MBB now due to layout changes). Return nullptr if the
3801// fallthrough MBB cannot be identified.
3804 // Look for non-EHPad successors other than TBB. If we find exactly one, it
3805 // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3806 // and fallthrough MBB. If we find more than one, we cannot identify the
3807 // fallthrough MBB and should return nullptr.
3808 MachineBasicBlock *FallthroughBB = nullptr;
3809 for (MachineBasicBlock *Succ : MBB->successors()) {
3810 if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3811 continue;
3812 // Return a nullptr if we found more than one fallthrough successor.
3813 if (FallthroughBB && FallthroughBB != TBB)
3814 return nullptr;
3815 FallthroughBB = Succ;
3816 }
3817 return FallthroughBB;
3818}
3819
3820bool X86InstrInfo::analyzeBranchImpl(
3823 SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3824
3825 // Start from the bottom of the block and work up, examining the
3826 // terminator instructions.
3828 MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3829 while (I != MBB.begin()) {
3830 --I;
3831 if (I->isDebugInstr())
3832 continue;
3833
3834 // Working from the bottom, when we see a non-terminator instruction, we're
3835 // done.
3836 if (!isUnpredicatedTerminator(*I))
3837 break;
3838
3839 // A terminator that isn't a branch can't easily be handled by this
3840 // analysis.
3841 if (!I->isBranch())
3842 return true;
3843
3844 // Handle unconditional branches.
3845 if (I->getOpcode() == X86::JMP_1) {
3846 UnCondBrIter = I;
3847
3848 if (!AllowModify) {
3849 TBB = I->getOperand(0).getMBB();
3850 continue;
3851 }
3852
3853 // If the block has any instructions after a JMP, delete them.
3854 MBB.erase(std::next(I), MBB.end());
3855
3856 Cond.clear();
3857 FBB = nullptr;
3858
3859 // Delete the JMP if it's equivalent to a fall-through.
3860 if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3861 TBB = nullptr;
3862 I->eraseFromParent();
3863 I = MBB.end();
3864 UnCondBrIter = MBB.end();
3865 continue;
3866 }
3867
3868 // TBB is used to indicate the unconditional destination.
3869 TBB = I->getOperand(0).getMBB();
3870 continue;
3871 }
3872
3873 // Handle conditional branches.
3874 X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3875 if (BranchCode == X86::COND_INVALID)
3876 return true; // Can't handle indirect branch.
3877
3878 // In practice we should never have an undef eflags operand, if we do
3879 // abort here as we are not prepared to preserve the flag.
3880 if (I->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->isUndef())
3881 return true;
3882
3883 // Working from the bottom, handle the first conditional branch.
3884 if (Cond.empty()) {
3885 FBB = TBB;
3886 TBB = I->getOperand(0).getMBB();
3888 CondBranches.push_back(&*I);
3889 continue;
3890 }
3891
3892 // Handle subsequent conditional branches. Only handle the case where all
3893 // conditional branches branch to the same destination and their condition
3894 // opcodes fit one of the special multi-branch idioms.
3895 assert(Cond.size() == 1);
3896 assert(TBB);
3897
3898 // If the conditions are the same, we can leave them alone.
3899 X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3900 auto NewTBB = I->getOperand(0).getMBB();
3901 if (OldBranchCode == BranchCode && TBB == NewTBB)
3902 continue;
3903
3904 // If they differ, see if they fit one of the known patterns. Theoretically,
3905 // we could handle more patterns here, but we shouldn't expect to see them
3906 // if instruction selection has done a reasonable job.
3907 if (TBB == NewTBB &&
3908 ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3909 (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3910 BranchCode = X86::COND_NE_OR_P;
3911 } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3912 (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3913 if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3914 return true;
3915
3916 // X86::COND_E_AND_NP usually has two different branch destinations.
3917 //
3918 // JP B1
3919 // JE B2
3920 // JMP B1
3921 // B1:
3922 // B2:
3923 //
3924 // Here this condition branches to B2 only if NP && E. It has another
3925 // equivalent form:
3926 //
3927 // JNE B1
3928 // JNP B2
3929 // JMP B1
3930 // B1:
3931 // B2:
3932 //
3933 // Similarly it branches to B2 only if E && NP. That is why this condition
3934 // is named with COND_E_AND_NP.
3935 BranchCode = X86::COND_E_AND_NP;
3936 } else
3937 return true;
3938
3939 // Update the MachineOperand.
3940 Cond[0].setImm(BranchCode);
3941 CondBranches.push_back(&*I);
3942 }
3943
3944 return false;
3945}
3946
3949 MachineBasicBlock *&FBB,
3951 bool AllowModify) const {
3952 SmallVector<MachineInstr *, 4> CondBranches;
3953 return analyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3954}
3955
3957 const MCInstrDesc &Desc = MI.getDesc();
3958 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3959 assert(MemRefBegin >= 0 && "instr should have memory operand");
3960 MemRefBegin += X86II::getOperandBias(Desc);
3961
3962 const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp);
3963 if (!MO.isJTI())
3964 return -1;
3965
3966 return MO.getIndex();
3967}
3968
3970 Register Reg) {
3971 if (!Reg.isVirtual())
3972 return -1;
3973 MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
3974 if (MI == nullptr)
3975 return -1;
3976 unsigned Opcode = MI->getOpcode();
3977 if (Opcode != X86::LEA64r && Opcode != X86::LEA32r)
3978 return -1;
3980}
3981
3983 unsigned Opcode = MI.getOpcode();
3984 // Switch-jump pattern for non-PIC code looks like:
3985 // JMP64m $noreg, 8, %X, %jump-table.X, $noreg
3986 if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) {
3988 }
3989 // The pattern for PIC code looks like:
3990 // %0 = LEA64r $rip, 1, $noreg, %jump-table.X
3991 // %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg
3992 // %2 = ADD64rr %1, %0
3993 // JMP64r %2
3994 if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) {
3995 Register Reg = MI.getOperand(0).getReg();
3996 if (!Reg.isVirtual())
3997 return -1;
3998 const MachineFunction &MF = *MI.getParent()->getParent();
3999 const MachineRegisterInfo &MRI = MF.getRegInfo();
4000 MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
4001 if (Add == nullptr)
4002 return -1;
4003 if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr)
4004 return -1;
4005 int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg());
4006 if (JTI1 >= 0)
4007 return JTI1;
4008 int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg());
4009 if (JTI2 >= 0)
4010 return JTI2;
4011 }
4012 return -1;
4013}
4014
4016 MachineBranchPredicate &MBP,
4017 bool AllowModify) const {
4018 using namespace std::placeholders;
4019
4021 SmallVector<MachineInstr *, 4> CondBranches;
4022 if (analyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
4023 AllowModify))
4024 return true;
4025
4026 if (Cond.size() != 1)
4027 return true;
4028
4029 assert(MBP.TrueDest && "expected!");
4030
4031 if (!MBP.FalseDest)
4032 MBP.FalseDest = MBB.getNextNode();
4033
4035
4036 MachineInstr *ConditionDef = nullptr;
4037 bool SingleUseCondition = true;
4038
4040 if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
4041 ConditionDef = &MI;
4042 break;
4043 }
4044
4045 if (MI.readsRegister(X86::EFLAGS, TRI))
4046 SingleUseCondition = false;
4047 }
4048
4049 if (!ConditionDef)
4050 return true;
4051
4052 if (SingleUseCondition) {
4053 for (auto *Succ : MBB.successors())
4054 if (Succ->isLiveIn(X86::EFLAGS))
4055 SingleUseCondition = false;
4056 }
4057
4058 MBP.ConditionDef = ConditionDef;
4059 MBP.SingleUseCondition = SingleUseCondition;
4060
4061 // Currently we only recognize the simple pattern:
4062 //
4063 // test %reg, %reg
4064 // je %label
4065 //
4066 const unsigned TestOpcode =
4067 Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
4068
4069 if (ConditionDef->getOpcode() == TestOpcode &&
4070 ConditionDef->getNumOperands() == 3 &&
4071 ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
4072 (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
4073 MBP.LHS = ConditionDef->getOperand(0);
4074 MBP.RHS = MachineOperand::CreateImm(0);
4075 MBP.Predicate = Cond[0].getImm() == X86::COND_NE
4076 ? MachineBranchPredicate::PRED_NE
4077 : MachineBranchPredicate::PRED_EQ;
4078 return false;
4079 }
4080
4081 return true;
4082}
4083
4085 int *BytesRemoved) const {
4086 assert(!BytesRemoved && "code size not handled");
4087
4089 unsigned Count = 0;
4090
4091 while (I != MBB.begin()) {
4092 --I;
4093 if (I->isDebugInstr())
4094 continue;
4095 if (I->getOpcode() != X86::JMP_1 &&
4097 break;
4098 // Remove the branch.
4099 I->eraseFromParent();
4100 I = MBB.end();
4101 ++Count;
4102 }
4103
4104 return Count;
4105}
4106
// Presumably X86InstrInfo::insertBranch: append the branch sequence described
// by TBB/FBB/Cond to MBB and return the number of instructions inserted.
// NOTE(review): lines 4107-4108 (function name and leading parameters) and
// 4110 (presumably `ArrayRef<MachineOperand> Cond,`) are missing from this
// dump.
4109 MachineBasicBlock *FBB,
4111 const DebugLoc &DL, int *BytesAdded) const {
4112 // Shouldn't be a fall through.
4113 assert(TBB && "insertBranch must not be told to insert a fallthrough");
4114 assert((Cond.size() == 1 || Cond.size() == 0) &&
4115 "X86 branch conditions have one component!");
4116 assert(!BytesAdded && "code size not handled");
4117
4118 if (Cond.empty()) {
4119 // Unconditional branch?
4120 assert(!FBB && "Unconditional branch with multiple successors!");
4121 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
4122 return 1;
4123 }
4124
4125 // If FBB is null, it is implied to be a fall-through block.
4126 bool FallThru = FBB == nullptr;
4127
4128 // Conditional branch.
4129 unsigned Count = 0;
// NOTE(review): line 4130 (presumably
// `X86::CondCode CC = (X86::CondCode)Cond[0].getImm();`) is missing from
// this dump.
4131 switch (CC) {
4132 case X86::COND_NE_OR_P:
4133 // Synthesize NE_OR_P with two branches.
4134 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
4135 ++Count;
4136 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
4137 ++Count;
4138 break;
4139 case X86::COND_E_AND_NP:
4140 // Use the next block of MBB as FBB if it is null.
4141 if (FBB == nullptr) {
4142 FBB = getFallThroughMBB(&MBB, TBB);
4143 assert(FBB && "MBB cannot be the last block in function when the false "
4144 "body is a fall-through.");
4145 }
4146 // Synthesize COND_E_AND_NP with two branches.
// First jump to the false block on NE, then to the true block on NP; the
// pair implements the composite "equal AND no parity" condition.
4147 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
4148 ++Count;
4149 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
4150 ++Count;
4151 break;
4152 default: {
// Simple condition codes lower to a single JCC.
4153 BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
4154 ++Count;
4155 }
4156 }
4157 if (!FallThru) {
4158 // Two-way Conditional branch. Insert the second branch.
4159 BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
4160 ++Count;
4161 }
4162 return Count;
4163}
4164
// Presumably X86InstrInfo::canInsertSelect: report whether a select on Cond
// between TrueReg/FalseReg can be lowered to a CMOV, and with what latencies.
// NOTE(review): lines 4165-4166 (function name and leading parameters,
// including the Cond parameter) are missing from this dump.
4167 Register DstReg, Register TrueReg,
4168 Register FalseReg, int &CondCycles,
4169 int &TrueCycles, int &FalseCycles) const {
4170 // Not all subtargets have cmov instructions.
4171 if (!Subtarget.canUseCMOV())
4172 return false;
4173 if (Cond.size() != 1)
4174 return false;
4175 // We cannot do the composite conditions, at least not in SSA form.
// NOTE(review): line 4176 (the condition guarding this early return,
// presumably a check that Cond[0] is a simple X86 condition code) is
// missing from this dump.
4177 return false;
4178
4179 // Check register classes.
4180 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Both inputs must share a register class for a single CMOV to apply.
4181 const TargetRegisterClass *RC =
4182 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
4183 if (!RC)
4184 return false;
4185
4186 // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
4187 if (X86::GR16RegClass.hasSubClassEq(RC) ||
4188 X86::GR32RegClass.hasSubClassEq(RC) ||
4189 X86::GR64RegClass.hasSubClassEq(RC)) {
4190 // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
4191 // Bridge. Probably Ivy Bridge as well.
4192 CondCycles = 2;
4193 TrueCycles = 2;
4194 FalseCycles = 2;
4195 return true;
4196 }
4197
4198 // Can't do vectors.
4199 return false;
4200}
4201
// Presumably X86InstrInfo::insertSelect: materialize a select as a CMOV of
// the register's width. Operand order: false value first, true value second,
// condition code as the trailing immediate.
// NOTE(review): lines 4202-4203 (function name and leading parameters) and
// 4205 (presumably `ArrayRef<MachineOperand> Cond, Register TrueReg,`) are
// missing from this dump.
4204 const DebugLoc &DL, Register DstReg,
4206 Register FalseReg) const {
4207 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4208 const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
4209 const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
4210 assert(Cond.size() == 1 && "Invalid Cond array");
// Pick the CMOV opcode by operand size in bytes; NDD selects the APX
// new-data-destination form when available.
4211 unsigned Opc =
4212 X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
4213 false /*HasMemoryOperand*/, Subtarget.hasNDD());
4214 BuildMI(MBB, I, DL, get(Opc), DstReg)
4215 .addReg(FalseReg)
4216 .addReg(TrueReg)
4217 .addImm(Cond[0].getImm());
4218}
4219
4220/// Test if the given register is a physical h register.
4221static bool isHReg(Register Reg) {
4222 return X86::GR8_ABCD_HRegClass.contains(Reg);
4223}
4224
4225// Try and copy between VR128/VR64 and GR64 registers.
4226static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg,
4227 const X86Subtarget &Subtarget) {
4228 bool HasAVX = Subtarget.hasAVX();
4229 bool HasAVX512 = Subtarget.hasAVX512();
4230 bool HasEGPR = Subtarget.hasEGPR();
4231
4232 // SrcReg(MaskReg) -> DestReg(GR64)
4233 // SrcReg(MaskReg) -> DestReg(GR32)
4234
4235 // All KMASK RegClasses hold the same k registers, can be tested against
4236 // anyone.
4237 if (X86::VK16RegClass.contains(SrcReg)) {
4238 if (X86::GR64RegClass.contains(DestReg)) {
4239 assert(Subtarget.hasBWI());
4240 return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk;
4241 }
4242 if (X86::GR32RegClass.contains(DestReg))
4243 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk)
4244 : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk);
4245 }
4246
4247 // SrcReg(GR64) -> DestReg(MaskReg)
4248 // SrcReg(GR32) -> DestReg(MaskReg)
4249
4250 // All KMASK RegClasses hold the same k registers, can be tested against
4251 // anyone.
4252 if (X86::VK16RegClass.contains(DestReg)) {
4253 if (X86::GR64RegClass.contains(SrcReg)) {
4254 assert(Subtarget.hasBWI());
4255 return HasEGPR ? X86::KMOVQkr_EVEX : X86::KMOVQkr;
4256 }
4257 if (X86::GR32RegClass.contains(SrcReg))
4258 return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4259 : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4260 }
4261
4262 // SrcReg(VR128) -> DestReg(GR64)
4263 // SrcReg(VR64) -> DestReg(GR64)
4264 // SrcReg(GR64) -> DestReg(VR128)
4265 // SrcReg(GR64) -> DestReg(VR64)
4266
4267 if (X86::GR64RegClass.contains(DestReg)) {
4268 if (X86::VR128XRegClass.contains(SrcReg))
4269 // Copy from a VR128 register to a GR64 register.
4270 return HasAVX512 ? X86::VMOVPQIto64Zrr
4271 : HasAVX ? X86::VMOVPQIto64rr
4272 : X86::MOVPQIto64rr;
4273 if (X86::VR64RegClass.contains(SrcReg))
4274 // Copy from a VR64 register to a GR64 register.
4275 return X86::MMX_MOVD64from64rr;
4276 } else if (X86::GR64RegClass.contains(SrcReg)) {
4277 // Copy from a GR64 register to a VR128 register.
4278 if (X86::VR128XRegClass.contains(DestReg))
4279 return HasAVX512 ? X86::VMOV64toPQIZrr
4280 : HasAVX ? X86::VMOV64toPQIrr
4281 : X86::MOV64toPQIrr;
4282 // Copy from a GR64 register to a VR64 register.
4283 if (X86::VR64RegClass.contains(DestReg))
4284 return X86::MMX_MOVD64to64rr;
4285 }
4286
4287 // SrcReg(VR128) -> DestReg(GR32)
4288 // SrcReg(GR32) -> DestReg(VR128)
4289
4290 if (X86::GR32RegClass.contains(DestReg) &&
4291 X86::VR128XRegClass.contains(SrcReg))
4292 // Copy from a VR128 register to a GR32 register.
4293 return HasAVX512 ? X86::VMOVPDI2DIZrr
4294 : HasAVX ? X86::VMOVPDI2DIrr
4295 : X86::MOVPDI2DIrr;
4296
4297 if (X86::VR128XRegClass.contains(DestReg) &&
4298 X86::GR32RegClass.contains(SrcReg))
4299 // Copy from a GR32 register to a VR128 register.
4300 return HasAVX512 ? X86::VMOVDI2PDIZrr
4301 : HasAVX ? X86::VMOVDI2PDIrr
4302 : X86::MOVDI2PDIrr;
4303
4304 return 0;
4305}
4306
// Presumably X86InstrInfo::copyPhysReg: emit a register-to-register copy,
// first trying symmetric same-class moves, then asymmetric cross-file moves.
// NOTE(review): lines 4307-4308 (function name and leading parameters) are
// missing from this dump.
4309 const DebugLoc &DL, Register DestReg,
4310 Register SrcReg, bool KillSrc,
4311 bool RenamableDest, bool RenamableSrc) const {
4312 // First deal with the normal symmetric copies.
4313 bool HasAVX = Subtarget.hasAVX();
4314 bool HasVLX = Subtarget.hasVLX();
4315 bool HasEGPR = Subtarget.hasEGPR();
4316 unsigned Opc = 0;
4317 if (X86::GR64RegClass.contains(DestReg, SrcReg))
4318 Opc = X86::MOV64rr;
4319 else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4320 Opc = X86::MOV32rr;
4321 else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4322 Opc = X86::MOV16rr;
4323 else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4324 // Copying to or from a physical H register on x86-64 requires a NOREX
4325 // move. Otherwise use a normal move.
4326 if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4327 Opc = X86::MOV8rr_NOREX;
4328 // Both operands must be encodable without an REX prefix.
4329 assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4330 "8-bit H register can not be copied outside GR8_NOREX");
4331 } else
4332 Opc = X86::MOV8rr;
4333 } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4334 Opc = X86::MMX_MOVQ64rr;
4335 else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4336 if (HasVLX)
4337 Opc = X86::VMOVAPSZ128rr;
4338 else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4339 Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4340 else {
4341 // If this an extended register and we don't have VLX we need to use a
4342 // 512-bit move.
4343 Opc = X86::VMOVAPSZrr;
// NOTE(review): line 4344 (presumably the TargetRegisterInfo declaration
// used by the getMatchingSuperReg calls below) is missing from this dump.
// Widen both operands to their covering ZMM registers.
4345 DestReg =
4346 TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
4347 SrcReg =
4348 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4349 }
4350 } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
4351 if (HasVLX)
4352 Opc = X86::VMOVAPSZ256rr;
4353 else if (X86::VR256RegClass.contains(DestReg, SrcReg))
4354 Opc = X86::VMOVAPSYrr;
4355 else {
4356 // If this an extended register and we don't have VLX we need to use a
4357 // 512-bit move.
4358 Opc = X86::VMOVAPSZrr;
// NOTE(review): line 4359 (presumably the same TargetRegisterInfo
// declaration as in the VR128X branch) is missing from this dump.
4360 DestReg =
4361 TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
4362 SrcReg =
4363 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4364 }
4365 } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
4366 Opc = X86::VMOVAPSZrr;
4367 // All KMASK RegClasses hold the same k registers, can be tested against
4368 // anyone.
4369 else if (X86::VK16RegClass.contains(DestReg, SrcReg))
// NOTE(review): the non-BWI branch selects KMOVQkk_EVEX when EGPR is
// available, while the legacy fallback is the 16-bit KMOVWkk — verify this
// asymmetry is intended (KMOVWkk_EVEX would be the symmetric choice).
4370 Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
4371 : (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVWkk);
4372
// Fall back to cross-register-file copies when no symmetric move matched.
4373 if (!Opc)
4374 Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
4375
4376 if (Opc) {
4377 BuildMI(MBB, MI, DL, get(Opc), DestReg)
4378 .addReg(SrcReg, getKillRegState(KillSrc));
4379 return;
4380 }
4381
4382 if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
4383 // FIXME: We use a fatal error here because historically LLVM has tried
4384 // lower some of these physreg copies and we want to ensure we get
4385 // reasonable bug reports if someone encounters a case no other testing
4386 // found. This path should be removed after the LLVM 7 release.
4387 report_fatal_error("Unable to copy EFLAGS physical register!");
4388 }
4389
4390 LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
4391 << RI.getName(DestReg) << '\n');
4392 report_fatal_error("Cannot emit physreg copy instruction");
4393}
4394
// Presumably X86InstrInfo::isCopyInstrImpl: when MI is a plain register move,
// return its destination/source operand pair; otherwise std::nullopt.
4395 std::optional<DestSourcePair>
// NOTE(review): line 4396 (the function name and parameter list) is missing
// from this dump.
4397 if (MI.isMoveReg()) {
4398 // FIXME: Dirty hack for apparent invariant that doesn't hold when
4399 // subreg_to_reg is coalesced with ordinary copies, such that the bits that
4400 // were asserted as 0 are now undef.
4401 if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg())
4402 return std::nullopt;
4403
// Operand 0 is the destination, operand 1 the source of the move.
4404 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
4405 }
4406 return std::nullopt;
4407}
4408
4409static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
4410 if (STI.hasFP16())
4411 return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
4412 if (Load)
4413 return X86::MOVSHPrm;
4414 return X86::MOVSHPmr;
4415}
4416
// Presumably getLoadStoreRegOpcode: pick the load (Load == true) or store
// opcode for spilling/reloading a register of class RC, keyed on the spill
// size and available ISA extensions.
// NOTE(review): line 4417 (the function name and first parameter) is missing
// from this dump.
4418 const TargetRegisterClass *RC,
4419 bool IsStackAligned,
4420 const X86Subtarget &STI, bool Load) {
4421 bool HasAVX = STI.hasAVX();
4422 bool HasAVX512 = STI.hasAVX512();
4423 bool HasVLX = STI.hasVLX();
4424 bool HasEGPR = STI.hasEGPR();
4425
4426 assert(RC != nullptr && "Invalid target register class");
4427 switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
4428 default:
4429 llvm_unreachable("Unknown spill size");
4430 case 1:
4431 assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
4432 if (STI.is64Bit())
4433 // Copying to or from a physical H register on x86-64 requires a NOREX
4434 // move. Otherwise use a normal move.
4435 if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
4436 return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
4437 return Load ? X86::MOV8rm : X86::MOV8mr;
4438 case 2:
// 2-byte mask registers spill with KMOVW; plain GR16 with MOV16.
4439 if (X86::VK16RegClass.hasSubClassEq(RC))
4440 return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm)
4441 : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
4442 assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
4443 return Load ? X86::MOV16rm : X86::MOV16mr;
4444 case 4:
4445 if (X86::GR32RegClass.hasSubClassEq(RC))
4446 return Load ? X86::MOV32rm : X86::MOV32mr;
4447 if (X86::FR32XRegClass.hasSubClassEq(RC))
4448 return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
4449 : HasAVX ? X86::VMOVSSrm_alt
4450 : X86::MOVSSrm_alt)
4451 : (HasAVX512 ? X86::VMOVSSZmr
4452 : HasAVX ? X86::VMOVSSmr
4453 : X86::MOVSSmr);
4454 if (X86::RFP32RegClass.hasSubClassEq(RC))
4455 return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
4456 if (X86::VK32RegClass.hasSubClassEq(RC)) {
4457 assert(STI.hasBWI() && "KMOVD requires BWI");
4458 return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm)
4459 : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
4460 }
4461 // All of these mask pair classes have the same spill size, the same kind
4462 // of kmov instructions can be used with all of them.
4463 if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
4464 X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
4465 X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
4466 X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
4467 X86::VK16PAIRRegClass.hasSubClassEq(RC))
4468 return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
4469 if (X86::FR16RegClass.hasSubClassEq(RC) ||
4470 X86::FR16XRegClass.hasSubClassEq(RC))
4471 return getLoadStoreOpcodeForFP16(Load, STI);
4472 llvm_unreachable("Unknown 4-byte regclass");
4473 case 8:
4474 if (X86::GR64RegClass.hasSubClassEq(RC))
4475 return Load ? X86::MOV64rm : X86::MOV64mr;
4476 if (X86::FR64XRegClass.hasSubClassEq(RC))
4477 return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
4478 : HasAVX ? X86::VMOVSDrm_alt
4479 : X86::MOVSDrm_alt)
4480 : (HasAVX512 ? X86::VMOVSDZmr
4481 : HasAVX ? X86::VMOVSDmr
4482 : X86::MOVSDmr);
4483 if (X86::VR64RegClass.hasSubClassEq(RC))
4484 return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
4485 if (X86::RFP64RegClass.hasSubClassEq(RC))
4486 return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
4487 if (X86::VK64RegClass.hasSubClassEq(RC)) {
4488 assert(STI.hasBWI() && "KMOVQ requires BWI");
4489 return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm)
4490 : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
4491 }
4492 llvm_unreachable("Unknown 8-byte regclass");
4493 case 10:
4494 assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
4495 return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
4496 case 16: {
4497 if (X86::VR128XRegClass.hasSubClassEq(RC)) {
4498 // If stack is realigned we can use aligned stores.
4499 if (IsStackAligned)
4500 return Load ? (HasVLX ? X86::VMOVAPSZ128rm
4501 : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
4502 : HasAVX ? X86::VMOVAPSrm
4503 : X86::MOVAPSrm)
4504 : (HasVLX ? X86::VMOVAPSZ128mr
4505 : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
4506 : HasAVX ? X86::VMOVAPSmr
4507 : X86::MOVAPSmr);
4508 else
4509 return Load ? (HasVLX ? X86::VMOVUPSZ128rm
4510 : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
4511 : HasAVX ? X86::VMOVUPSrm
4512 : X86::MOVUPSrm)
4513 : (HasVLX ? X86::VMOVUPSZ128mr
4514 : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
4515 : HasAVX ? X86::VMOVUPSmr
4516 : X86::MOVUPSmr);
4517 }
4518 llvm_unreachable("Unknown 16-byte regclass");
4519 }
4520 case 32:
4521 assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
4522 // If stack is realigned we can use aligned stores.
4523 if (IsStackAligned)
4524 return Load ? (HasVLX ? X86::VMOVAPSZ256rm
4525 : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
4526 : X86::VMOVAPSYrm)
4527 : (HasVLX ? X86::VMOVAPSZ256mr
4528 : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
4529 : X86::VMOVAPSYmr);
4530 else
4531 return Load ? (HasVLX ? X86::VMOVUPSZ256rm
4532 : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
4533 : X86::VMOVUPSYrm)
4534 : (HasVLX ? X86::VMOVUPSZ256mr
4535 : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
4536 : X86::VMOVUPSYmr);
4537 case 64:
4538 assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
4539 assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
4540 if (IsStackAligned)
4541 return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
4542 else
4543 return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
4544 case 1024:
// AMX tiles need the special TILELOADD/TILESTORED expansion handled by
// loadStoreTileReg.
4545 assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
4546 assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
4547#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
4548 return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
4549 : GET_EGPR_IF_ENABLED(X86::TILESTORED)
4550#undef GET_EGPR_IF_ENABLED
4551 }
4552}
4553
// Presumably X86InstrInfo::getAddrModeFromMemoryOp: decode the memory operand
// of MemI into an ExtAddrMode (base, index, scale, displacement), or
// std::nullopt when the operand is not a decodable reg+imm form.
4554 std::optional<ExtAddrMode>
// NOTE(review): line 4555 (the function name and first parameter) is missing
// from this dump.
4556 const TargetRegisterInfo *TRI) const {
4557 const MCInstrDesc &Desc = MemI.getDesc();
// Locate the 5-operand x86 memory reference inside the instruction.
4558 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4559 if (MemRefBegin < 0)
4560 return std::nullopt;
4561
4562 MemRefBegin += X86II::getOperandBias(Desc);
4563
4564 auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
4565 if (!BaseOp.isReg()) // Can be an MO_FrameIndex
4566 return std::nullopt;
4567
4568 const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
4569 // Displacement can be symbolic
4570 if (!DispMO.isImm())
4571 return std::nullopt;
4572
4573 ExtAddrMode AM;
4574 AM.BaseReg = BaseOp.getReg();
4575 AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
4576 AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
4577 AM.Displacement = DispMO.getImm();
4578 return AM;
4579}
4580
// Presumably X86InstrInfo::verifyInstruction: validate the addressing mode of
// MI (scale must be 1/2/4/8, displacement must fit in a signed 32-bit
// immediate). Returns true when the instruction is acceptable, false with a
// message in ErrInfo otherwise.
// NOTE(review): line 4581 (the function name and first parameter) is missing
// from this dump.
4582 StringRef &ErrInfo) const {
4583 std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
// Instructions without a decodable memory operand have nothing to check.
4584 if (!AMOrNone)
4585 return true;
4586
4587 ExtAddrMode AM = *AMOrNone;
// NOTE(review): line 4588 is missing from this dump (presumably a blank or
// comment line).
4589 if (AM.ScaledReg != X86::NoRegister) {
4590 switch (AM.Scale) {
4591 case 1:
4592 case 2:
4593 case 4:
4594 case 8:
4595 break;
4596 default:
4597 ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
4598 return false;
4599 }
4600 }
4601 if (!isInt<32>(AM.Displacement)) {
4602 ErrInfo = "Displacement in address must fit into 32-bit signed "
4603 "integer";
4604 return false;
4605 }
4606
4607 return true;
4608}
4609
// Presumably X86InstrInfo::getConstValDefinedInReg: if MI defines Reg with a
// constant (MOV*ri, MOV32r0, possibly through SUBREG_TO_REG), return that
// constant in ImmVal.
// NOTE(review): line 4610 (the function name and first parameter) is missing
// from this dump.
4611 const Register Reg,
4612 int64_t &ImmVal) const {
4613 Register MovReg = Reg;
4614 const MachineInstr *MovMI = &MI;
4615
4616 // Follow use-def for SUBREG_TO_REG to find the real move immediate
4617 // instruction. It is quite common for x86-64.
4618 if (MI.isSubregToReg()) {
4619 // We use following pattern to setup 64b immediate.
4620 // %8:gr32 = MOV32r0 implicit-def dead $eflags
4621 // %6:gr64 = SUBREG_TO_REG 0, killed %8:gr32, %subreg.sub_32bit
4622 if (!MI.getOperand(1).isImm())
4623 return false;
4624 unsigned FillBits = MI.getOperand(1).getImm();
4625 unsigned SubIdx = MI.getOperand(3).getImm();
4626 MovReg = MI.getOperand(2).getReg();
// Only the zero-fill / sub_32bit form is recognized.
4627 if (SubIdx != X86::sub_32bit || FillBits != 0)
4628 return false;
4629 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4630 MovMI = MRI.getUniqueVRegDef(MovReg);
4631 if (!MovMI)
4632 return false;
4633 }
4634
// MOV32r0 is the canonical zero idiom.
4635 if (MovMI->getOpcode() == X86::MOV32r0 &&
4636 MovMI->getOperand(0).getReg() == MovReg) {
4637 ImmVal = 0;
4638 return true;
4639 }
4640
4641 if (MovMI->getOpcode() != X86::MOV32ri &&
4642 MovMI->getOpcode() != X86::MOV64ri &&
4643 MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri)
4644 return false;
4645 // Mov Src can be a global address.
4646 if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
4647 return false;
4648 ImmVal = MovMI->getOperand(1).getImm();
4649 return true;
4650}
4651
// Presumably X86InstrInfo::preservesZeroValueInReg: return true when MI
// cannot change the value of NullValueReg given that it currently holds zero
// (either MI does not modify it, or the modification is a no-op on zero).
// NOTE(review): line 4652 (the function name) is missing from this dump.
4653 const MachineInstr *MI, const Register NullValueReg,
4654 const TargetRegisterInfo *TRI) const {
4655 if (!MI->modifiesRegister(NullValueReg, TRI))
4656 return true;
4657 switch (MI->getOpcode()) {
4658 // Shift right/left of a null unto itself is still a null, i.e. rax = shl rax
4659 // X.
4660 case X86::SHR64ri:
4661 case X86::SHR32ri:
4662 case X86::SHL64ri:
4663 case X86::SHL32ri:
4664 assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
4665 "expected for shift opcode!");
4666 return MI->getOperand(0).getReg() == NullValueReg &&
4667 MI->getOperand(1).getReg() == NullValueReg;
4668 // Zero extend of a sub-reg of NullValueReg into itself does not change the
4669 // null value.
4670 case X86::MOV32rr:
// Every operand must stay within (a sub-register of) NullValueReg.
4671 return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
4672 return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
4673 });
4674 default:
4675 return false;
4676 }
4677 llvm_unreachable("Should be handled above!");
4678}
4679
// Presumably X86InstrInfo::getMemOperandsWithOffsetWidth: for a simple
// base+displacement memory access (scale 1, no index), report the base
// operand, immediate offset and access width.
// NOTE(review): lines 4680-4681 (the function name and leading parameters,
// including BaseOps) are missing from this dump.
4682 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
4683 const TargetRegisterInfo *TRI) const {
4684 const MCInstrDesc &Desc = MemOp.getDesc();
4685 int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4686 if (MemRefBegin < 0)
4687 return false;
4688
4689 MemRefBegin += X86II::getOperandBias(Desc);
4690
4691 const MachineOperand *BaseOp =
4692 &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
4693 if (!BaseOp->isReg()) // Can be an MO_FrameIndex
4694 return false;
4695
// Only plain base+disp addressing is representable: reject scaled/indexed
// forms.
4696 if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
4697 return false;
4698
4699 if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
4700 X86::NoRegister)
4701 return false;
4702
4703 const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
4704
4705 // Displacement can be symbolic
4706 if (!DispMO.isImm())
4707 return false;
4708
4709 Offset = DispMO.getImm();
4710
// NOTE(review): BaseOp->isReg() was already checked above; this re-check is
// redundant as written.
4711 if (!BaseOp->isReg())
4712 return false;
4713
4714 OffsetIsScalable = false;
4715 // FIXME: Relying on memoperands() may not be right thing to do here. Check
4716 // with X86 maintainers, and fix it accordingly. For now, it is ok, since
4717 // there is no use of `Width` for X86 back-end at the moment.
// NOTE(review): line 4719 (the else-branch of this conditional, presumably
// `: LocationSize::beforeOrAfterPointer();`) is missing from this dump.
4718 Width = !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize()
4720 BaseOps.push_back(BaseOp);
4721 return true;
4722}
4723
4724static unsigned getStoreRegOpcode(Register SrcReg,
4725 const TargetRegisterClass *RC,
4726 bool IsStackAligned,
4727 const X86Subtarget &STI) {
4728 return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
4729}
4730
4731static unsigned getLoadRegOpcode(Register DestReg,
4732 const TargetRegisterClass *RC,
4733 bool IsStackAligned, const X86Subtarget &STI) {
4734 return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
4735}
4736
4737static bool isAMXOpcode(unsigned Opc) {
4738 switch (Opc) {
4739 default:
4740 return false;
4741 case X86::TILELOADD:
4742 case X86::TILESTORED:
4743 case X86::TILELOADD_EVEX:
4744 case X86::TILESTORED_EVEX:
4745 return true;
4746 }
4747}
4748
// Presumably X86InstrInfo::loadStoreTileReg: expand an AMX tile spill/reload.
// TILELOADD/TILESTORED require a stride register, so a virtual GR64_NOSP
// register is materialized with the constant 64 and patched into the index
// operand of the frame reference.
// NOTE(review): lines 4749-4750 (the function name and leading parameters)
// are missing from this dump.
4751 unsigned Opc, Register Reg, int FrameIdx,
4752 bool isKill) const {
4753 switch (Opc) {
4754 default:
4755 llvm_unreachable("Unexpected special opcode!");
4756 case X86::TILESTORED:
4757 case X86::TILESTORED_EVEX: {
4758 // tilestored %tmm, (%sp, %idx)
4759 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4760 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
// Stride of 64 bytes per tile row.
4761 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4762 MachineInstr *NewMI =
4763 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4764 .addReg(Reg, getKillRegState(isKill));
// NOTE(review): line 4765 (presumably the declaration of MO referencing the
// index operand of NewMI) is missing from this dump.
4766 MO.setReg(VirtReg);
4767 MO.setIsKill(true);
4768 break;
4769 }
4770 case X86::TILELOADD:
4771 case X86::TILELOADD_EVEX: {
4772 // tileloadd (%sp, %idx), %tmm
4773 MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4774 Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4775 BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
// NOTE(review): lines 4776 and 4778 (presumably the `addFrameReference(`
// call head and the declaration of MO for the index operand) are missing
// from this dump.
4777 BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
4779 MO.setReg(VirtReg);
4780 MO.setIsKill(true);
4781 break;
4782 }
4783 }
4784}
4785
// Presumably X86InstrInfo::storeRegToStackSlot: spill SrcReg to the stack
// slot FrameIdx, routing AMX tiles through loadStoreTileReg.
// NOTE(review): lines 4786-4787 (the function name and leading parameters,
// including SrcReg) are missing from this dump.
4788 bool isKill, int FrameIdx, const TargetRegisterClass *RC,
4789
4790 Register VReg, MachineInstr::MIFlag Flags) const {
4791 const MachineFunction &MF = *MBB.getParent();
4792 const MachineFrameInfo &MFI = MF.getFrameInfo();
4793 assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
4794 "Stack slot too small for store");
4795
// A slot is treated as aligned if the frame guarantees at least
// max(spill size, 16) bytes of alignment, or the stack can be realigned.
4796 unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
4797 bool isAligned =
4798 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4799 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4800
4801 unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
4802 if (isAMXOpcode(Opc))
4803 loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
4804 else
4805 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4806 .addReg(SrcReg, getKillRegState(isKill))
4807 .setMIFlag(Flags);
4808}
4809
// Presumably X86InstrInfo::loadRegFromStackSlot: reload DestReg from the
// stack slot FrameIdx, routing AMX tiles through loadStoreTileReg.
// NOTE(review): lines 4810-4811 (the function name and leading parameters)
// are missing from this dump.
4812 Register DestReg, int FrameIdx,
4813 const TargetRegisterClass *RC,
4814 Register VReg,
4815 MachineInstr::MIFlag Flags) const {
4816 const MachineFunction &MF = *MBB.getParent();
4817 const MachineFrameInfo &MFI = MF.getFrameInfo();
4818 assert(MFI.getObjectSize(FrameIdx) >= RI.getSpillSize(*RC) &&
4819 "Load size exceeds stack slot");
// Same alignment policy as storeRegToStackSlot.
4820 unsigned Alignment = std::max<uint32_t>(RI.getSpillSize(*RC), 16);
4821 bool isAligned =
4822 (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4823 (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4824
4825 unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
4826 if (isAMXOpcode(Opc))
4827 loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
4828 else
4829 addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx)
4830 .setMIFlag(Flags);
4831}
4832
// Presumably X86InstrInfo::analyzeCompare: decompose an EFLAGS-setting
// compare-like instruction (CMP/SUB/TEST) into its source registers and an
// immediate mask/value. CmpMask == ~0 marks an immediate compare; mask and
// value 0 mean the operands alone describe the comparison.
// NOTE(review): line 4833 (the function name and first parameters) is
// missing from this dump.
4834 Register &SrcReg2, int64_t &CmpMask,
4835 int64_t &CmpValue) const {
4836 switch (MI.getOpcode()) {
4837 default:
4838 break;
4839 case X86::CMP64ri32:
4840 case X86::CMP32ri:
4841 case X86::CMP16ri:
4842 case X86::CMP8ri:
4843 SrcReg = MI.getOperand(0).getReg();
4844 SrcReg2 = 0;
4845 if (MI.getOperand(1).isImm()) {
4846 CmpMask = ~0;
4847 CmpValue = MI.getOperand(1).getImm();
4848 } else {
4849 CmpMask = CmpValue = 0;
4850 }
4851 return true;
4852 // A SUB can be used to perform comparison.
// For SUB forms, operand 0 is the def; the compared register is operand 1.
4853 CASE_ND(SUB64rm)
4854 CASE_ND(SUB32rm)
4855 CASE_ND(SUB16rm)
4856 CASE_ND(SUB8rm)
4857 SrcReg = MI.getOperand(1).getReg();
4858 SrcReg2 = 0;
4859 CmpMask = 0;
4860 CmpValue = 0;
4861 return true;
4862 CASE_ND(SUB64rr)
4863 CASE_ND(SUB32rr)
4864 CASE_ND(SUB16rr)
4865 CASE_ND(SUB8rr)
4866 SrcReg = MI.getOperand(1).getReg();
4867 SrcReg2 = MI.getOperand(2).getReg();
4868 CmpMask = 0;
4869 CmpValue = 0;
4870 return true;
4871 CASE_ND(SUB64ri32)
4872 CASE_ND(SUB32ri)
4873 CASE_ND(SUB16ri)
4874 CASE_ND(SUB8ri)
4875 SrcReg = MI.getOperand(1).getReg();
4876 SrcReg2 = 0;
4877 if (MI.getOperand(2).isImm()) {
4878 CmpMask = ~0;
4879 CmpValue = MI.getOperand(2).getImm();
4880 } else {
4881 CmpMask = CmpValue = 0;
4882 }
4883 return true;
4884 case X86::CMP64rr:
4885 case X86::CMP32rr:
4886 case X86::CMP16rr:
4887 case X86::CMP8rr:
4888 SrcReg = MI.getOperand(0).getReg();
4889 SrcReg2 = MI.getOperand(1).getReg();
4890 CmpMask = 0;
4891 CmpValue = 0;
4892 return true;
4893 case X86::TEST8rr:
4894 case X86::TEST16rr:
4895 case X86::TEST32rr:
4896 case X86::TEST64rr:
4897 SrcReg = MI.getOperand(0).getReg();
// Only the reg,same-reg form of TEST is a zero compare.
4898 if (MI.getOperand(1).getReg() != SrcReg)
4899 return false;
4900 // Compare against zero.
4901 SrcReg2 = 0;
4902 CmpMask = ~0;
4903 CmpValue = 0;
4904 return true;
4905 case X86::TEST64ri32:
4906 case X86::TEST32ri:
4907 case X86::TEST16ri:
4908 case X86::TEST8ri:
4909 SrcReg = MI.getOperand(0).getReg();
4910 SrcReg2 = 0;
4911 // Force identical compare.
4912 CmpMask = 0;
4913 CmpValue = 0;
4914 return true;
4915 }
4916 return false;
4917}
4918
// Decide whether the EFLAGS produced by OI make the flag-setting instruction
// FlagI redundant. On success, *IsSwapped reports that OI compares the same
// registers in the opposite order, and *ImmDelta reports that OI's immediate
// differs from FlagI's by 0, -1 or +1 (so condition codes can be adjusted).
4919bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4920 Register SrcReg, Register SrcReg2,
4921 int64_t ImmMask, int64_t ImmValue,
4922 const MachineInstr &OI, bool *IsSwapped,
4923 int64_t *ImmDelta) const {
4924 switch (OI.getOpcode()) {
// Register-register compares/subs: match if the operands are the same pair,
// possibly in swapped order.
4925 case X86::CMP64rr:
4926 case X86::CMP32rr:
4927 case X86::CMP16rr:
4928 case X86::CMP8rr:
4929 CASE_ND(SUB64rr)
4930 CASE_ND(SUB32rr)
4931 CASE_ND(SUB16rr)
4932 CASE_ND(SUB8rr) {
4933 Register OISrcReg;
4934 Register OISrcReg2;
4935 int64_t OIMask;
4936 int64_t OIValue;
4937 if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4938 OIMask != ImmMask || OIValue != ImmValue)
4939 return false;
4940 if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4941 *IsSwapped = false;
4942 return true;
4943 }
4944 if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4945 *IsSwapped = true;
4946 return true;
4947 }
4948 return false;
4949 }
// Immediate compares/tests: match on the same source register when the
// immediates are equal or differ by exactly one (reported via ImmDelta).
4950 case X86::CMP64ri32:
4951 case X86::CMP32ri:
4952 case X86::CMP16ri:
4953 case X86::CMP8ri:
4954 case X86::TEST64ri32:
4955 case X86::TEST32ri:
4956 case X86::TEST16ri:
4957 case X86::TEST8ri:
4958 CASE_ND(SUB64ri32)
4959 CASE_ND(SUB32ri)
4960 CASE_ND(SUB16ri)
4961 CASE_ND(SUB8ri)
4962 case X86::TEST64rr:
4963 case X86::TEST32rr:
4964 case X86::TEST16rr:
4965 case X86::TEST8rr: {
4966 if (ImmMask != 0) {
4967 Register OISrcReg;
4968 Register OISrcReg2;
4969 int64_t OIMask;
4970 int64_t OIValue;
4971 if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4972 SrcReg == OISrcReg && ImmMask == OIMask) {
4973 if (OIValue == ImmValue) {
4974 *ImmDelta = 0;
4975 return true;
// Unsigned arithmetic so that wrap-around at the type boundary still
// counts as an off-by-one immediate.
4976 } else if (static_cast<uint64_t>(ImmValue) ==
4977 static_cast<uint64_t>(OIValue) - 1) {
4978 *ImmDelta = -1;
4979 return true;
4980 } else if (static_cast<uint64_t>(ImmValue) ==
4981 static_cast<uint64_t>(OIValue) + 1) {
4982 *ImmDelta = 1;
4983 return true;
4984 } else {
4985 return false;
4986 }
4987 }
4988 }
// With no immediate (ImmMask == 0) only a byte-for-byte identical
// instruction is redundant.
4989 return FlagI.isIdenticalTo(OI);
4990 }
4991 default:
4992 return false;
4993 }
4994}
4995
4996/// Check whether the definition can be converted
4997/// to remove a comparison against zero.
4998inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4999 bool &ClearsOverflowFlag) {
5000 NoSignFlag = false;
5001 ClearsOverflowFlag = false;
5002
5003 // "ELF Handling for Thread-Local Storage" specifies that x86-64 GOTTPOFF, and
5004 // i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA during
5005 // Initial Exec to Local Exec relaxation. In these cases, we must not depend
5006 // on the EFLAGS modification of ADD actually happening in the final binary.
5007 if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
5008 unsigned Flags = MI.getOperand(5).getTargetFlags();
5009 if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
5010 Flags == X86II::MO_GOTNTPOFF)
5011 return false;
5012 }
5013
5014 switch (MI.getOpcode()) {
5015 default:
5016 return false;
5017
5018 // The shift instructions only modify ZF if their shift count is non-zero.
5019 // N.B.: The processor truncates the shift count depending on the encoding.
5020 CASE_ND(SAR8ri)
5021 CASE_ND(SAR16ri)
5022 CASE_ND(SAR32ri)
5023 CASE_ND(SAR64ri)
5024 CASE_ND(SHR8ri)
5025 CASE_ND(SHR16ri)
5026 CASE_ND(SHR32ri)
5027 CASE_ND(SHR64ri)
5028 return getTruncatedShiftCount(MI, 2) != 0;
5029
5030 // Some left shift instructions can be turned into LEA instructions but only
5031 // if their flags aren't used. Avoid transforming such instructions.
5032 CASE_ND(SHL8ri)
5033 CASE_ND(SHL16ri)
5034 CASE_ND(SHL32ri)
5035 CASE_ND(SHL64ri) {
5036 unsigned ShAmt = getTruncatedShiftCount(MI, 2);
5037 if (isTruncatedShiftCountForLEA(ShAmt))
5038 return false;
5039 return ShAmt != 0;
5040 }
5041
5042 CASE_ND(SHRD16rri8)
5043 CASE_ND(SHRD32rri8)
5044 CASE_ND(SHRD64rri8)
5045 CASE_ND(SHLD16rri8)
5046 CASE_ND(SHLD32rri8)
5047 CASE_ND(SHLD64rri8)
5048 return getTruncatedShiftCount(MI, 3) != 0;
5049
5050 CASE_ND(SUB64ri32)
5051 CASE_ND(SUB32ri)
5052 CASE_ND(SUB16ri)
5053 CASE_ND(SUB8ri)
5054 CASE_ND(SUB64rr)
5055 CASE_ND(SUB32rr)
5056 CASE_ND(SUB16rr)
5057 CASE_ND(SUB8rr)
5058 CASE_ND(SUB64rm)
5059 CASE_ND(SUB32rm)
5060 CASE_ND(SUB16rm)
5061 CASE_ND(SUB8rm)
5062 CASE_ND(DEC64r)
5063 CASE_ND(DEC32r)
5064 CASE_ND(DEC16r)
5065 CASE_ND(DEC8r)
5066 CASE_ND(ADD64ri32)
5067 CASE_ND(ADD32ri)
5068 CASE_ND(ADD16ri)
5069 CASE_ND(ADD8ri)
5070 CASE_ND(ADD64rr)
5071 CASE_ND(ADD32rr)
5072 CASE_ND(ADD16rr)
5073 CASE_ND(ADD8rr)
5074 CASE_ND(ADD64rm)
5075 CASE_ND(ADD32rm)
5076 CASE_ND(ADD16rm)
5077 CASE_ND(ADD8rm)
5078 CASE_ND(INC64r)
5079 CASE_ND(INC32r)
5080 CASE_ND(INC16r)
5081 CASE_ND(INC8r)
5082 CASE_ND(ADC64ri32)
5083 CASE_ND(ADC32ri)
5084 CASE_ND(ADC16ri)
5085 CASE_ND(ADC8ri)
5086 CASE_ND(ADC64rr)
5087 CASE_ND(ADC32rr)
5088 CASE_ND(ADC16rr)
5089 CASE_ND(ADC8rr)
5090 CASE_ND(ADC64rm)
5091 CASE_ND(ADC32rm)
5092 CASE_ND(ADC16rm)
5093 CASE_ND(ADC8rm)
5094 CASE_ND(SBB64ri32)
5095 CASE_ND(SBB32ri)
5096 CASE_ND(SBB16ri)
5097 CASE_ND(SBB8ri)
5098 CASE_ND(SBB64rr)
5099 CASE_ND(SBB32rr)
5100 CASE_ND(SBB16rr)
5101 CASE_ND(SBB8rr)
5102 CASE_ND(SBB64rm)
5103 CASE_ND(SBB32rm)
5104 CASE_ND(SBB16rm)
5105 CASE_ND(SBB8rm)
5106 CASE_ND(NEG8r)
5107 CASE_ND(NEG16r)
5108 CASE_ND(NEG32r)
5109 CASE_ND(NEG64r)
5110 case X86::LZCNT16rr:
5111 case X86::LZCNT16rm:
5112 case X86::LZCNT32rr:
5113 case X86::LZCNT32rm:
5114 case X86::LZCNT64rr:
5115 case X86::LZCNT64rm:
5116 case X86::POPCNT16rr:
5117 case X86::POPCNT16rm:
5118 case X86::POPCNT32rr:
5119 case X86::POPCNT32rm:
5120 case X86::POPCNT64rr:
5121 case X86::POPCNT64rm:
5122 case X86::TZCNT16rr:
5123 case X86::TZCNT16rm:
5124 case X86::TZCNT32rr:
5125 case X86::TZCNT32rm:
5126 case X86::TZCNT64rr:
5127 case X86::TZCNT64rm:
5128 return true;
5129 CASE_ND(AND64ri32)
5130 CASE_ND(AND32ri)
5131 CASE_ND(AND16ri)
5132 CASE_ND(AND8ri)
5133 CASE_ND(AND64rr)
5134 CASE_ND(AND32rr)
5135 CASE_ND(AND16rr)
5136 CASE_ND(AND8rr)
5137 CASE_ND(AND64rm)
5138 CASE_ND(AND32rm)
5139 CASE_ND(AND16rm)
5140 CASE_ND(AND8rm)
5141 CASE_ND(XOR64ri32)
5142 CASE_ND(XOR32ri)
5143 CASE_ND(XOR16ri)
5144 CASE_ND(XOR8ri)
5145 CASE_ND(XOR64rr)
5146 CASE_ND(XOR32rr)
5147 CASE_ND(XOR16rr)
5148 CASE_ND(XOR8rr)
5149 CASE_ND(XOR64rm)
5150 CASE_ND(XOR32rm)
5151 CASE_ND(XOR16rm)
5152 CASE_ND(XOR8rm)
5153 CASE_ND(OR64ri32)
5154 CASE_ND(OR32ri)
5155 CASE_ND(OR16ri)
5156 CASE_ND(OR8ri)
5157 CASE_ND(OR64rr)
5158 CASE_ND(OR32rr)
5159 CASE_ND(OR16rr)
5160 CASE_ND(OR8rr)
5161 CASE_ND(OR64rm)
5162 CASE_ND(OR32rm)
5163 CASE_ND(OR16rm)
5164 CASE_ND(OR8rm)
5165 case X86::ANDN32rr:
5166 case X86::ANDN32rm:
5167 case X86::ANDN64rr:
5168 case X86::ANDN64rm:
5169 case X86::BLSI32rr:
5170 case X86::BLSI32rm:
5171 case X86::BLSI64rr:
5172 case X86::BLSI64rm:
5173 case X86::BLSMSK32rr:
5174 case X86::BLSMSK32rm:
5175 case X86::BLSMSK64rr:
5176 case X86::BLSMSK64rm:
5177 case X86::BLSR32rr:
5178 case X86::BLSR32rm:
5179 case X86::BLSR64rr:
5180 case X86::BLSR64rm:
5181 case X86::BLCFILL32rr:
5182 case X86::BLCFILL32rm:
5183 case X86::BLCFILL64rr:
5184 case X86::BLCFILL64rm:
5185 case X86::BLCI32rr:
5186 case X86::BLCI32rm:
5187 case X86::BLCI64rr:
5188 case X86::BLCI64rm:
5189 case X86::BLCIC32rr:
5190 case X86::BLCIC32rm:
5191 case X86::BLCIC64rr:
5192 case X86::BLCIC64rm:
5193 case X86::BLCMSK32rr:
5194 case X86::BLCMSK32rm:
5195 case X86::BLCMSK64rr:
5196 case X86::BLCMSK64rm:
5197 case X86::BLCS32rr:
5198 case X86::BLCS32rm:
5199 case X86::BLCS64rr:
5200 case X86::BLCS64rm:
5201 case X86::BLSFILL32rr:
5202 case X86::BLSFILL32rm:
5203 case X86::BLSFILL64rr:
5204 case X86::BLSFILL64rm:
5205 case X86::BLSIC32rr:
5206 case X86::BLSIC32rm:
5207 case X86::BLSIC64rr:
5208 case X86::BLSIC64rm:
5209 case X86::BZHI32rr:
5210 case X86::BZHI32rm:
5211 case X86::BZHI64rr:
5212 case X86::BZHI64rm:
5213 case X86::T1MSKC32rr:
5214 case X86::T1MSKC32rm:
5215 case X86::T1MSKC64rr:
5216 case X86::T1MSKC64rm:
5217 case X86::TZMSK32rr:
5218 case X86::TZMSK32rm:
5219 case X86::TZMSK64rr:
5220 case X86::TZMSK64rm:
5221 // These instructions clear the overflow flag just like TEST.
5222 // FIXME: These are not the only instructions in this switch that clear the
5223 // overflow flag.
5224 ClearsOverflowFlag = true;
5225 return true;
5226 case X86::BEXTR32rr:
5227 case X86::BEXTR64rr:
5228 case X86::BEXTR32rm:
5229 case X86::BEXTR64rm:
5230 case X86::BEXTRI32ri:
5231 case X86::BEXTRI32mi:
5232 case X86::BEXTRI64ri:
5233 case X86::BEXTRI64mi:
5234 // BEXTR doesn't update the sign flag so we can't use it. It does clear
5235 // the overflow flag, but that's not useful without the sign flag.
5236 NoSignFlag = true;
5237 return true;
5238 }
5239}
5240
5241/// Check whether the use can be converted to remove a comparison against zero.
5242/// Returns the EFLAGS condition and the operand that we are comparing against zero.
5243static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
5244 switch (MI.getOpcode()) {
5245 default:
5246 return std::make_pair(X86::COND_INVALID, ~0U);
5247 CASE_ND(NEG8r)
5248 CASE_ND(NEG16r)
5249 CASE_ND(NEG32r)
5250 CASE_ND(NEG64r)
5251 return std::make_pair(X86::COND_AE, 1U);
5252 case X86::LZCNT16rr:
5253 case X86::LZCNT32rr:
5254 case X86::LZCNT64rr:
5255 return std::make_pair(X86::COND_B, 1U);
5256 case X86::POPCNT16rr:
5257 case X86::POPCNT32rr:
5258 case X86::POPCNT64rr:
5259 return std::make_pair(X86::COND_E, 1U);
5260 case X86::TZCNT16rr:
5261 case X86::TZCNT32rr:
5262 case X86::TZCNT64rr:
5263 return std::make_pair(X86::COND_B, 1U);
5264 case X86::BSF16rr:
5265 case X86::BSF32rr:
5266 case X86::BSF64rr:
5267 case X86::BSR16rr:
5268 case X86::BSR32rr:
5269 case X86::BSR64rr:
5270 return std::make_pair(X86::COND_E, 2U);
5271 case X86::BLSI32rr:
5272 case X86::BLSI64rr:
5273 return std::make_pair(X86::COND_AE, 1U);
5274 case X86::BLSR32rr:
5275 case X86::BLSR64rr:
5276 case X86::BLSMSK32rr:
5277 case X86::BLSMSK64rr:
5278 return std::make_pair(X86::COND_B, 1U);
5279 // TODO: TBM instructions.
5280 }
5281}
5282
5283/// Check if there exists an earlier instruction that
5284/// operates on the same source operands and sets flags in the same way as
5285/// Compare; remove Compare if possible.
5287 Register SrcReg2, int64_t CmpMask,
5288 int64_t CmpValue,
5289 const MachineRegisterInfo *MRI) const {
5290 // Check whether we can replace SUB with CMP.
5291 switch (CmpInstr.getOpcode()) {
5292 default:
5293 break;
5294 CASE_ND(SUB64ri32)
5295 CASE_ND(SUB32ri)
5296 CASE_ND(SUB16ri)
5297 CASE_ND(SUB8ri)
5298 CASE_ND(SUB64rm)
5299 CASE_ND(SUB32rm)
5300 CASE_ND(SUB16rm)
5301 CASE_ND(SUB8rm)
5302 CASE_ND(SUB64rr)
5303 CASE_ND(SUB32rr)
5304 CASE_ND(SUB16rr)
5305 CASE_ND(SUB8rr) {
5306 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
5307 return false;
5308 // There is no use of the destination register, we can replace SUB with CMP.
5309 unsigned NewOpcode = 0;
5310#define FROM_TO(A, B) \
5311 CASE_ND(A) NewOpcode = X86::B; \
5312 break;
5313 switch (CmpInstr.getOpcode()) {
5314 default:
5315 llvm_unreachable("Unreachable!");
5316 FROM_TO(SUB64rm, CMP64rm)
5317 FROM_TO(SUB32rm, CMP32rm)
5318 FROM_TO(SUB16rm, CMP16rm)
5319 FROM_TO(SUB8rm, CMP8rm)
5320 FROM_TO(SUB64rr, CMP64rr)
5321 FROM_TO(SUB32rr, CMP32rr)
5322 FROM_TO(SUB16rr, CMP16rr)
5323 FROM_TO(SUB8rr, CMP8rr)
5324 FROM_TO(SUB64ri32, CMP64ri32)
5325 FROM_TO(SUB32ri, CMP32ri)
5326 FROM_TO(SUB16ri, CMP16ri)
5327 FROM_TO(SUB8ri, CMP8ri)
5328 }
5329#undef FROM_TO
5330 CmpInstr.setDesc(get(NewOpcode));
5331 CmpInstr.removeOperand(0);
5332 // Mutating this instruction invalidates any debug data associated with it.
5333 CmpInstr.dropDebugNumber();
5334 // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
5335 if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
5336 NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
5337 return false;
5338 }
5339 }
5340
5341 // The following code tries to remove the comparison by re-using EFLAGS
5342 // from earlier instructions.
5343
5344 bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
5345
5346 // Transformation currently requires SSA values.
5347 if (SrcReg2.isPhysical())
5348 return false;
5349 MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
5350 assert(SrcRegDef && "Must have a definition (SSA)");
5351
5352 MachineInstr *MI = nullptr;
5353 MachineInstr *Sub = nullptr;
5354 MachineInstr *Movr0Inst = nullptr;
5356 bool NoSignFlag = false;
5357 bool ClearsOverflowFlag = false;
5358 bool ShouldUpdateCC = false;
5359 bool IsSwapped = false;
5360 bool HasNF = Subtarget.hasNF();
5361 unsigned OpNo = 0;
5363 int64_t ImmDelta = 0;
5364
5365 // Search backward from CmpInstr for the next instruction defining EFLAGS.
5367 MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
5369 std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
5370 for (MachineBasicBlock *MBB = &CmpMBB;;) {
5371 for (MachineInstr &Inst : make_range(From, MBB->rend())) {
5372 // Try to use EFLAGS from the instruction defining %SrcReg. Example:
5373 // %eax = addl ...
5374 // ... // EFLAGS not changed
5375 // testl %eax, %eax // <-- can be removed
5376 if (&Inst == SrcRegDef) {
5377 if (IsCmpZero &&
5378 isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
5379 MI = &Inst;
5380 break;
5381 }
5382
5383 // Look back for the following pattern, in which case the
5384 // test16rr/test64rr instruction could be erased.
5385 //
5386 // Example for test16rr:
5387 // %reg = and32ri %in_reg, 5
5388 // ... // EFLAGS not changed.
5389 // %src_reg = copy %reg.sub_16bit:gr32
5390 // test16rr %src_reg, %src_reg, implicit-def $eflags
5391 // Example for test64rr:
5392 // %reg = and32ri %in_reg, 5
5393 // ... // EFLAGS not changed.
5394 // %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index
5395 // test64rr %src_reg, %src_reg, implicit-def $eflags
5396 MachineInstr *AndInstr = nullptr;
5397 if (IsCmpZero &&
5398 findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
5399 Subtarget, NoSignFlag, ClearsOverflowFlag)) {
5400 assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
5401 MI = AndInstr;
5402 break;
5403 }
5404 // Cannot find other candidates before definition of SrcReg.
5405 return false;
5406 }
5407
5408 if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
5409 // Try to use EFLAGS produced by an instruction reading %SrcReg.
5410 // Example:
5411 // %eax = ...
5412 // ...
5413 // popcntl %eax
5414 // ... // EFLAGS not changed
5415 // testl %eax, %eax // <-- can be removed
5416 if (IsCmpZero) {
5417 std::tie(NewCC, OpNo) = isUseDefConvertible(Inst);
5418 if (NewCC != X86::COND_INVALID && Inst.getOperand(OpNo).isReg() &&
5419 Inst.getOperand(OpNo).getReg() == SrcReg) {
5420 ShouldUpdateCC = true;
5421 MI = &Inst;
5422 break;
5423 }
5424 }
5425
5426 // Try to use EFLAGS from an instruction with similar flag results.
5427 // Example:
5428 // sub x, y or cmp x, y
5429 // ... // EFLAGS not changed
5430 // cmp x, y // <-- can be removed
5431 if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
5432 Inst, &IsSwapped, &ImmDelta)) {
5433 Sub = &Inst;
5434 break;
5435 }
5436
5437 // MOV32r0 is implemented with xor which clobbers condition code. It is
5438 // safe to move up, if the definition to EFLAGS is dead and earlier
5439 // instructions do not read or write EFLAGS.
5440 if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
5441 Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
5442 Movr0Inst = &Inst;
5443 continue;
5444 }
5445
5446 // For the instructions are ADDrm/ADDmr with relocation, we'll skip the
5447 // optimization for replacing non-NF with NF. This is to keep backward
5448 // compatiblity with old version of linkers without APX relocation type
5449 // support on Linux OS.
5450 bool IsWithReloc = X86EnableAPXForRelocation
5451 ? false
5453
5454 // Try to replace non-NF with NF instructions.
5455 if (HasNF && Inst.registerDefIsDead(X86::EFLAGS, TRI) && !IsWithReloc) {
5456 unsigned NewOp = X86::getNFVariant(Inst.getOpcode());
5457 if (!NewOp)
5458 return false;
5459
5460 InstsToUpdate.push_back(std::make_pair(&Inst, NewOp));
5461 continue;
5462 }
5463
5464 // Cannot do anything for any other EFLAG changes.
5465 return false;
5466 }
5467 }
5468
5469 if (MI || Sub)
5470 break;
5471
5472 // Reached begin of basic block. Continue in predecessor if there is
5473 // exactly one.
5474 if (MBB->pred_size() != 1)
5475 return false;
5476 MBB = *MBB->pred_begin();
5477 From = MBB->rbegin();
5478 }
5479
5480 // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
5481 // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
5482 // If we are done with the basic block, we need to check whether EFLAGS is
5483 // live-out.
5484 bool FlagsMayLiveOut = true;
5486 MachineBasicBlock::iterator AfterCmpInstr =
5487 std::next(MachineBasicBlock::iterator(CmpInstr));
5488 for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
5489 bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
5490 bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
5491 // We should check the usage if this instruction uses and updates EFLAGS.
5492 if (!UseEFLAGS && ModifyEFLAGS) {
5493 // It is safe to remove CmpInstr if EFLAGS is updated again.
5494 FlagsMayLiveOut = false;
5495 break;
5496 }
5497 if (!UseEFLAGS && !ModifyEFLAGS)
5498 continue;
5499
5500 // EFLAGS is used by this instruction.
5501 X86::CondCode OldCC = X86::getCondFromMI(Instr);
5502 if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
5503 return false;
5504
5505 X86::CondCode ReplacementCC = X86::COND_INVALID;
5506 if (MI) {
5507 switch (OldCC) {
5508 default:
5509 break;
5510 case X86::COND_A:
5511 case X86::COND_AE:
5512 case X86::COND_B:
5513 case X86::COND_BE:
5514 // CF is used, we can't perform this optimization.
5515 return false;
5516 case X86::COND_G:
5517 case X86::COND_GE:
5518 case X86::COND_L:
5519 case X86::COND_LE:
5520 // If SF is used, but the instruction doesn't update the SF, then we
5521 // can't do the optimization.
5522 if (NoSignFlag)
5523 return false;
5524 [[fallthrough]];
5525 case X86::COND_O:
5526 case X86::COND_NO:
5527 // If OF is used, the instruction needs to clear it like CmpZero does.
5528 if (!ClearsOverflowFlag)
5529 return false;
5530 break;
5531 case X86::COND_S:
5532 case X86::COND_NS:
5533 // If SF is used, but the instruction doesn't update the SF, then we
5534 // can't do the optimization.
5535 if (NoSignFlag)
5536 return false;
5537 break;
5538 }
5539
5540 // If we're updating the condition code check if we have to reverse the
5541 // condition.
5542 if (ShouldUpdateCC)
5543 switch (OldCC) {
5544 default:
5545 return false;
5546 case X86::COND_E:
5547 ReplacementCC = NewCC;
5548 break;
5549 case X86::COND_NE:
5550 ReplacementCC = GetOppositeBranchCondition(NewCC);
5551 break;
5552 }
5553 } else if (IsSwapped) {
5554 // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
5555 // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
5556 // We swap the condition code and synthesize the new opcode.
5557 ReplacementCC = getSwappedCondition(OldCC);
5558 if (ReplacementCC == X86::COND_INVALID)
5559 return false;
5560 ShouldUpdateCC = true;
5561 } else if (ImmDelta != 0) {
5562 unsigned BitWidth = RI.getRegSizeInBits(*MRI->getRegClass(SrcReg));
5563 // Shift amount for min/max constants to adjust for 8/16/32 instruction
5564 // sizes.
5565 switch (OldCC) {
5566 case X86::COND_L: // x <s (C + 1) --> x <=s C
5567 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5568 return false;
5569 ReplacementCC = X86::COND_LE;
5570 break;
5571 case X86::COND_B: // x <u (C + 1) --> x <=u C
5572 if (ImmDelta != 1 || CmpValue == 0)
5573 return false;
5574 ReplacementCC = X86::COND_BE;
5575 break;
5576 case X86::COND_GE: // x >=s (C + 1) --> x >s C
5577 if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5578 return false;
5579 ReplacementCC = X86::COND_G;
5580 break;
5581 case X86::COND_AE: // x >=u (C + 1) --> x >u C
5582 if (ImmDelta != 1 || CmpValue == 0)
5583 return false;
5584 ReplacementCC = X86::COND_A;
5585 break;
5586 case X86::COND_G: // x >s (C - 1) --> x >=s C
5587 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5588 return false;
5589 ReplacementCC = X86::COND_GE;
5590 break;
5591 case X86::COND_A: // x >u (C - 1) --> x >=u C
5592 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5593 return false;
5594 ReplacementCC = X86::COND_AE;
5595 break;
5596 case X86::COND_LE: // x <=s (C - 1) --> x <s C
5597 if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5598 return false;
5599 ReplacementCC = X86::COND_L;
5600 break;
5601 case X86::COND_BE: // x <=u (C - 1) --> x <u C
5602 if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5603 return false;
5604 ReplacementCC = X86::COND_B;
5605 break;
5606 default:
5607 return false;
5608 }
5609 ShouldUpdateCC = true;
5610 }
5611
5612 if (ShouldUpdateCC && ReplacementCC != OldCC) {
5613 // Push the MachineInstr to OpsToUpdate.
5614 // If it is safe to remove CmpInstr, the condition code of these
5615 // instructions will be modified.
5616 OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
5617 }
5618 if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
5619 // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
5620 FlagsMayLiveOut = false;
5621 break;
5622 }
5623 }
5624
5625 // If we have to update users but EFLAGS is live-out abort, since we cannot
5626 // easily find all of the users.
5627 if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
5628 for (MachineBasicBlock *Successor : CmpMBB.successors())
5629 if (Successor->isLiveIn(X86::EFLAGS))
5630 return false;
5631 }
5632
5633 // The instruction to be updated is either Sub or MI.
5634 assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
5635 Sub = MI != nullptr ? MI : Sub;
5636 MachineBasicBlock *SubBB = Sub->getParent();
5637 // Move Movr0Inst to the appropriate place before Sub.
5638 if (Movr0Inst) {
5639 // Only move within the same block so we don't accidentally move to a
5640 // block with higher execution frequency.
5641 if (&CmpMBB != SubBB)
5642 return false;
5643 // Look backwards until we find a def that doesn't use the current EFLAGS.
5645 InsertE = Sub->getParent()->rend();
5646 for (; InsertI != InsertE; ++InsertI) {
5647 MachineInstr *Instr = &*InsertI;
5648 if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
5649 Instr->modifiesRegister(X86::EFLAGS, TRI)) {
5650 Movr0Inst->getParent()->remove(Movr0Inst);
5651 Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
5652 Movr0Inst);
5653 break;
5654 }
5655 }
5656 if (InsertI == InsertE)
5657 return false;
5658 }
5659
5660 // Replace non-NF with NF instructions.
5661 for (auto &Inst : InstsToUpdate) {
5662 Inst.first->setDesc(get(Inst.second));
5663 Inst.first->removeOperand(
5664 Inst.first->findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5665 }
5666
5667 // Make sure Sub instruction defines EFLAGS and mark the def live.
5668 MachineOperand *FlagDef =
5669 Sub->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
5670 assert(FlagDef && "Unable to locate a def EFLAGS operand");
5671 FlagDef->setIsDead(false);
5672
5673 CmpInstr.eraseFromParent();
5674
5675 // Modify the condition code of instructions in OpsToUpdate.
5676 for (auto &Op : OpsToUpdate) {
5677 Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
5678 .setImm(Op.second);
5679 }
5680 // Add EFLAGS to block live-ins between CmpBB and block of flags producer.
5681 for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
5682 MBB = *MBB->pred_begin()) {
5683 assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
5684 if (!MBB->isLiveIn(X86::EFLAGS))
5685 MBB->addLiveIn(X86::EFLAGS);
5686 }
5687 return true;
5688}
5689
5690/// \returns true if the instruction can be changed to COPY when imm is 0.
5691static bool canConvert2Copy(unsigned Opc) {
5692 switch (Opc) {
5693 default:
5694 return false;
5695 CASE_ND(ADD64ri32)
5696 CASE_ND(SUB64ri32)
5697 CASE_ND(OR64ri32)
5698 CASE_ND(XOR64ri32)
5699 CASE_ND(ADD32ri)
5700 CASE_ND(SUB32ri)
5701 CASE_ND(OR32ri)
5702 CASE_ND(XOR32ri)
5703 return true;
5704 }
5705}
5706
5707/// Convert an ALUrr opcode to corresponding ALUri opcode. Such as
5708/// ADD32rr ==> ADD32ri
5709static unsigned convertALUrr2ALUri(unsigned Opc) {
5710 switch (Opc) {
5711 default:
5712 return 0;
5713#define FROM_TO(FROM, TO) \
5714 case X86::FROM: \
5715 return X86::TO; \
5716 case X86::FROM##_ND: \
5717 return X86::TO##_ND;
5718 FROM_TO(ADD64rr, ADD64ri32)
5719 FROM_TO(ADC64rr, ADC64ri32)
5720 FROM_TO(SUB64rr, SUB64ri32)
5721 FROM_TO(SBB64rr, SBB64ri32)
5722 FROM_TO(AND64rr, AND64ri32)
5723 FROM_TO(OR64rr, OR64ri32)
5724 FROM_TO(XOR64rr, XOR64ri32)
5725 FROM_TO(SHR64rCL, SHR64ri)
5726 FROM_TO(SHL64rCL, SHL64ri)
5727 FROM_TO(SAR64rCL, SAR64ri)
5728 FROM_TO(ROL64rCL, ROL64ri)
5729 FROM_TO(ROR64rCL, ROR64ri)
5730 FROM_TO(RCL64rCL, RCL64ri)
5731 FROM_TO(RCR64rCL, RCR64ri)
5732 FROM_TO(ADD32rr, ADD32ri)
5733 FROM_TO(ADC32rr, ADC32ri)
5734 FROM_TO(SUB32rr, SUB32ri)
5735 FROM_TO(SBB32rr, SBB32ri)
5736 FROM_TO(AND32rr, AND32ri)
5737 FROM_TO(OR32rr, OR32ri)
5738 FROM_TO(XOR32rr, XOR32ri)
5739 FROM_TO(SHR32rCL, SHR32ri)
5740 FROM_TO(SHL32rCL, SHL32ri)
5741 FROM_TO(SAR32rCL, SAR32ri)
5742 FROM_TO(ROL32rCL, ROL32ri)
5743 FROM_TO(ROR32rCL, ROR32ri)
5744 FROM_TO(RCL32rCL, RCL32ri)
5745 FROM_TO(RCR32rCL, RCR32ri)
5746#undef FROM_TO
5747#define FROM_TO(FROM, TO) \
5748 case X86::FROM: \
5749 return X86::TO;
5750 FROM_TO(TEST64rr, TEST64ri32)
5751 FROM_TO(CTEST64rr, CTEST64ri32)
5752 FROM_TO(CMP64rr, CMP64ri32)
5753 FROM_TO(CCMP64rr, CCMP64ri32)
5754 FROM_TO(TEST32rr, TEST32ri)
5755 FROM_TO(CTEST32rr, CTEST32ri)
5756 FROM_TO(CMP32rr, CMP32ri)
5757 FROM_TO(CCMP32rr, CCMP32ri)
5758#undef FROM_TO
5759 }
5760}
5761
5762/// Reg is assigned ImmVal in DefMI, and is used in UseMI.
5763/// If MakeChange is true, this function tries to replace Reg by ImmVal in
5764/// UseMI. If MakeChange is false, just check if folding is possible.
5765//
5766/// \returns true if folding is successful or possible.
5767bool X86InstrInfo::foldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
5768 Register Reg, int64_t ImmVal,
5770 bool MakeChange) const {
5771 bool Modified = false;
5772
5773 // 64 bit operations accept sign extended 32 bit immediates.
5774 // 32 bit operations accept all 32 bit immediates, so we don't need to check
5775 // them.
5776 const TargetRegisterClass *RC = nullptr;
5777 if (Reg.isVirtual())
5778 RC = MRI->getRegClass(Reg);
5779 if ((Reg.isPhysical() && X86::GR64RegClass.contains(Reg)) ||
5780 (Reg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC))) {
5781 if (!isInt<32>(ImmVal))
5782 return false;
5783 }
5784
5785 if (UseMI.findRegisterUseOperand(Reg, /*TRI=*/nullptr)->getSubReg())
5786 return false;
5787 // Immediate has larger code size than register. So avoid folding the
5788 // immediate if it has more than 1 use and we are optimizing for size.
5789 if (UseMI.getMF()->getFunction().hasOptSize() && Reg.isVirtual() &&
5790 !MRI->hasOneNonDBGUse(Reg))
5791 return false;
5792
5793 unsigned Opc = UseMI.getOpcode();
5794 unsigned NewOpc;
5795 if (Opc == TargetOpcode::COPY) {
5796 Register ToReg = UseMI.getOperand(0).getReg();
5797 const TargetRegisterClass *RC = nullptr;
5798 if (ToReg.isVirtual())
5799 RC = MRI->getRegClass(ToReg);
5800 bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) ||
5801 (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg));
5802 bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) ||
5803 (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg));
5804 bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) ||
5805 (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg));
5806
5807 if (ImmVal == 0) {
5808 // We have MOV32r0 only.
5809 if (!GR32Reg)
5810 return false;
5811 }
5812
5813 if (GR64Reg) {
5814 if (isUInt<32>(ImmVal))
5815 NewOpc = X86::MOV32ri64;
5816 else
5817 NewOpc = X86::MOV64ri;
5818 } else if (GR32Reg) {
5819 NewOpc = X86::MOV32ri;
5820 if (ImmVal == 0) {
5821 // MOV32r0 clobbers EFLAGS.
5822 const TargetRegisterInfo *TRI = &getRegisterInfo();
5823 if (UseMI.getParent()->computeRegisterLiveness(
5824 TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead)
5825 return false;
5826
5827 // MOV32r0 is different than other cases because it doesn't encode the
5828 // immediate in the instruction. So we directly modify it here.
5829 if (!MakeChange)
5830 return true;
5831 UseMI.setDesc(get(X86::MOV32r0));
5832 UseMI.removeOperand(
5833 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5834 UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
5835 /*isImp=*/true,
5836 /*isKill=*/false,
5837 /*isDead=*/true));
5838 Modified = true;
5839 }
5840 } else if (GR8Reg)
5841 NewOpc = X86::MOV8ri;
5842 else
5843 return false;
5844 } else
5845 NewOpc = convertALUrr2ALUri(Opc);
5846
5847 if (!NewOpc)
5848 return false;
5849
5850 // For SUB instructions the immediate can only be the second source operand.
5851 if ((NewOpc == X86::SUB64ri32 || NewOpc == X86::SUB32ri ||
5852 NewOpc == X86::SBB64ri32 || NewOpc == X86::SBB32ri ||
5853 NewOpc == X86::SUB64ri32_ND || NewOpc == X86::SUB32ri_ND ||
5854 NewOpc == X86::SBB64ri32_ND || NewOpc == X86::SBB32ri_ND) &&
5855 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 2)
5856 return false;
5857 // For CMP instructions the immediate can only be at index 1.
5858 if (((NewOpc == X86::CMP64ri32 || NewOpc == X86::CMP32ri) ||
5859 (NewOpc == X86::CCMP64ri32 || NewOpc == X86::CCMP32ri)) &&
5860 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr) != 1)
5861 return false;
5862
5863 using namespace X86;
5864 if (isSHL(Opc) || isSHR(Opc) || isSAR(Opc) || isROL(Opc) || isROR(Opc) ||
5865 isRCL(Opc) || isRCR(Opc)) {
5866 unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr);
5867 if (RegIdx < 2)
5868 return false;
5869 if (!isInt<8>(ImmVal))
5870 return false;
5871 assert(Reg == X86::CL);
5872
5873 if (!MakeChange)
5874 return true;
5875 UseMI.setDesc(get(NewOpc));
5876 UseMI.removeOperand(RegIdx);
5877 UseMI.addOperand(MachineOperand::CreateImm(ImmVal));
5878 // Reg is physical register $cl, so we don't know if DefMI is dead through
5879 // MRI. Let the caller handle it, or pass dead-mi-elimination can delete
5880 // the dead physical register define instruction.
5881 return true;
5882 }
5883
5884 if (!MakeChange)
5885 return true;
5886
5887 if (!Modified) {
5888 // Modify the instruction.
5889 if (ImmVal == 0 && canConvert2Copy(NewOpc) &&
5890 UseMI.registerDefIsDead(X86::EFLAGS, /*TRI=*/nullptr)) {
5891 // %100 = add %101, 0
5892 // ==>
5893 // %100 = COPY %101
5894 UseMI.setDesc(get(TargetOpcode::COPY));
5895 UseMI.removeOperand(
5896 UseMI.findRegisterUseOperandIdx(Reg, /*TRI=*/nullptr));
5897 UseMI.removeOperand(
5898 UseMI.findRegisterDefOperandIdx(X86::EFLAGS, /*TRI=*/nullptr));
5899 UseMI.untieRegOperand(0);
5902 } else {
5903 unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
5904 unsigned ImmOpNum = 2;
5905 if (!UseMI.getOperand(0).isDef()) {
5906 Op1 = 0; // TEST, CMP, CTEST, CCMP
5907 ImmOpNum = 1;
5908 }
5909 if (Opc == TargetOpcode::COPY)
5910 ImmOpNum = 1;
5911 if (findCommutedOpIndices(UseMI, Op1, Op2) &&
5912 UseMI.getOperand(Op1).getReg() == Reg)
5913 commuteInstruction(UseMI);
5914
5915 assert(UseMI.getOperand(ImmOpNum).getReg() == Reg);
5916 UseMI.setDesc(get(NewOpc));
5917 UseMI.getOperand(ImmOpNum).ChangeToImmediate(ImmVal);
5918 }
5919 }
5920
5921 if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg))
5923
5924 return true;
5925}
5926
5927/// foldImmediate - 'Reg' is known to be defined by a move immediate
5928/// instruction, try to fold the immediate into the use instruction.
5930 Register Reg, MachineRegisterInfo *MRI) const {
5931 int64_t ImmVal;
5932 if (!getConstValDefinedInReg(DefMI, Reg, ImmVal))
5933 return false;
5934
5935 return foldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true);
5936}
5937
5938/// Expand a single-def pseudo instruction to a two-addr
5939/// instruction with two undef reads of the register being defined.
5940/// This is used for mapping:
5941/// %xmm4 = V_SET0
5942/// to:
5943/// %xmm4 = PXORrr undef %xmm4, undef %xmm4
5944///
5946 const MCInstrDesc &Desc) {
5947 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5948 Register Reg = MIB.getReg(0);
5949 MIB->setDesc(Desc);
5950
5951 // MachineInstr::addOperand() will insert explicit operands before any
5952 // implicit operands.
5954 // But we don't trust that.
5955 assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand");
5956 return true;
5957}
5958
5959/// Expand a single-def pseudo instruction to a two-addr
5960/// instruction with two %k0 reads.
5961/// This is used for mapping:
5962/// %k4 = K_SET1
5963/// to:
5964/// %k4 = KXNORrr %k0, %k0
5966 Register Reg) {
5967 assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5968 MIB->setDesc(Desc);
5970 return true;
5971}
5972
5974 bool MinusOne) {
5975 MachineBasicBlock &MBB = *MIB->getParent();
5976 const DebugLoc &DL = MIB->getDebugLoc();
5977 Register Reg = MIB.getReg(0);
5978
5979 // Insert the XOR.
5980 BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
5983
5984 // Turn the pseudo into an INC or DEC.
5985 MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
5986 MIB.addReg(Reg);
5987
5988 return true;
5989}
5990
5992 const TargetInstrInfo &TII,
5993 const X86Subtarget &Subtarget) {
5994 MachineBasicBlock &MBB = *MIB->getParent();
5995 const DebugLoc &DL = MIB->getDebugLoc();
5996 int64_t Imm = MIB->getOperand(1).getImm();
5997 assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
5999
6000 int StackAdjustment;
6001
6002 if (Subtarget.is64Bit()) {
6003 assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
6004 MIB->getOpcode() == X86::MOV32ImmSExti8);
6005
6006 // Can't use push/pop lowering if the function might write to the red zone.
6007 X86MachineFunctionInfo *X86FI =
6008 MBB.getParent()->getInfo<X86MachineFunctionInfo>();
6009 if (X86FI->getUsesRedZone()) {
6010 MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8
6011 ? X86::MOV32ri
6012 : X86::MOV64ri));
6013 return true;
6014 }
6015
6016 // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
6017 // widen the register if necessary.
6018 StackAdjustment = 8;
6019 BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm);
6020 MIB->setDesc(TII.get(X86::POP64r));
6021 MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
6022 } else {
6023 assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
6024 StackAdjustment = 4;
6025 BuildMI(MBB, I, DL, TII.get(X86::PUSH32i)).addImm(Imm);
6026 MIB->setDesc(TII.get(X86::POP32r));
6027 }
6028 MIB->removeOperand(1);
6029 MIB->addImplicitDefUseOperands(*MBB.getParent());
6030
6031 // Build CFI if necessary.
6032 MachineFunction &MF = *MBB.getParent();
6033 const X86FrameLowering *TFL = Subtarget.getFrameLowering();
6034 bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
6035 bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
6036 bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
6037 if (EmitCFI) {
6038 TFL->BuildCFI(
6039 MBB, I, DL,
6040 MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
6041 TFL->BuildCFI(
6042 MBB, std::next(I), DL,
6043 MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
6044 }
6045
6046 return true;
6047}
6048
6049// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
6050// code sequence is needed for other targets.
6052 const TargetInstrInfo &TII) {
6053 MachineBasicBlock &MBB = *MIB->getParent();
6054 const DebugLoc &DL = MIB->getDebugLoc();
6055 Register Reg = MIB.getReg(0);
6056 const GlobalValue *GV =
6057 cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
6058 auto Flags = MachineMemOperand::MOLoad |
6061 MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
6062 MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
6064
6065 BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg)
6066 .addReg(X86::RIP)
6067 .addImm(1)
6068 .addReg(0)
6070 .addReg(0)
6071 .addMemOperand(MMO);
6072 MIB->setDebugLoc(DL);
6073 MIB->setDesc(TII.get(X86::MOV64rm));
6075}
6076
6078 MachineBasicBlock &MBB = *MIB->getParent();
6079 MachineFunction &MF = *MBB.getParent();
6080 const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
6081 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
6082 unsigned XorOp =
6083 MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
6084 MIB->setDesc(TII.get(XorOp));
6085 MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
6086 return true;
6087}
6088
6089// This is used to handle spills for 128/256-bit registers when we have AVX512,
6090// but not VLX. If it uses an extended register we need to use an instruction
6091// that loads the lower 128/256-bit, but is available with only AVX512F.
6093 const TargetRegisterInfo *TRI,
6094 const MCInstrDesc &LoadDesc,
6095 const MCInstrDesc &BroadcastDesc, unsigned SubIdx) {
6096 Register DestReg = MIB.getReg(0);
6097 // Check if DestReg is XMM16-31 or YMM16-31.
6098 if (TRI->getEncodingValue(DestReg) < 16) {
6099 // We can use a normal VEX encoded load.
6100 MIB->setDesc(LoadDesc);
6101 } else {
6102 // Use a 128/256-bit VBROADCAST instruction.
6103 MIB->setDesc(BroadcastDesc);
6104 // Change the destination to a 512-bit register.
6105 DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
6106 MIB->getOperand(0).setReg(DestReg);
6107 }
6108 return true;
6109}
6110
6111// This is used to handle spills for 128/256-bit registers when we have AVX512,
6112// but not VLX. If it uses an extended register we need to use an instruction
6113// that stores the lower 128/256-bit, but is available with only AVX512F.
// NOTE(review): this dump dropped line 6114, which carried the start of the
// signature (static bool expandNOVLXStore(MachineInstrBuilder &MIB, ...)) --
// confirm against the upstream source.
6115 const TargetRegisterInfo *TRI,
6116 const MCInstrDesc &StoreDesc,
6117 const MCInstrDesc &ExtractDesc, unsigned SubIdx) {
// The stored value is the operand immediately after the 5 memory-address
// operands (X86::AddrNumOperands).
6118 Register SrcReg = MIB.getReg(X86::AddrNumOperands);
6119 // Check if SrcReg is XMM16-31 or YMM16-31.
6120 if (TRI->getEncodingValue(SrcReg) < 16) {
6121 // We can use a normal VEX encoded store.
6122 MIB->setDesc(StoreDesc);
6123 } else {
6124 // Use a VEXTRACTF instruction.
6125 MIB->setDesc(ExtractDesc);
6126 // Change the source to a 512-bit register.
6127 SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
// NOTE(review): this dump dropped line 6128; upstream writes SrcReg back into
// the source operand here (MIB->getOperand(X86::AddrNumOperands).setReg(...))
// -- confirm against the upstream source.
6129 MIB.addImm(0x0); // Append immediate to extract from the lower bits.
6130 }
6131
6132 return true;
6133}
6134
// NOTE(review): this dump dropped line 6135, which carried the signature
// (static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc
// &Desc) per the SHLDROT/SHRDROT call sites below) -- confirm upstream.
// Lowers a rotate-by-immediate pseudo to SHLD/SHRD by repeating the single
// source register as both double-shift inputs.
6136 MIB->setDesc(Desc);
6137 int64_t ShiftAmt = MIB->getOperand(2).getImm();
6138 // Temporarily remove the immediate so we can add another source register.
6139 MIB->removeOperand(2);
6140 // Add the register. Don't copy the kill flag if there is one.
6141 MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef()));
6142 // Add back the immediate.
6143 MIB.addImm(ShiftAmt);
6144 return true;
6145}
6146
// NOTE(review): this dump dropped line 6147, which carried the start of the
// signature (static bool expandMOVSHP(MachineInstrBuilder &MIB,
// MachineInstr &MI, ...) per the MOVSHPmr/MOVSHPrm call site below).
// Lowers the MOVSHP load/store pseudo to a MOVSS-family opcode; only 32 bits
// are transferred, so the SS forms are used for both directions.
6148 const TargetInstrInfo &TII, bool HasAVX) {
6149 unsigned NewOpc;
6150 if (MI.getOpcode() == X86::MOVSHPrm) {
// Load form: operand 0 is the destination register.
6151 NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
6152 Register Reg = MI.getOperand(0).getReg();
// Extended registers (beyond XMM15) require the EVEX-encoded Z opcode.
6153 if (Reg > X86::XMM15)
6154 NewOpc = X86::VMOVSSZrm;
6155 } else {
// Store form: the source register follows the 5 memory-address operands.
6156 NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
6157 Register Reg = MI.getOperand(5).getReg();
6158 if (Reg > X86::XMM15)
6159 NewOpc = X86::VMOVSSZmr;
6160 }
6161
6162 MIB->setDesc(TII.get(NewOpc));
6163 return true;
6164}
6165
// NOTE(review): this dump dropped line 6166, which carried the signature
// (bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const, per the
// TargetInstrInfo hook) -- confirm against the upstream source.
// Expands X86 post-RA pseudo instructions in place. Returns true when MI was
// rewritten/erased; returning false (after possibly just swapping the
// descriptor, e.g. the ADD*_DB cases) tells the caller the change is complete.
6167 bool HasAVX = Subtarget.hasAVX();
6168 MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
6169 switch (MI.getOpcode()) {
// Zero/one idioms: materialize constants with dependency-breaking ALU ops.
6170 case X86::MOV32r0:
6171 return Expand2AddrUndef(MIB, get(X86::XOR32rr));
6172 case X86::MOV32r1:
6173 return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
6174 case X86::MOV32r_1:
6175 return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
6176 case X86::MOV32ImmSExti8:
6177 case X86::MOV64ImmSExti8:
6178 return ExpandMOVImmSExti8(MIB, *this, Subtarget);
6179 case X86::SETB_C32r:
6180 return Expand2AddrUndef(MIB, get(X86::SBB32rr));
6181 case X86::SETB_C64r:
6182 return Expand2AddrUndef(MIB, get(X86::SBB64rr));
6183 case X86::MMX_SET0:
6184 return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
6185 case X86::V_SET0:
6186 case X86::FsFLD0SS:
6187 case X86::FsFLD0SD:
6188 case X86::FsFLD0SH:
6189 case X86::FsFLD0F128:
6190 return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
6191 case X86::AVX_SET0: {
6192 assert(HasAVX && "AVX not supported");
// NOTE(review): this dump dropped line 6193; upstream defines TRI here
// (const TargetRegisterInfo *TRI = &getRegisterInfo();) -- confirm.
// Zero the xmm subregister and mark the full ymm as implicitly defined so
// the upper bits are known-zeroed without a 256-bit instruction.
6194 Register SrcReg = MIB.getReg(0);
6195 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6196 MIB->getOperand(0).setReg(XReg);
6197 Expand2AddrUndef(MIB, get(X86::VXORPSrr));
6198 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6199 return true;
6200 }
6201 case X86::AVX512_128_SET0:
6202 case X86::AVX512_FsFLD0SH:
6203 case X86::AVX512_FsFLD0SS:
6204 case X86::AVX512_FsFLD0SD:
6205 case X86::AVX512_FsFLD0F128: {
6206 bool HasVLX = Subtarget.hasVLX();
6207 Register SrcReg = MIB.getReg(0);
// NOTE(review): this dump dropped line 6208; upstream defines TRI here --
// confirm against the upstream source.
// Low 16 registers (or with VLX) can use a 128-bit XOR directly.
6209 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
6210 return Expand2AddrUndef(MIB,
6211 get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6212 // Extended register without VLX. Use a larger XOR.
6213 SrcReg =
6214 TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
6215 MIB->getOperand(0).setReg(SrcReg);
6216 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6217 }
6218 case X86::AVX512_256_SET0:
6219 case X86::AVX512_512_SET0: {
6220 bool HasVLX = Subtarget.hasVLX();
6221 Register SrcReg = MIB.getReg(0);
// NOTE(review): this dump dropped line 6222; upstream defines TRI here --
// confirm against the upstream source.
6223 if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
// Zero via the xmm subregister and implicitly define the full register,
// mirroring the AVX_SET0 expansion above.
6224 Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6225 MIB->getOperand(0).setReg(XReg);
6226 Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6227 MIB.addReg(SrcReg, RegState::ImplicitDefine);
6228 return true;
6229 }
6230 if (MI.getOpcode() == X86::AVX512_256_SET0) {
6231 // No VLX so we must reference a zmm.
6232 MCRegister ZReg =
6233 TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
6234 MIB->getOperand(0).setReg(ZReg);
6235 }
6236 return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6237 }
6238 case X86::MOVSHPmr:
6239 case X86::MOVSHPrm:
6240 return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
// All-ones idioms: compare-equal of a register with itself yields all 1s.
6241 case X86::V_SETALLONES:
6242 return Expand2AddrUndef(MIB,
6243 get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
6244 case X86::AVX2_SETALLONES:
6245 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6246 case X86::AVX1_SETALLONES: {
6247 Register Reg = MIB.getReg(0);
6248 // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
6249 MIB->setDesc(get(X86::VCMPPSYrri));
6250 MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
6251 return true;
6252 }
6253 case X86::AVX512_128_SETALLONES:
6254 case X86::AVX512_256_SETALLONES:
6255 case X86::AVX512_512_SETALLONES: {
6256 Register Reg = MIB.getReg(0);
6257 unsigned Opc;
6258 switch (MI.getOpcode()) {
6259 case X86::AVX512_128_SETALLONES: {
// Non-extended 128-bit registers can use the shorter VEX encoding.
6260 if (X86::VR128RegClass.contains(Reg))
6261 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDrr));
6262
6263 Opc = X86::VPTERNLOGDZ128rri;
6264 break;
6265 }
6266 case X86::AVX512_256_SETALLONES: {
6267 if (X86::VR256RegClass.contains(Reg))
6268 return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6269
6270 Opc = X86::VPTERNLOGDZ256rri;
6271 break;
6272 }
6273 case X86::AVX512_512_SETALLONES:
6274 Opc = X86::VPTERNLOGDZrri;
6275 break;
6276 }
6277 MIB->setDesc(get(Opc));
6278 // VPTERNLOGD needs 3 register inputs and an immediate.
6279 // 0xff will return 1s for any input.
6280 MIB.addReg(Reg, RegState::Undef)
6281 .addReg(Reg, RegState::Undef)
6282 .addReg(Reg, RegState::Undef)
6283 .addImm(0xff);
6284 return true;
6285 }
6286 case X86::AVX512_512_SEXT_MASK_32:
6287 case X86::AVX512_512_SEXT_MASK_64: {
6288 Register Reg = MIB.getReg(0);
6289 Register MaskReg = MIB.getReg(1);
6290 unsigned MaskState = getRegState(MIB->getOperand(1));
6291 unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64)
6292 ? X86::VPTERNLOGQZrrikz
6293 : X86::VPTERNLOGDZrrikz;
// Drop the mask operand; it is re-added below in the masked-op position.
6294 MI.removeOperand(1);
6295 MIB->setDesc(get(Opc));
6296 // VPTERNLOG needs 3 register inputs and an immediate.
6297 // 0xff will return 1s for any input.
6298 MIB.addReg(Reg, RegState::Undef)
6299 .addReg(MaskReg, MaskState)
6300 .addReg(Reg, RegState::Undef)
6301 .addReg(Reg, RegState::Undef)
6302 .addImm(0xff);
6303 return true;
6304 }
// AVX512-without-VLX spill/reload pseudos; see the expandNOVLX* helpers.
6305 case X86::VMOVAPSZ128rm_NOVLX:
6306 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
6307 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6308 case X86::VMOVUPSZ128rm_NOVLX:
6309 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
6310 get(X86::VBROADCASTF32X4Zrm), X86::sub_xmm);
6311 case X86::VMOVAPSZ256rm_NOVLX:
6312 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
6313 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6314 case X86::VMOVUPSZ256rm_NOVLX:
6315 return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
6316 get(X86::VBROADCASTF64X4Zrm), X86::sub_ymm);
6317 case X86::VMOVAPSZ128mr_NOVLX:
6318 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
6319 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6320 case X86::VMOVUPSZ128mr_NOVLX:
6321 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
6322 get(X86::VEXTRACTF32X4Zmri), X86::sub_xmm);
6323 case X86::VMOVAPSZ256mr_NOVLX:
6324 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
6325 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6326 case X86::VMOVUPSZ256mr_NOVLX:
6327 return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
6328 get(X86::VEXTRACTF64X4Zmri), X86::sub_ymm);
6329 case X86::MOV32ri64: {
// Rewrite as a 32-bit move to the 32-bit subregister; the write implicitly
// zero-extends into the full 64-bit register.
6330 Register Reg = MIB.getReg(0);
6331 Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
6332 MI.setDesc(get(X86::MOV32ri));
6333 MIB->getOperand(0).setReg(Reg32);
// NOTE(review): this dump dropped line 6334; upstream adds the original
// 64-bit register as an implicit def here -- confirm against upstream.
6335 return true;
6336 }
6337
6338 case X86::RDFLAGS32:
6339 case X86::RDFLAGS64: {
6340 unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64;
6341 MachineBasicBlock &MBB = *MIB->getParent();
6342
// PUSHF + POP pair: push the flags, then turn MI itself into the pop.
6343 MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(),
6344 get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
6345 .getInstr();
6346
6347 // Permit reads of the EFLAGS and DF registers without them being defined.
6348 // This intrinsic exists to read external processor state in flags, such as
6349 // the trap flag, interrupt flag, and direction flag, none of which are
6350 // modeled by the backend.
6351 assert(NewMI->getOperand(2).getReg() == X86::EFLAGS &&
6352 "Unexpected register in operand! Should be EFLAGS.");
6353 NewMI->getOperand(2).setIsUndef();
6354 assert(NewMI->getOperand(3).getReg() == X86::DF &&
6355 "Unexpected register in operand! Should be DF.");
6356 NewMI->getOperand(3).setIsUndef();
6357
6358 MIB->setDesc(get(Is64Bit ? X86::POP64r : X86::POP32r));
6359 return true;
6360 }
6361
6362 case X86::WRFLAGS32:
6363 case X86::WRFLAGS64: {
6364 unsigned Is64Bit = MI.getOpcode() == X86::WRFLAGS64;
6365 MachineBasicBlock &MBB = *MIB->getParent();
6366
// PUSH + POPF pair replaces MI entirely, so MI is erased afterwards.
6367 BuildMI(MBB, MI, MIB->getDebugLoc(),
6368 get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
6369 .addReg(MI.getOperand(0).getReg());
6370 BuildMI(MBB, MI, MIB->getDebugLoc(),
6371 get(Is64Bit ? X86::POPF64 : X86::POPF32));
6372 MI.eraseFromParent();
6373 return true;
6374 }
6375
6376 // KNL does not recognize dependency-breaking idioms for mask registers,
6377 // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
6378 // Using %k0 as the undef input register is a performance heuristic based
6379 // on the assumption that %k0 is used less frequently than the other mask
6380 // registers, since it is not usable as a write mask.
6381 // FIXME: A more advanced approach would be to choose the best input mask
6382 // register based on context.
6383 case X86::KSET0B:
6384 return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0);
6385 case X86::KSET0W:
6386 return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
6387 case X86::KSET0D:
6388 return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
6389 case X86::KSET0Q:
6390 return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
6391 case X86::KSET1B:
6392 return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0);
6393 case X86::KSET1W:
6394 return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
6395 case X86::KSET1D:
6396 return Expand2AddrKreg(MIB, get(X86::KXNORDkk), X86::K0);
6397 case X86::KSET1Q:
6398 return Expand2AddrKreg(MIB, get(X86::KXNORQkk), X86::K0);
6399 case TargetOpcode::LOAD_STACK_GUARD:
6400 expandLoadStackGuard(MIB, *this);
6401 return true;
6402 case X86::XOR64_FP:
6403 case X86::XOR32_FP:
6404 return expandXorFP(MIB, *this);
6405 case X86::SHLDROT32ri:
6406 return expandSHXDROT(MIB, get(X86::SHLD32rri8));
6407 case X86::SHLDROT64ri:
6408 return expandSHXDROT(MIB, get(X86::SHLD64rri8));
6409 case X86::SHRDROT32ri:
6410 return expandSHXDROT(MIB, get(X86::SHRD32rri8));
6411 case X86::SHRDROT64ri:
6412 return expandSHXDROT(MIB, get(X86::SHRD64rri8));
// ADD*_DB pseudos ("disjoint bits"): the operands share no set bits, so the
// add is equivalent to OR; only the descriptor needs to change. These fall
// through to return false since the operands are already correct.
6413 case X86::ADD8rr_DB:
6414 MIB->setDesc(get(X86::OR8rr));
6415 break;
6416 case X86::ADD16rr_DB:
6417 MIB->setDesc(get(X86::OR16rr));
6418 break;
6419 case X86::ADD32rr_DB:
6420 MIB->setDesc(get(X86::OR32rr));
6421 break;
6422 case X86::ADD64rr_DB:
6423 MIB->setDesc(get(X86::OR64rr));
6424 break;
6425 case X86::ADD8ri_DB:
6426 MIB->setDesc(get(X86::OR8ri));
6427 break;
6428 case X86::ADD16ri_DB:
6429 MIB->setDesc(get(X86::OR16ri));
6430 break;
6431 case X86::ADD32ri_DB:
6432 MIB->setDesc(get(X86::OR32ri));
6433 break;
6434 case X86::ADD64ri32_DB:
6435 MIB->setDesc(get(X86::OR64ri32));
6436 break;
6437 }
6438 return false;
6439}
6440
6441/// Return true for all instructions that only update
6442/// the first 32 or 64-bits of the destination register and leave the rest
6443/// unmodified. This can be used to avoid folding loads if the instructions
6444/// only update part of the destination register, and the non-updated part is
6445/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
6446/// instructions breaks the partial register dependency and it can improve
6447/// performance. e.g.:
6448///
6449/// movss (%rdi), %xmm0
6450/// cvtss2sd %xmm0, %xmm0
6451///
6452/// Instead of
6453/// cvtss2sd (%rdi), %xmm0
6454///
6455/// FIXME: This should be turned into a TSFlags.
6456///
6457static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
6458 bool ForLoadFold = false) {
6459 switch (Opcode) {
// int->float scalar conversions: the FP source is a GPR, so folding the load
// cannot introduce an XMM false dependency.
6460 case X86::CVTSI2SSrr:
6461 case X86::CVTSI2SSrm:
6462 case X86::CVTSI642SSrr:
6463 case X86::CVTSI642SSrm:
6464 case X86::CVTSI2SDrr:
6465 case X86::CVTSI2SDrm:
6466 case X86::CVTSI642SDrr:
6467 case X86::CVTSI642SDrm:
6468 // Load folding won't effect the undef register update since the input is
6469 // a GPR.
6470 return !ForLoadFold;
// SSE scalar/partial-move ops: these always write only part of the
// destination, regardless of subtarget.
6471 case X86::CVTSD2SSrr:
6472 case X86::CVTSD2SSrm:
6473 case X86::CVTSS2SDrr:
6474 case X86::CVTSS2SDrm:
6475 case X86::MOVHPDrm:
6476 case X86::MOVHPSrm:
6477 case X86::MOVLPDrm:
6478 case X86::MOVLPSrm:
6479 case X86::RCPSSr:
6480 case X86::RCPSSm:
6481 case X86::RCPSSr_Int:
6482 case X86::RCPSSm_Int:
6483 case X86::ROUNDSDri:
6484 case X86::ROUNDSDmi:
6485 case X86::ROUNDSSri:
6486 case X86::ROUNDSSmi:
6487 case X86::RSQRTSSr:
6488 case X86::RSQRTSSm:
6489 case X86::RSQRTSSr_Int:
6490 case X86::RSQRTSSm_Int:
6491 case X86::SQRTSSr:
6492 case X86::SQRTSSm:
6493 case X86::SQRTSSr_Int:
6494 case X86::SQRTSSm_Int:
6495 case X86::SQRTSDr:
6496 case X86::SQRTSDm:
6497 case X86::SQRTSDr_Int:
6498 case X86::SQRTSDm_Int:
6499 return true;
// The remaining vector groups write the full destination but exhibit a
// *false* dependency on some CPUs; each group is gated on the matching
// subtarget tuning flag.
6500 case X86::VFCMULCPHZ128rm:
6501 case X86::VFCMULCPHZ128rmb:
6502 case X86::VFCMULCPHZ128rmbkz:
6503 case X86::VFCMULCPHZ128rmkz:
6504 case X86::VFCMULCPHZ128rr:
6505 case X86::VFCMULCPHZ128rrkz:
6506 case X86::VFCMULCPHZ256rm:
6507 case X86::VFCMULCPHZ256rmb:
6508 case X86::VFCMULCPHZ256rmbkz:
6509 case X86::VFCMULCPHZ256rmkz:
6510 case X86::VFCMULCPHZ256rr:
6511 case X86::VFCMULCPHZ256rrkz:
6512 case X86::VFCMULCPHZrm:
6513 case X86::VFCMULCPHZrmb:
6514 case X86::VFCMULCPHZrmbkz:
6515 case X86::VFCMULCPHZrmkz:
6516 case X86::VFCMULCPHZrr:
6517 case X86::VFCMULCPHZrrb:
6518 case X86::VFCMULCPHZrrbkz:
6519 case X86::VFCMULCPHZrrkz:
6520 case X86::VFMULCPHZ128rm:
6521 case X86::VFMULCPHZ128rmb:
6522 case X86::VFMULCPHZ128rmbkz:
6523 case X86::VFMULCPHZ128rmkz:
6524 case X86::VFMULCPHZ128rr:
6525 case X86::VFMULCPHZ128rrkz:
6526 case X86::VFMULCPHZ256rm:
6527 case X86::VFMULCPHZ256rmb:
6528 case X86::VFMULCPHZ256rmbkz:
6529 case X86::VFMULCPHZ256rmkz:
6530 case X86::VFMULCPHZ256rr:
6531 case X86::VFMULCPHZ256rrkz:
6532 case X86::VFMULCPHZrm:
6533 case X86::VFMULCPHZrmb:
6534 case X86::VFMULCPHZrmbkz:
6535 case X86::VFMULCPHZrmkz:
6536 case X86::VFMULCPHZrr:
6537 case X86::VFMULCPHZrrb:
6538 case X86::VFMULCPHZrrbkz:
6539 case X86::VFMULCPHZrrkz:
6540 case X86::VFCMULCSHZrm:
6541 case X86::VFCMULCSHZrmkz:
6542 case X86::VFCMULCSHZrr:
6543 case X86::VFCMULCSHZrrb:
6544 case X86::VFCMULCSHZrrbkz:
6545 case X86::VFCMULCSHZrrkz:
6546 case X86::VFMULCSHZrm:
6547 case X86::VFMULCSHZrmkz:
6548 case X86::VFMULCSHZrr:
6549 case X86::VFMULCSHZrrb:
6550 case X86::VFMULCSHZrrbkz:
6551 case X86::VFMULCSHZrrkz:
6552 return Subtarget.hasMULCFalseDeps();
6553 case X86::VPERMDYrm:
6554 case X86::VPERMDYrr:
6555 case X86::VPERMQYmi:
6556 case X86::VPERMQYri:
6557 case X86::VPERMPSYrm:
6558 case X86::VPERMPSYrr:
6559 case X86::VPERMPDYmi:
6560 case X86::VPERMPDYri:
6561 case X86::VPERMDZ256rm:
6562 case X86::VPERMDZ256rmb:
6563 case X86::VPERMDZ256rmbkz:
6564 case X86::VPERMDZ256rmkz:
6565 case X86::VPERMDZ256rr:
6566 case X86::VPERMDZ256rrkz:
6567 case X86::VPERMDZrm:
6568 case X86::VPERMDZrmb:
6569 case X86::VPERMDZrmbkz:
6570 case X86::VPERMDZrmkz:
6571 case X86::VPERMDZrr:
6572 case X86::VPERMDZrrkz:
6573 case X86::VPERMQZ256mbi:
6574 case X86::VPERMQZ256mbikz:
6575 case X86::VPERMQZ256mi:
6576 case X86::VPERMQZ256mikz:
6577 case X86::VPERMQZ256ri:
6578 case X86::VPERMQZ256rikz:
6579 case X86::VPERMQZ256rm:
6580 case X86::VPERMQZ256rmb:
6581 case X86::VPERMQZ256rmbkz:
6582 case X86::VPERMQZ256rmkz:
6583 case X86::VPERMQZ256rr:
6584 case X86::VPERMQZ256rrkz:
6585 case X86::VPERMQZmbi:
6586 case X86::VPERMQZmbikz:
6587 case X86::VPERMQZmi:
6588 case X86::VPERMQZmikz:
6589 case X86::VPERMQZri:
6590 case X86::VPERMQZrikz:
6591 case X86::VPERMQZrm:
6592 case X86::VPERMQZrmb:
6593 case X86::VPERMQZrmbkz:
6594 case X86::VPERMQZrmkz:
6595 case X86::VPERMQZrr:
6596 case X86::VPERMQZrrkz:
6597 case X86::VPERMPSZ256rm:
6598 case X86::VPERMPSZ256rmb:
6599 case X86::VPERMPSZ256rmbkz:
6600 case X86::VPERMPSZ256rmkz:
6601 case X86::VPERMPSZ256rr:
6602 case X86::VPERMPSZ256rrkz:
6603 case X86::VPERMPSZrm:
6604 case X86::VPERMPSZrmb:
6605 case X86::VPERMPSZrmbkz:
6606 case X86::VPERMPSZrmkz:
6607 case X86::VPERMPSZrr:
6608 case X86::VPERMPSZrrkz:
6609 case X86::VPERMPDZ256mbi:
6610 case X86::VPERMPDZ256mbikz:
6611 case X86::VPERMPDZ256mi:
6612 case X86::VPERMPDZ256mikz:
6613 case X86::VPERMPDZ256ri:
6614 case X86::VPERMPDZ256rikz:
6615 case X86::VPERMPDZ256rm:
6616 case X86::VPERMPDZ256rmb:
6617 case X86::VPERMPDZ256rmbkz:
6618 case X86::VPERMPDZ256rmkz:
6619 case X86::VPERMPDZ256rr:
6620 case X86::VPERMPDZ256rrkz:
6621 case X86::VPERMPDZmbi:
6622 case X86::VPERMPDZmbikz:
6623 case X86::VPERMPDZmi:
6624 case X86::VPERMPDZmikz:
6625 case X86::VPERMPDZri:
6626 case X86::VPERMPDZrikz:
6627 case X86::VPERMPDZrm:
6628 case X86::VPERMPDZrmb:
6629 case X86::VPERMPDZrmbkz:
6630 case X86::VPERMPDZrmkz:
6631 case X86::VPERMPDZrr:
6632 case X86::VPERMPDZrrkz:
6633 return Subtarget.hasPERMFalseDeps();
6634 case X86::VRANGEPDZ128rmbi:
6635 case X86::VRANGEPDZ128rmbikz:
6636 case X86::VRANGEPDZ128rmi:
6637 case X86::VRANGEPDZ128rmikz:
6638 case X86::VRANGEPDZ128rri:
6639 case X86::VRANGEPDZ128rrikz:
6640 case X86::VRANGEPDZ256rmbi:
6641 case X86::VRANGEPDZ256rmbikz:
6642 case X86::VRANGEPDZ256rmi:
6643 case X86::VRANGEPDZ256rmikz:
6644 case X86::VRANGEPDZ256rri:
6645 case X86::VRANGEPDZ256rrikz:
6646 case X86::VRANGEPDZrmbi:
6647 case X86::VRANGEPDZrmbikz:
6648 case X86::VRANGEPDZrmi:
6649 case X86::VRANGEPDZrmikz:
6650 case X86::VRANGEPDZrri:
6651 case X86::VRANGEPDZrrib:
6652 case X86::VRANGEPDZrribkz:
6653 case X86::VRANGEPDZrrikz:
6654 case X86::VRANGEPSZ128rmbi:
6655 case X86::VRANGEPSZ128rmbikz:
6656 case X86::VRANGEPSZ128rmi:
6657 case X86::VRANGEPSZ128rmikz:
6658 case X86::VRANGEPSZ128rri:
6659 case X86::VRANGEPSZ128rrikz:
6660 case X86::VRANGEPSZ256rmbi:
6661 case X86::VRANGEPSZ256rmbikz:
6662 case X86::VRANGEPSZ256rmi:
6663 case X86::VRANGEPSZ256rmikz:
6664 case X86::VRANGEPSZ256rri:
6665 case X86::VRANGEPSZ256rrikz:
6666 case X86::VRANGEPSZrmbi:
6667 case X86::VRANGEPSZrmbikz:
6668 case X86::VRANGEPSZrmi:
6669 case X86::VRANGEPSZrmikz:
6670 case X86::VRANGEPSZrri:
6671 case X86::VRANGEPSZrrib:
6672 case X86::VRANGEPSZrribkz:
6673 case X86::VRANGEPSZrrikz:
6674 case X86::VRANGESDZrmi:
6675 case X86::VRANGESDZrmikz:
6676 case X86::VRANGESDZrri:
6677 case X86::VRANGESDZrrib:
6678 case X86::VRANGESDZrribkz:
6679 case X86::VRANGESDZrrikz:
6680 case X86::VRANGESSZrmi:
6681 case X86::VRANGESSZrmikz:
6682 case X86::VRANGESSZrri:
6683 case X86::VRANGESSZrrib:
6684 case X86::VRANGESSZrribkz:
6685 case X86::VRANGESSZrrikz:
6686 return Subtarget.hasRANGEFalseDeps();
6687 case X86::VGETMANTSSZrmi:
6688 case X86::VGETMANTSSZrmikz:
6689 case X86::VGETMANTSSZrri:
6690 case X86::VGETMANTSSZrrib:
6691 case X86::VGETMANTSSZrribkz:
6692 case X86::VGETMANTSSZrrikz:
6693 case X86::VGETMANTSDZrmi:
6694 case X86::VGETMANTSDZrmikz:
6695 case X86::VGETMANTSDZrri:
6696 case X86::VGETMANTSDZrrib:
6697 case X86::VGETMANTSDZrribkz:
6698 case X86::VGETMANTSDZrrikz:
6699 case X86::VGETMANTSHZrmi:
6700 case X86::VGETMANTSHZrmikz:
6701 case X86::VGETMANTSHZrri:
6702 case X86::VGETMANTSHZrrib:
6703 case X86::VGETMANTSHZrribkz:
6704 case X86::VGETMANTSHZrrikz:
6705 case X86::VGETMANTPSZ128rmbi:
6706 case X86::VGETMANTPSZ128rmbikz:
6707 case X86::VGETMANTPSZ128rmi:
6708 case X86::VGETMANTPSZ128rmikz:
6709 case X86::VGETMANTPSZ256rmbi:
6710 case X86::VGETMANTPSZ256rmbikz:
6711 case X86::VGETMANTPSZ256rmi:
6712 case X86::VGETMANTPSZ256rmikz:
6713 case X86::VGETMANTPSZrmbi:
6714 case X86::VGETMANTPSZrmbikz:
6715 case X86::VGETMANTPSZrmi:
6716 case X86::VGETMANTPSZrmikz:
6717 case X86::VGETMANTPDZ128rmbi:
6718 case X86::VGETMANTPDZ128rmbikz:
6719 case X86::VGETMANTPDZ128rmi:
6720 case X86::VGETMANTPDZ128rmikz:
6721 case X86::VGETMANTPDZ256rmbi:
6722 case X86::VGETMANTPDZ256rmbikz:
6723 case X86::VGETMANTPDZ256rmi:
6724 case X86::VGETMANTPDZ256rmikz:
6725 case X86::VGETMANTPDZrmbi:
6726 case X86::VGETMANTPDZrmbikz:
6727 case X86::VGETMANTPDZrmi:
6728 case X86::VGETMANTPDZrmikz:
6729 return Subtarget.hasGETMANTFalseDeps();
6730 case X86::VPMULLQZ128rm:
6731 case X86::VPMULLQZ128rmb:
6732 case X86::VPMULLQZ128rmbkz:
6733 case X86::VPMULLQZ128rmkz:
6734 case X86::VPMULLQZ128rr:
6735 case X86::VPMULLQZ128rrkz:
6736 case X86::VPMULLQZ256rm:
6737 case X86::VPMULLQZ256rmb:
6738 case X86::VPMULLQZ256rmbkz:
6739 case X86::VPMULLQZ256rmkz:
6740 case X86::VPMULLQZ256rr:
6741 case X86::VPMULLQZ256rrkz:
6742 case X86::VPMULLQZrm:
6743 case X86::VPMULLQZrmb:
6744 case X86::VPMULLQZrmbkz:
6745 case X86::VPMULLQZrmkz:
6746 case X86::VPMULLQZrr:
6747 case X86::VPMULLQZrrkz:
6748 return Subtarget.hasMULLQFalseDeps();
6749 // GPR
6750 case X86::POPCNT32rm:
6751 case X86::POPCNT32rr:
6752 case X86::POPCNT64rm:
6753 case X86::POPCNT64rr:
6754 return Subtarget.hasPOPCNTFalseDeps();
6755 case X86::LZCNT32rm:
6756 case X86::LZCNT32rr:
6757 case X86::LZCNT64rm:
6758 case X86::LZCNT64rr:
6759 case X86::TZCNT32rm:
6760 case X86::TZCNT32rr:
6761 case X86::TZCNT64rm:
6762 case X86::TZCNT64rr:
6763 return Subtarget.hasLZCNTFalseDeps();
6764 }
6765
// Any opcode not listed fully defines its destination.
6766 return false;
6767}
6768
6769/// Inform the BreakFalseDeps pass how many idle
6770/// instructions we would like before a partial register update.
// NOTE(review): this dump dropped line 6771, which carried the start of the
// signature (unsigned X86InstrInfo::getPartialRegUpdateClearance(...), per
// the TargetInstrInfo hook) -- confirm against the upstream source.
6772 const MachineInstr &MI, unsigned OpNum,
6773 const TargetRegisterInfo *TRI) const {
6774
// Only the def operand (operand 0) can carry a partial register update.
6775 if (OpNum != 0)
6776 return 0;
6777
6778 // NDD ops with 8/16b results may appear to be partial register
6779 // updates after register allocation.
6780 bool HasNDDPartialWrite = false;
6781 if (X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
6782 Register Reg = MI.getOperand(0).getReg();
6783 if (!Reg.isVirtual())
6784 HasNDDPartialWrite =
6785 X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
6786 }
6787
// Neither an NDD partial write nor a known partial-update opcode: no
// clearance needed.
6788 if (!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
6789 return 0;
6790
6791 // Check if the result register is also used as a source.
6792 // For non-NDD ops, this means a partial update is wanted, hence we return 0.
6793 // For NDD ops, this means it is possible to compress the instruction
6794 // to a legacy form in CompressEVEX, which would create an unwanted partial
6795 // update, so we return the clearance.
6796 const MachineOperand &MO = MI.getOperand(0);
6797 Register Reg = MO.getReg();
6798 bool ReadsReg = false;
6799 if (Reg.isVirtual())
6800 ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
6801 else
6802 ReadsReg = MI.readsRegister(Reg, TRI);
6803 if (ReadsReg != HasNDDPartialWrite)
6804 return 0;
6805
6806 // If any instructions in the clearance range are reading Reg, insert a
6807 // dependency breaking instruction, which is inexpensive and is likely to
6808 // be hidden in other instruction's cycles.
// NOTE(review): this dump dropped line 6809; upstream returns the
// PartialRegUpdateClearance cl::opt value (declared near the top of this
// file) here -- confirm against the upstream source.
6810}
6811
6812// Return true for any instruction the copies the high bits of the first source
6813// operand into the unused high bits of the destination operand.
6814// Also returns true for instructions that have two inputs where one may
6815// be undef and we want it to use the same register as the other input.
6816static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
6817 bool ForLoadFold = false) {
6818 // Set the OpNum parameter to the first source operand.
6819 switch (Opcode) {
6820 case X86::MMX_PUNPCKHBWrr:
6821 case X86::MMX_PUNPCKHWDrr:
6822 case X86::MMX_PUNPCKHDQrr:
6823 case X86::MMX_PUNPCKLBWrr:
6824 case X86::MMX_PUNPCKLWDrr:
6825 case X86::MMX_PUNPCKLDQrr:
6826 case X86::MOVHLPSrr:
6827 case X86::PACKSSWBrr:
6828 case X86::PACKUSWBrr:
6829 case X86::PACKSSDWrr:
6830 case X86::PACKUSDWrr:
6831 case X86::PUNPCKHBWrr:
6832 case X86::PUNPCKLBWrr:
6833 case X86::PUNPCKHWDrr:
6834 case X86::PUNPCKLWDrr:
6835 case X86::PUNPCKHDQrr:
6836 case X86::PUNPCKLDQrr:
6837 case X86::PUNPCKHQDQrr:
6838 case X86::PUNPCKLQDQrr:
6839 case X86::SHUFPDrri:
6840 case X86::SHUFPSrri:
6841 // These instructions are sometimes used with an undef first or second
6842 // source. Return true here so BreakFalseDeps will assign this source to the
6843 // same register as the first source to avoid a false dependency.
6844 // Operand 1 of these instructions is tied so they're separate from their
6845 // VEX counterparts.
6846 return OpNum == 2 && !ForLoadFold;
6847
6848 case X86::VMOVLHPSrr:
6849 case X86::VMOVLHPSZrr:
6850 case X86::VPACKSSWBrr:
6851 case X86::VPACKUSWBrr:
6852 case X86::VPACKSSDWrr:
6853 case X86::VPACKUSDWrr:
6854 case X86::VPACKSSWBZ128rr:
6855 case X86::VPACKUSWBZ128rr:
6856 case X86::VPACKSSDWZ128rr:
6857 case X86::VPACKUSDWZ128rr:
6858 case X86::VPERM2F128rri:
6859 case X86::VPERM2I128rri:
6860 case X86::VSHUFF32X4Z256rri:
6861 case X86::VSHUFF32X4Zrri:
6862 case X86::VSHUFF64X2Z256rri:
6863 case X86::VSHUFF64X2Zrri:
6864 case X86::VSHUFI32X4Z256rri:
6865 case X86::VSHUFI32X4Zrri:
6866 case X86::VSHUFI64X2Z256rri:
6867 case X86::VSHUFI64X2Zrri:
6868 case X86::VPUNPCKHBWrr:
6869 case X86::VPUNPCKLBWrr:
6870 case X86::VPUNPCKHBWYrr:
6871 case X86::VPUNPCKLBWYrr:
6872 case X86::VPUNPCKHBWZ128rr:
6873 case X86::VPUNPCKLBWZ128rr:
6874 case X86::VPUNPCKHBWZ256rr:
6875 case X86::VPUNPCKLBWZ256rr:
6876 case X86::VPUNPCKHBWZrr:
6877 case X86::VPUNPCKLBWZrr:
6878 case X86::VPUNPCKHWDrr:
6879 case X86::VPUNPCKLWDrr:
6880 case X86::VPUNPCKHWDYrr:
6881 case X86::VPUNPCKLWDYrr:
6882 case X86::VPUNPCKHWDZ128rr:
6883 case X86::VPUNPCKLWDZ128rr:
6884 case X86::VPUNPCKHWDZ256rr:
6885 case X86::VPUNPCKLWDZ256rr:
6886 case X86::VPUNPCKHWDZrr:
6887 case X86::VPUNPCKLWDZrr:
6888 case X86::VPUNPCKHDQrr:
6889 case X86::VPUNPCKLDQrr:
6890 case X86::VPUNPCKHDQYrr:
6891 case X86::VPUNPCKLDQYrr:
6892 case X86::VPUNPCKHDQZ128rr:
6893 case X86::VPUNPCKLDQZ128rr:
6894 case X86::VPUNPCKHDQZ256rr:
6895 case X86::VPUNPCKLDQZ256rr:
6896 case X86::VPUNPCKHDQZrr:
6897 case X86::VPUNPCKLDQZrr:
6898 case X86::VPUNPCKHQDQrr:
6899 case X86::VPUNPCKLQDQrr:
6900 case X86::VPUNPCKHQDQYrr:
6901 case X86::VPUNPCKLQDQYrr:
6902 case X86::VPUNPCKHQDQZ128rr:
6903 case X86::VPUNPCKLQDQZ128rr:
6904 case X86::VPUNPCKHQDQZ256rr:
6905 case X86::VPUNPCKLQDQZ256rr:
6906 case X86::VPUNPCKHQDQZrr:
6907 case X86::VPUNPCKLQDQZrr:
6908 // These instructions are sometimes used with an undef first or second
6909 // source. Return true here so BreakFalseDeps will assign this source to the
6910 // same register as the first source to avoid a false dependency.
6911 return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
6912
6913 case X86::VCVTSI2SSrr:
6914 case X86::VCVTSI2SSrm:
6915 case X86::VCVTSI2SSrr_Int:
6916 case X86::VCVTSI2SSrm_Int:
6917 case X86::VCVTSI642SSrr:
6918 case X86::VCVTSI642SSrm:
6919 case X86::VCVTSI642SSrr_Int:
6920 case X86::VCVTSI642SSrm_Int:
6921 case X86::VCVTSI2SDrr:
6922 case X86::VCVTSI2SDrm:
6923 case X86::VCVTSI2SDrr_Int:
6924 case X86::VCVTSI2SDrm_Int:
6925 case X86::VCVTSI642SDrr:
6926 case X86::VCVTSI642SDrm:
6927 case X86::VCVTSI642SDrr_Int:
6928 case X86::VCVTSI642SDrm_Int:
6929 // AVX-512
6930 case X86::VCVTSI2SSZrr:
6931 case X86::VCVTSI2SSZrm:
6932 case X86::VCVTSI2SSZrr_Int:
6933 case X86::VCVTSI2SSZrrb_Int:
6934 case X86::VCVTSI2SSZrm_Int:
6935 case X86::VCVTSI642SSZrr:
6936 case X86::VCVTSI642SSZrm:
6937 case X86::VCVTSI642SSZrr_Int:
6938 case X86::VCVTSI642SSZrrb_Int:
6939 case X86::VCVTSI642SSZrm_Int:
6940 case X86::VCVTSI2SDZrr:
6941 case X86::VCVTSI2SDZrm:
6942 case X86::VCVTSI2SDZrr_Int:
6943 case X86::VCVTSI2SDZrm_Int:
6944 case X86::VCVTSI642SDZrr:
6945 case X86::VCVTSI642SDZrm:
6946 case X86::VCVTSI642SDZrr_Int:
6947 case X86::VCVTSI642SDZrrb_Int:
6948 case X86::VCVTSI642SDZrm_Int:
6949 case X86::VCVTUSI2SSZrr:
6950 case X86::VCVTUSI2SSZrm:
6951 case X86::VCVTUSI2SSZrr_Int:
6952 case X86::VCVTUSI2SSZrrb_Int:
6953 case X86::VCVTUSI2SSZrm_Int:
6954 case X86::VCVTUSI642SSZrr:
6955 case X86::VCVTUSI642SSZrm:
6956 case X86::VCVTUSI642SSZrr_Int:
6957 case X86::VCVTUSI642SSZrrb_Int:
6958 case X86::VCVTUSI642SSZrm_Int:
6959 case X86::VCVTUSI2SDZrr:
6960 case X86::VCVTUSI2SDZrm:
6961 case X86::VCVTUSI2SDZrr_Int:
6962 case X86::VCVTUSI2SDZrm_Int:
6963 case X86::VCVTUSI642SDZrr:
6964 case X86::VCVTUSI642SDZrm:
6965 case X86::VCVTUSI642SDZrr_Int:
6966 case X86::VCVTUSI642SDZrrb_Int:
6967 case X86::VCVTUSI642SDZrm_Int:
6968 case X86::VCVTSI2SHZrr:
6969 case X86::VCVTSI2SHZrm:
6970 case X86::VCVTSI2SHZrr_Int:
6971 case X86::VCVTSI2SHZrrb_Int:
6972 case X86::VCVTSI2SHZrm_Int:
6973 case X86::VCVTSI642SHZrr:
6974 case X86::VCVTSI642SHZrm:
6975 case X86::VCVTSI642SHZrr_Int:
6976 case X86::VCVTSI642SHZrrb_Int:
6977 case X86::VCVTSI642SHZrm_Int:
6978 case X86::VCVTUSI2SHZrr:
6979 case X86::VCVTUSI2SHZrm:
6980 case X86::VCVTUSI2SHZrr_Int:
6981 case X86::VCVTUSI2SHZrrb_Int:
6982 case X86::VCVTUSI2SHZrm_Int:
6983 case X86::VCVTUSI642SHZrr:
6984 case X86::VCVTUSI642SHZrm:
6985 case X86::VCVTUSI642SHZrr_Int:
6986 case X86::VCVTUSI642SHZrrb_Int:
6987 case X86::VCVTUSI642SHZrm_Int:
6988 // Load folding won't effect the undef register update since the input is
6989 // a GPR.
6990 return OpNum == 1 && !ForLoadFold;
6991 case X86::VCVTSD2SSrr:
6992 case X86::VCVTSD2SSrm:
6993 case X86::VCVTSD2SSrr_Int:
6994 case X86::VCVTSD2SSrm_Int:
6995 case X86::VCVTSS2SDrr:
6996 case X86::VCVTSS2SDrm:
6997 case X86::VCVTSS2SDrr_Int:
6998 case X86::VCVTSS2SDrm_Int:
6999 case X86::VRCPSSr:
7000 case X86::VRCPSSr_Int:
7001 case X86::VRCPSSm:
7002 case X86::VRCPSSm_Int:
7003 case X86::VROUNDSDri:
7004 case X86::VROUNDSDmi:
7005 case X86::VROUNDSDri_Int:
7006 case X86::VROUNDSDmi_Int:
7007 case X86::VROUNDSSri:
7008 case X86::VROUNDSSmi:
7009 case X86::VROUNDSSri_Int:
7010 case X86::VROUNDSSmi_Int:
7011 case X86::VRSQRTSSr:
7012 case X86::VRSQRTSSr_Int:
7013 case X86::VRSQRTSSm:
7014 case X86::VRSQRTSSm_Int:
7015 case X86::VSQRTSSr:
7016 case X86::VSQRTSSr_Int:
7017 case X86::VSQRTSSm:
7018 case X86::VSQRTSSm_Int:
7019 case X86::VSQRTSDr:
7020 case X86::VSQRTSDr_Int:
7021 case X86::VSQRTSDm:
7022 case X86::VSQRTSDm_Int:
7023 // AVX-512
7024 case X86::VCVTSD2SSZrr:
7025 case X86::VCVTSD2SSZrr_Int:
7026 case X86::VCVTSD2SSZrrb_Int:
7027 case X86::VCVTSD2SSZrm:
7028 case X86::VCVTSD2SSZrm_Int:
7029 case X86::VCVTSS2SDZrr:
7030 case X86::VCVTSS2SDZrr_Int:
7031 case X86::VCVTSS2SDZrrb_Int:
7032 case X86::VCVTSS2SDZrm:
7033 case X86::VCVTSS2SDZrm_Int:
7034 case X86::VGETEXPSDZr:
7035 case X86::VGETEXPSDZrb:
7036 case X86::VGETEXPSDZm:
7037 case X86::VGETEXPSSZr:
7038 case X86::VGETEXPSSZrb:
7039 case X86::VGETEXPSSZm:
7040 case X86::VGETMANTSDZrri:
7041 case X86::VGETMANTSDZrrib:
7042 case X86::VGETMANTSDZrmi:
7043 case X86::VGETMANTSSZrri:
7044 case X86::VGETMANTSSZrrib:
7045 case X86::VGETMANTSSZrmi:
7046 case X86::VRNDSCALESDZrri:
7047 case X86::VRNDSCALESDZrri_Int:
7048 case X86::VRNDSCALESDZrrib_Int:
7049 case X86::VRNDSCALESDZrmi:
7050 case X86::VRNDSCALESDZrmi_Int:
7051 case X86::VRNDSCALESSZrri:
7052 case X86::VRNDSCALESSZrri_Int:
7053 case X86::VRNDSCALESSZrrib_Int:
7054 case X86::VRNDSCALESSZrmi:
7055 case X86::VRNDSCALESSZrmi_Int:
7056 case X86::VRCP14SDZrr:
7057 case X86::VRCP14SDZrm:
7058 case X86::VRCP14SSZrr:
7059 case X86::VRCP14SSZrm:
7060 case X86::VRCPSHZrr:
7061 case X86::VRCPSHZrm:
7062 case X86::VRSQRTSHZrr:
7063 case X86::VRSQRTSHZrm:
7064 case X86::VREDUCESHZrmi:
7065 case X86::VREDUCESHZrri:
7066 case X86::VREDUCESHZrrib:
7067 case X86::VGETEXPSHZr:
7068 case X86::VGETEXPSHZrb:
7069 case X86::VGETEXPSHZm:
7070 case X86::VGETMANTSHZrri:
7071 case X86::VGETMANTSHZrrib:
7072 case X86::VGETMANTSHZrmi:
7073 case X86::VRNDSCALESHZrri:
7074 case X86::VRNDSCALESHZrri_Int:
7075 case X86::VRNDSCALESHZrrib_Int:
7076 case X86::VRNDSCALESHZrmi:
7077 case X86::VRNDSCALESHZrmi_Int:
7078 case X86::VSQRTSHZr:
7079 case X86::VSQRTSHZr_Int:
7080 case X86::VSQRTSHZrb_Int:
7081 case X86::VSQRTSHZm:
7082 case X86::VSQRTSHZm_Int:
7083 case X86::VRCP28SDZr:
7084 case X86::VRCP28SDZrb:
7085 case X86::VRCP28SDZm:
7086 case X86::VRCP28SSZr:
7087 case X86::VRCP28SSZrb:
7088 case X86::VRCP28SSZm:
7089 case X86::VREDUCESSZrmi:
7090 case X86::VREDUCESSZrri:
7091 case X86::VREDUCESSZrrib:
7092 case X86::VRSQRT14SDZrr:
7093 case X86::VRSQRT14SDZrm:
7094 case X86::VRSQRT14SSZrr:
7095 case X86::VRSQRT14SSZrm:
7096 case X86::VRSQRT28SDZr:
7097 case X86::VRSQRT28SDZrb:
7098 case X86::VRSQRT28SDZm:
7099 case X86::VRSQRT28SSZr:
7100 case X86::VRSQRT28SSZrb:
7101 case X86::VRSQRT28SSZm:
7102 case X86::VSQRTSSZr:
7103 case X86::VSQRTSSZr_Int:
7104 case X86::VSQRTSSZrb_Int:
7105 case X86::VSQRTSSZm:
7106 case X86::VSQRTSSZm_Int:
7107 case X86::VSQRTSDZr:
7108 case X86::VSQRTSDZr_Int:
7109 case X86::VSQRTSDZrb_Int:
7110 case X86::VSQRTSDZm:
7111 case X86::VSQRTSDZm_Int:
7112 case X86::VCVTSD2SHZrr:
7113 case X86::VCVTSD2SHZrr_Int:
7114 case X86::VCVTSD2SHZrrb_Int:
7115 case X86::VCVTSD2SHZrm:
7116 case X86::VCVTSD2SHZrm_Int:
7117 case X86::VCVTSS2SHZrr:
7118 case X86::VCVTSS2SHZrr_Int:
7119 case X86::VCVTSS2SHZrrb_Int:
7120 case X86::VCVTSS2SHZrm:
7121 case X86::VCVTSS2SHZrm_Int:
7122 case X86::VCVTSH2SDZrr:
7123 case X86::VCVTSH2SDZrr_Int:
7124 case X86::VCVTSH2SDZrrb_Int:
7125 case X86::VCVTSH2SDZrm:
7126 case X86::VCVTSH2SDZrm_Int:
7127 case X86::VCVTSH2SSZrr:
7128 case X86::VCVTSH2SSZrr_Int:
7129 case X86::VCVTSH2SSZrrb_Int:
7130 case X86::VCVTSH2SSZrm:
7131 case X86::VCVTSH2SSZrm_Int:
7132 return OpNum == 1;
7133 case X86::VMOVSSZrrk:
7134 case X86::VMOVSDZrrk:
7135 return OpNum == 3 && !ForLoadFold;
7136 case X86::VMOVSSZrrkz:
7137 case X86::VMOVSDZrrkz:
7138 return OpNum == 2 && !ForLoadFold;
7139 }
7140
7141 return false;
7142}
7143
7144/// Inform the BreakFalseDeps pass how many idle instructions we would like
7145/// before certain undef register reads.
7146///
7147/// This catches the VCVTSI2SD family of instructions:
7148///
7149/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
7150///
7151/// We should be careful *not* to catch VXOR idioms which are presumably
7152/// handled specially in the pipeline:
7153///
7154/// vxorps undef %xmm1, undef %xmm1, %xmm1
7155///
7156/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
7157/// high bits that are passed-through are not live.
7158 unsigned
// NOTE(review): the declarator line (7159) naming
// X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
// was dropped by the source extraction -- confirm against upstream.
7160 const TargetRegisterInfo *TRI) const {
// Inspect the operand the BreakFalseDeps pass is asking about.
7161 const MachineOperand &MO = MI.getOperand(OpNum);
// Only physical registers whose opcode/operand pair is listed by
// hasUndefRegUpdate (instructions that pass through undef high bits)
// get a clearance; the value is the "-undef-reg-clearance" cl::opt.
7162 if (MO.getReg().isPhysical() && hasUndefRegUpdate(MI.getOpcode(), OpNum))
7163 return UndefRegClearance;
7164
// Zero means no dependency-breaking distance is requested for this operand.
7165 return 0;
7166}
7167
// X86InstrInfo::breakFalseDeps -- insert a register-zeroing idiom
// (xorps / vpxord / xor32) before MI so that MI's read of an undef register
// does not create a false dependence on that register's previous writer.
// NOTE(review): the line declaring the function name (7168) was dropped by
// the source extraction -- confirm the signature against upstream.
7169 MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
7170 Register Reg = MI.getOperand(OpNum).getReg();
7171 // If MI kills this register, the false dependence is already broken.
7172 if (MI.killsRegister(Reg, TRI))
7173 return;
7174
7175 if (X86::VR128RegClass.contains(Reg)) {
7176 // These instructions are all floating point domain, so xorps is the best
7177 // choice.
7178 unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
7179 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
7180 .addReg(Reg, RegState::Undef)
7181 .addReg(Reg, RegState::Undef);
7182 MI.addRegisterKilled(Reg, TRI, true);
7183 } else if (X86::VR256RegClass.contains(Reg)) {
7184 // Use vxorps to clear the full ymm register.
7185 // It wants to read and write the xmm sub-register.
7186 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7187 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
7188 .addReg(XReg, RegState::Undef)
7189 .addReg(XReg, RegState::Undef)
// NOTE(review): a line (7190) is missing here; upstream terminates this
// builder chain with ".addReg(Reg, RegState::ImplicitDefine);" to mark the
// full ymm register as defined -- confirm before relying on this listing.
7191 MI.addRegisterKilled(Reg, TRI, true);
7192 } else if (X86::VR128XRegClass.contains(Reg)) {
7193 // Only handle VLX targets.
7194 if (!Subtarget.hasVLX())
7195 return;
7196 // Since vxorps requires AVX512DQ, vpxord should be the best choice.
7197 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
7198 .addReg(Reg, RegState::Undef)
7199 .addReg(Reg, RegState::Undef);
7200 MI.addRegisterKilled(Reg, TRI, true);
7201 } else if (X86::VR256XRegClass.contains(Reg) ||
7202 X86::VR512RegClass.contains(Reg)) {
7203 // Only handle VLX targets.
7204 if (!Subtarget.hasVLX())
7205 return;
7206 // Use vpxord to clear the full ymm/zmm register.
7207 // It wants to read and write the xmm sub-register.
7208 Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7209 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
7210 .addReg(XReg, RegState::Undef)
7211 .addReg(XReg, RegState::Undef)
// NOTE(review): line 7212 is missing here; upstream adds
// ".addReg(Reg, RegState::ImplicitDefine);" -- confirm against upstream.
7213 MI.addRegisterKilled(Reg, TRI, true);
7214 } else if (X86::GR64RegClass.contains(Reg)) {
7215 // Using XOR32rr because it has shorter encoding and zeros up the upper bits
7216 // as well.
7217 Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
7218 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
7219 .addReg(XReg, RegState::Undef)
7220 .addReg(XReg, RegState::Undef)
// NOTE(review): line 7221 is missing here; upstream adds
// ".addReg(Reg, RegState::ImplicitDefine);" for the 64-bit register.
7222 MI.addRegisterKilled(Reg, TRI, true);
7223 } else if (X86::GR32RegClass.contains(Reg)) {
7224 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
7225 .addReg(Reg, RegState::Undef)
7226 .addReg(Reg, RegState::Undef);
7227 MI.addRegisterKilled(Reg, TRI, true);
7228 } else if ((X86::GR16RegClass.contains(Reg) ||
7229 X86::GR8RegClass.contains(Reg)) &&
7230 X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
7231 // This case is only expected for NDD ops which appear to be partial
7232 // writes, but are not due to the zeroing of the upper part. Here
7233 // we add an implicit def of the superregister, which prevents
7234 // CompressEVEX from converting this to a legacy form.
7235 Register SuperReg = getX86SubSuperRegister(Reg, 64);
// Note: this local "BuildMI" is the MachineInstrBuilder *wrapper* around
// the existing MI (it shadows the BuildMI function); no new instruction
// is created -- we only append an operand to MI itself.
7236 MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
7237 if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
7238 BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
7239 }
7240}
7241
// addOperands -- append a memory reference (either a bare FrameIndex or a
// full 5-operand x86 address) from MOs onto the builder MIB, optionally
// displaced by PtrOffset bytes.
// NOTE(review): the signature line (7242), upstream
// "static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,"
// was dropped by the source extraction -- confirm against upstream.
7243 int PtrOffset = 0) {
7244 unsigned NumAddrOps = MOs.size();
7245
7246 if (NumAddrOps < 4) {
7247 // FrameIndex only - add an immediate offset (whether it's zero or not).
7248 for (unsigned i = 0; i != NumAddrOps; ++i)
7249 MIB.add(MOs[i]);
7250 addOffset(MIB, PtrOffset);
7251 } else {
7252 // General Memory Addressing - we need to add any offset to an existing
7253 // offset.
7254 assert(MOs.size() == 5 && "Unexpected memory operand list length");
7255 for (unsigned i = 0; i != NumAddrOps; ++i) {
7256 const MachineOperand &MO = MOs[i];
// Operand index 3 is the displacement slot of the x86 address; fold the
// extra PtrOffset into it rather than appending it separately.
7257 if (i == 3 && PtrOffset != 0) {
7258 MIB.addDisp(MO, PtrOffset);
7259 } else {
7260 MIB.add(MO);
7261 }
7262 }
7263 }
7264}
7265
// updateOperandRegConstraints -- after building a fused (memory-folded)
// instruction, re-constrain every virtual register operand to the register
// class required by the new opcode's operand description.
// NOTE(review): the extraction dropped line 7266 (upstream:
// "static void updateOperandRegConstraints(MachineFunction &MF,") and line
// 7269 (upstream: "MachineRegisterInfo &MRI = MF.getRegInfo();") -- confirm.
7267 MachineInstr &NewMI,
7268 const TargetInstrInfo &TII) {
7270
7271 for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
7272 MachineOperand &MO = NewMI.getOperand(Idx);
7273 // We only need to update constraints on virtual register operands.
7274 if (!MO.isReg())
7275 continue;
7276 Register Reg = MO.getReg();
7277 if (!Reg.isVirtual())
7278 continue;
7279
// constrainRegClass returns null when the existing class and the class the
// new opcode demands have no common subclass; that is only reported, not
// fixed, here.
7280 auto *NewRC =
7281 MRI.constrainRegClass(Reg, TII.getRegClass(NewMI.getDesc(), Idx));
7282 if (!NewRC) {
7283 LLVM_DEBUG(
7284 dbgs() << "WARNING: Unable to update register constraint for operand "
7285 << Idx << " of instruction:\n";
7286 NewMI.dump(); dbgs() << "\n");
7287 }
7288 }
7289}
7290
// fuseTwoAddrInst -- build a new instruction with opcode Opcode whose fused
// memory reference (MOs) replaces *both* tied registers (operands 0 and 1)
// of the two-address instruction MI; remaining operands are copied over.
// NOTE(review): parameter lines 7292-7294 (upstream: ArrayRef<MachineOperand>
// MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI,) were dropped
// by the source extraction -- confirm against upstream.
7291 static MachineInstr *fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
7295 const TargetInstrInfo &TII) {
7296 // Create the base instruction with the memory operand as the first part.
7297 // Omit the implicit operands, something BuildMI can't do.
7298 MachineInstr *NewMI =
7299 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7300 MachineInstrBuilder MIB(MF, NewMI);
7301 addOperands(MIB, MOs);
7302
7303 // Loop over the rest of the ri operands, converting them over.
// Skip MI's operands 0 and 1 (the tied def/use pair being replaced by the
// memory reference); copy the remaining declared operands.
7304 unsigned NumOps = MI.getDesc().getNumOperands() - 2;
7305 for (unsigned i = 0; i != NumOps; ++i) {
7306 MachineOperand &MO = MI.getOperand(i + 2);
7307 MIB.add(MO);
7308 }
// Also copy any trailing (e.g. implicit) operands beyond the declared ones.
7309 for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2))
7310 MIB.add(MO);
7311
7312 updateOperandRegConstraints(MF, *NewMI, TII);
7313
7314 MachineBasicBlock *MBB = InsertPt->getParent();
7315 MBB->insert(InsertPt, NewMI);
7316
7317 return MIB;
7318}
7319
// fuseInst -- build a new instruction with opcode Opcode that is a copy of
// MI except that the register operand at index OpNo is replaced by the
// memory reference MOs (optionally displaced by PtrOffset).
// NOTE(review): parameter lines 7322-7323 (upstream: MachineBasicBlock::
// iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII,) were
// dropped by the source extraction -- confirm against upstream.
7320 static MachineInstr *fuseInst(MachineFunction &MF, unsigned Opcode,
7321 unsigned OpNo, ArrayRef<MachineOperand> MOs,
7324 int PtrOffset = 0) {
7325 // Omit the implicit operands, something BuildMI can't do.
7326 MachineInstr *NewMI =
7327 MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7328 MachineInstrBuilder MIB(MF, NewMI);
7329
7330 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
7331 MachineOperand &MO = MI.getOperand(i);
7332 if (i == OpNo) {
7333 assert(MO.isReg() && "Expected to fold into reg operand!");
7334 addOperands(MIB, MOs, PtrOffset);
7335 } else {
7336 MIB.add(MO);
7337 }
7338 }
7339
7340 updateOperandRegConstraints(MF, *NewMI, TII);
7341
7342 // Copy the NoFPExcept flag from the instruction we're fusing.
// NOTE(review): lines 7343-7344 are missing here; upstream reads
// "if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept))
//    NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept);" -- confirm.
7345
7346 MachineBasicBlock *MBB = InsertPt->getParent();
7347 MBB->insert(InsertPt, NewMI);
7348
7349 return MIB;
7350}
7351
// makeM0Inst -- emit "Opcode <mem>, 0": an instruction of the given opcode
// whose address operands come from MOs and whose final operand is the
// immediate 0 (used e.g. to turn MOV32r0 into a store of zero).
// NOTE(review): parameter lines 7353-7354 (upstream: ArrayRef<MachineOperand>
// MOs, MachineBasicBlock::iterator InsertPt,) were dropped by the source
// extraction -- confirm against upstream.
7352 static MachineInstr *makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
7355 MachineInstr &MI) {
7356 MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
7357 MI.getDebugLoc(), TII.get(Opcode));
7358 addOperands(MIB, MOs);
7359 return MIB.addImm(0);
7360}
7361
// Handle memory-operand folds that cannot be expressed in the generated
// fold tables (opcode changes with immediate rewrites or pointer offsets).
// Returns the new fused instruction, or nullptr if no custom fold applies.
// NOTE(review): parameter line 7364 (upstream: "ArrayRef<MachineOperand>
// MOs, MachineBasicBlock::iterator InsertPt,") was dropped by the source
// extraction -- confirm against upstream.
7362 MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
7363 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7365 unsigned Size, Align Alignment) const {
7366 switch (MI.getOpcode()) {
7367 case X86::INSERTPSrri:
7368 case X86::VINSERTPSrri:
7369 case X86::VINSERTPSZrri:
7370 // Attempt to convert the load of inserted vector into a fold load
7371 // of a single float.
7372 if (OpNum == 2) {
// Decode the INSERTPS immediate: bits [3:0] zero mask, bits [5:4]
// destination element, bits [7:6] source element.
7373 unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
7374 unsigned ZMask = Imm & 15;
7375 unsigned DstIdx = (Imm >> 4) & 3;
7376 unsigned SrcIdx = (Imm >> 6) & 3;
7377
7378 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7379 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7380 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7381 if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
7382 (MI.getOpcode() != X86::INSERTPSrri || Alignment >= Align(4))) {
// Load only the selected float: offset the pointer to element SrcIdx and
// rewrite the immediate so the memory form reads its source from slot 0.
7383 int PtrOffset = SrcIdx * 4;
7384 unsigned NewImm = (DstIdx << 4) | ZMask;
7385 unsigned NewOpCode =
7386 (MI.getOpcode() == X86::VINSERTPSZrri) ? X86::VINSERTPSZrmi
7387 : (MI.getOpcode() == X86::VINSERTPSrri) ? X86::VINSERTPSrmi
7388 : X86::INSERTPSrmi;
7389 MachineInstr *NewMI =
7390 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
7391 NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
7392 return NewMI;
7393 }
7394 }
7395 break;
7396 case X86::MOVHLPSrr:
7397 case X86::VMOVHLPSrr:
7398 case X86::VMOVHLPSZrr:
7399 // Move the upper 64-bits of the second operand to the lower 64-bits.
7400 // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
7401 // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
7402 if (OpNum == 2) {
7403 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7404 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7405 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7406 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
7407 unsigned NewOpCode =
7408 (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm
7409 : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
7410 : X86::MOVLPSrm;
// Pointer offset 8 selects the upper 64 bits of the 128-bit source.
7411 MachineInstr *NewMI =
7412 fuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
7413 return NewMI;
7414 }
7415 }
7416 break;
7417 case X86::UNPCKLPDrr:
7418 // If we won't be able to fold this to the memory form of UNPCKL, use
7419 // MOVHPD instead. Done as custom because we can't have this in the load
7420 // table twice.
7421 if (OpNum == 2) {
7422 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7423 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7424 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7425 if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
7426 MachineInstr *NewMI =
7427 fuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
7428 return NewMI;
7429 }
7430 }
7431 break;
7432 case X86::MOV32r0:
// Folding a zeroing move into a store becomes a store of immediate zero;
// pick the 32- or 64-bit store based on the spill-slot size.
7433 if (auto *NewMI =
7434 makeM0Inst(*this, (Size == 4) ? X86::MOV32mi : X86::MOV64mi32, MOs,
7435 InsertPt, MI))
7436 return NewMI;
7437 break;
7438 }
7439
7440 return nullptr;
7441}
7442
// Returns true when folding a load into MI must be avoided because MI's
// operand 1 is an undef (or IMPLICIT_DEF-produced) register on an
// instruction known to pass through undef high bits, which would reintroduce
// a partial/undef register update stall.
// NOTE(review): the extraction dropped line 7443 (upstream: "static bool
// shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,") and the line
// declaring "MachineRegisterInfo &RegInfo = MF.getRegInfo();" below --
// confirm against upstream.
7444 MachineInstr &MI) {
7445 if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
7446 !MI.getOperand(1).isReg())
7447 return false;
7448
7449 // There are two cases we need to handle depending on where in the pipeline
7450 // the folding attempt is being made.
7451 // -Register has the undef flag set.
7452 // -Register is produced by the IMPLICIT_DEF instruction.
7453
7454 if (MI.getOperand(1).isUndef())
7455 return true;
7457
7458 MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
7459 return VRegDef && VRegDef->isImplicitDef();
7460}
7461
7462unsigned X86InstrInfo::commuteOperandsForFold(MachineInstr &MI,
7463 unsigned Idx1) const {
7464 unsigned Idx2 = CommuteAnyOperandIndex;
7465 if (!findCommutedOpIndices(MI, Idx1, Idx2))
7466 return Idx1;
7467
7468 bool HasDef = MI.getDesc().getNumDefs();
7469 Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
7470 Register Reg1 = MI.getOperand(Idx1).getReg();
7471 Register Reg2 = MI.getOperand(Idx2).getReg();
7472 bool Tied1 = 0 == MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO);
7473 bool Tied2 = 0 == MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO);
7474
7475 // If either of the commutable operands are tied to the destination
7476 // then we can not commute + fold.
7477 if ((HasDef && Reg0 == Reg1 && Tied1) || (HasDef && Reg0 == Reg2 && Tied2))
7478 return Idx1;
7479
7480 return commuteInstruction(MI, false, Idx1, Idx2) ? Idx2 : Idx1;
7481}
7482
7483static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx) {
7484 if (PrintFailedFusing && !MI.isCopy())
7485 dbgs() << "We failed to fuse operand " << Idx << " in " << MI;
7486}
7487
// Core memory-folding routine: attempt to replace the register operand
// OpNum of MI with the memory reference MOs, consulting the custom folds,
// the generated fold tables, and (optionally) a commuted form of MI.
// Returns the new fused instruction or nullptr on failure.
// NOTE(review): the extraction dropped the declarator line 7488 (upstream:
// "MachineInstr *X86InstrInfo::foldMemoryOperandImpl(") and parameter line
// 7490 -- confirm against upstream.
7489 MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7491 unsigned Size, Align Alignment, bool AllowCommute) const {
7492 bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
7493 unsigned Opc = MI.getOpcode();
7494
7495 // For CPUs that favor the register form of a call or push,
7496 // do not fold loads into calls or pushes, unless optimizing for size
7497 // aggressively.
7498 if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
7499 (Opc == X86::CALL32r || Opc == X86::CALL64r ||
7500 Opc == X86::CALL64r_ImpCall || Opc == X86::PUSH16r ||
7501 Opc == X86::PUSH32r || Opc == X86::PUSH64r))
7502 return nullptr;
7503
7504 // Avoid partial and undef register update stalls unless optimizing for size.
// NOTE(review): line 7507 is missing below; upstream completes this
// condition with "shouldPreventUndefRegUpdateMemFold(MF, MI)))" -- confirm.
7505 if (!MF.getFunction().hasOptSize() &&
7506 (hasPartialRegUpdate(Opc, Subtarget, /*ForLoadFold*/ true) ||
7508 return nullptr;
7509
// Detect the two-address form: folding into a tied def/use pair replaces
// *both* registers with the memory location (see fuseTwoAddrInst).
7510 unsigned NumOps = MI.getDesc().getNumOperands();
7511 bool IsTwoAddr = NumOps > 1 && OpNum < 2 && MI.getOperand(0).isReg() &&
7512 MI.getOperand(1).isReg() &&
7513 MI.getOperand(0).getReg() == MI.getOperand(1).getReg();
7514
7515 // FIXME: AsmPrinter doesn't know how to handle
7516 // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
7517 if (Opc == X86::ADD32ri &&
7518 MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
7519 return nullptr;
7520
7521 // GOTTPOFF relocation loads can only be folded into add instructions.
7522 // FIXME: Need to exclude other relocations that only support specific
7523 // instructions.
7524 if (MOs.size() == X86::AddrNumOperands &&
7525 MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
7526 Opc != X86::ADD64rr)
7527 return nullptr;
7528
7529 // Don't fold loads into indirect calls that need a KCFI check as we'll
7530 // have to unfold these in X86TargetLowering::EmitKCFICheck anyway.
7531 if (MI.isCall() && MI.getCFIType())
7532 return nullptr;
7533
7534 // Attempt to fold any custom cases we have.
7535 if (auto *CustomMI = foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt,
7536 Size, Alignment))
7537 return CustomMI;
7538
7539 // Folding a memory location into the two-address part of a two-address
7540 // instruction is different than folding it other places. It requires
7541 // replacing the *two* registers with the memory location.
7542 //
7543 // Utilize the mapping NonNDD -> RMW for the NDD variant.
7544 unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U;
7545 const X86FoldTableEntry *I =
7546 IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc)
7547 : lookupFoldTable(Opc, OpNum);
7548
7549 MachineInstr *NewMI = nullptr;
7550 if (I) {
7551 unsigned Opcode = I->DstOp;
// Reject the fold when the memory reference is less aligned than the fold
// table entry demands (alignment is encoded in the entry's TB_ALIGN bits).
7552 if (Alignment <
7553 Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
7554 return nullptr;
7555 bool NarrowToMOV32rm = false;
// NOTE(review): line 7557 is missing below; upstream declares
// "const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();".
7556 if (Size) {
7558 const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum);
7559 unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7560 // Check if it's safe to fold the load. If the size of the object is
7561 // narrower than the load width, then it's not.
7562 // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
7563 if ((I->Flags & TB_FOLDED_LOAD) && Size < RCSize) {
7564 // If this is a 64-bit load, but the spill slot is 32, then we can do
7565 // a 32-bit load which is implicitly zero-extended. This likely is
7566 // due to live interval analysis remat'ing a load from stack slot.
7567 if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
7568 return nullptr;
7569 if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
7570 return nullptr;
7571 Opcode = X86::MOV32rm;
7572 NarrowToMOV32rm = true;
7573 }
7574 // For stores, make sure the size of the object is equal to the size of
7575 // the store. If the object is larger, the extra bits would be garbage. If
7576 // the object is smaller we might overwrite another object or fault.
7577 if ((I->Flags & TB_FOLDED_STORE) && Size != RCSize)
7578 return nullptr;
7579 }
7580
7581 NewMI = IsTwoAddr ? fuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this)
7582 : fuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
7583
7584 if (NarrowToMOV32rm) {
7585 // If this is the special case where we use a MOV32rm to load a 32-bit
7586 // value and zero-extend the top bits. Change the destination register
7587 // to a 32-bit one.
7588 Register DstReg = NewMI->getOperand(0).getReg();
7589 if (DstReg.isPhysical())
7590 NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
7591 else
7592 NewMI->getOperand(0).setSubReg(X86::sub_32bit);
7593 }
7594 return NewMI;
7595 }
7596
7597 if (AllowCommute) {
7598 // If the instruction and target operand are commutable, commute the
7599 // instruction and try again.
7600 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
7601 if (CommuteOpIdx2 == OpNum) {
7602 printFailMsgforFold(MI, OpNum);
7603 return nullptr;
7604 }
// Recurse once with AllowCommute=false so we cannot commute back and forth
// indefinitely.
7605 // Attempt to fold with the commuted version of the instruction.
7606 NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
7607 Alignment, /*AllowCommute=*/false);
7608 if (NewMI)
7609 return NewMI;
7610 // Folding failed again - undo the commute before returning.
7611 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
7612 }
7613
7614 printFailMsgforFold(MI, OpNum);
7615 return nullptr;
7616}
7617
// Frame-index overload: fold a spill-slot reference (FrameIndex) into MI at
// the operand indices in Ops, delegating to the address-based
// foldMemoryOperandImpl above. Also special-cases TESTrr -> CMPri 0 so the
// register operand can be replaced by the reload.
// NOTE(review): the extraction dropped the declarator lines 7618-7619
// (upstream: "MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
// MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,") --
// confirm against upstream.
7620 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
7621 VirtRegMap *VRM) const {
7622 // Check switch flag
7623 if (NoFusing)
7624 return nullptr;
7625
7626 // Avoid partial and undef register update stalls unless optimizing for size.
// NOTE(review): line 7629 is missing below; upstream completes this
// condition with "shouldPreventUndefRegUpdateMemFold(MF, MI)))" -- confirm.
7627 if (!MF.getFunction().hasOptSize() &&
7628 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7630 return nullptr;
7631
7632 // Don't fold subreg spills, or reloads that use a high subreg.
7633 for (auto Op : Ops) {
7634 MachineOperand &MO = MI.getOperand(Op);
7635 auto SubReg = MO.getSubReg();
7636 // MOV32r0 is special b/c it's used to clear a 64-bit register too.
7637 // (See patterns for MOV32r0 in TD files).
7638 if (MI.getOpcode() == X86::MOV32r0 && SubReg == X86::sub_32bit)
7639 continue;
7640 if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
7641 return nullptr;
7642 }
7643
7644 const MachineFrameInfo &MFI = MF.getFrameInfo();
7645 unsigned Size = MFI.getObjectSize(FrameIndex);
7646 Align Alignment = MFI.getObjectAlign(FrameIndex);
7647 // If the function stack isn't realigned we don't want to fold instructions
7648 // that need increased alignment.
7649 if (!RI.hasStackRealignment(MF))
7650 Alignment =
7651 std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
7652
// Deferred call to the address-based fold; Ops[0] names the operand to
// replace with the frame-index memory reference.
7653 auto Impl = [&]() {
7654 return foldMemoryOperandImpl(MF, MI, Ops[0],
7655 MachineOperand::CreateFI(FrameIndex), InsertPt,
7656 Size, Alignment, /*AllowCommute=*/true);
7657 };
// TEST r,r reads the same spilled value twice; rewrite it as CMP r,0 first
// so only one operand remains to be replaced by the reload.
7658 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
7659 unsigned NewOpc = 0;
7660 unsigned RCSize = 0;
7661 unsigned Opc = MI.getOpcode();
7662 switch (Opc) {
7663 default:
7664 // NDD can be folded into RMW though its Op0 and Op1 are not tied.
7665 return (Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U) ? Impl()
7666 : nullptr;
7667 case X86::TEST8rr:
7668 NewOpc = X86::CMP8ri;
7669 RCSize = 1;
7670 break;
7671 case X86::TEST16rr:
7672 NewOpc = X86::CMP16ri;
7673 RCSize = 2;
7674 break;
7675 case X86::TEST32rr:
7676 NewOpc = X86::CMP32ri;
7677 RCSize = 4;
7678 break;
7679 case X86::TEST64rr:
7680 NewOpc = X86::CMP64ri32;
7681 RCSize = 8;
7682 break;
7683 }
7684 // Check if it's safe to fold the load. If the size of the object is
7685 // narrower than the load width, then it's not.
7686 if (Size < RCSize)
7687 return nullptr;
7688 // Change to CMPXXri r, 0 first.
7689 MI.setDesc(get(NewOpc));
7690 MI.getOperand(1).ChangeToImmediate(0);
7691 } else if (Ops.size() != 1)
7692 return nullptr;
7693
7694 return Impl();
7695}
7696
7697/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
7698/// because the latter uses contents that wouldn't be defined in the folded
7699/// version. For instance, this transformation isn't legal:
7700/// movss (%rdi), %xmm0
7701/// addps %xmm0, %xmm0
7702/// ->
7703/// addps (%rdi), %xmm0
7704///
7705/// But this one is:
7706/// movss (%rdi), %xmm0
7707/// addss %xmm0, %xmm0
7708/// ->
7709/// addss (%rdi), %xmm0
7710///
7712 const MachineInstr &UserMI,
7713 const MachineFunction &MF) {
7714 unsigned Opc = LoadMI.getOpcode();
7715 unsigned UserOpc = UserMI.getOpcode();
7717 const TargetRegisterClass *RC =
7718 MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
7719 unsigned RegSize = TRI.getRegSizeInBits(*RC);
7720
7721 if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
7722 Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
7723 Opc == X86::VMOVSSZrm_alt) &&
7724 RegSize > 32) {
7725 // These instructions only load 32 bits, we can't fold them if the
7726 // destination register is wider than 32 bits (4 bytes), and its user
7727 // instruction isn't scalar (SS).
7728 switch (UserOpc) {
7729 case X86::CVTSS2SDrr_Int:
7730 case X86::VCVTSS2SDrr_Int:
7731 case X86::VCVTSS2SDZrr_Int:
7732 case X86::VCVTSS2SDZrrk_Int:
7733 case X86::VCVTSS2SDZrrkz_Int:
7734 case X86::CVTSS2SIrr_Int:
7735 case X86::CVTSS2SI64rr_Int:
7736 case X86::VCVTSS2SIrr_Int:
7737 case X86::VCVTSS2SI64rr_Int:
7738 case X86::VCVTSS2SIZrr_Int:
7739 case X86::VCVTSS2SI64Zrr_Int:
7740 case X86::CVTTSS2SIrr_Int:
7741 case X86::CVTTSS2SI64rr_Int:
7742 case X86::VCVTTSS2SIrr_Int:
7743 case X86::VCVTTSS2SI64rr_Int:
7744 case X86::VCVTTSS2SIZrr_Int:
7745 case X86::VCVTTSS2SI64Zrr_Int:
7746 case X86::VCVTSS2USIZrr_Int:
7747 case X86::VCVTSS2USI64Zrr_Int:
7748 case X86::VCVTTSS2USIZrr_Int:
7749 case X86::VCVTTSS2USI64Zrr_Int:
7750 case X86::RCPSSr_Int:
7751 case X86::VRCPSSr_Int:
7752 case X86::RSQRTSSr_Int:
7753 case X86::VRSQRTSSr_Int:
7754 case X86::ROUNDSSri_Int:
7755 case X86::VROUNDSSri_Int:
7756 case X86::COMISSrr_Int:
7757 case X86::VCOMISSrr_Int:
7758 case X86::VCOMISSZrr_Int:
7759 case X86::UCOMISSrr_Int:
7760 case X86::VUCOMISSrr_Int:
7761 case X86::VUCOMISSZrr_Int:
7762 case X86::ADDSSrr_Int:
7763 case X86::VADDSSrr_Int:
7764 case X86::VADDSSZrr_Int:
7765 case X86::CMPSSrri_Int:
7766 case X86::VCMPSSrri_Int:
7767 case X86::VCMPSSZrri_Int:
7768 case X86::DIVSSrr_Int:
7769 case X86::VDIVSSrr_Int:
7770 case X86::VDIVSSZrr_Int:
7771 case X86::MAXSSrr_Int:
7772 case X86::VMAXSSrr_Int:
7773 case X86::VMAXSSZrr_Int:
7774 case X86::MINSSrr_Int:
7775 case X86::VMINSSrr_Int:
7776 case X86::VMINSSZrr_Int:
7777 case X86::MULSSrr_Int:
7778 case X86::VMULSSrr_Int:
7779 case X86::VMULSSZrr_Int:
7780 case X86::SQRTSSr_Int:
7781 case X86::VSQRTSSr_Int:
7782 case X86::VSQRTSSZr_Int:
7783 case X86::SUBSSrr_Int:
7784 case X86::VSUBSSrr_Int:
7785 case X86::VSUBSSZrr_Int:
7786 case X86::VADDSSZrrk_Int:
7787 case X86::VADDSSZrrkz_Int:
7788 case X86::VCMPSSZrrik_Int:
7789 case X86::VDIVSSZrrk_Int:
7790 case X86::VDIVSSZrrkz_Int:
7791 case X86::VMAXSSZrrk_Int:
7792 case X86::VMAXSSZrrkz_Int:
7793 case X86::VMINSSZrrk_Int:
7794 case X86::VMINSSZrrkz_Int:
7795 case X86::VMULSSZrrk_Int:
7796 case X86::VMULSSZrrkz_Int:
7797 case X86::VSQRTSSZrk_Int:
7798 case X86::VSQRTSSZrkz_Int:
7799 case X86::VSUBSSZrrk_Int:
7800 case X86::VSUBSSZrrkz_Int:
7801 case X86::VFMADDSS4rr_Int:
7802 case X86::VFNMADDSS4rr_Int:
7803 case X86::VFMSUBSS4rr_Int:
7804 case X86::VFNMSUBSS4rr_Int:
7805 case X86::VFMADD132SSr_Int:
7806 case X86::VFNMADD132SSr_Int:
7807 case X86::VFMADD213SSr_Int:
7808 case X86::VFNMADD213SSr_Int:
7809 case X86::VFMADD231SSr_Int:
7810 case X86::VFNMADD231SSr_Int:
7811 case X86::VFMSUB132SSr_Int:
7812 case X86::VFNMSUB132SSr_Int:
7813 case X86::VFMSUB213SSr_Int:
7814 case X86::VFNMSUB213SSr_Int:
7815 case X86::VFMSUB231SSr_Int:
7816 case X86::VFNMSUB231SSr_Int:
7817 case X86::VFMADD132SSZr_Int:
7818 case X86::VFNMADD132SSZr_Int:
7819 case X86::VFMADD213SSZr_Int:
7820 case X86::VFNMADD213SSZr_Int:
7821 case X86::VFMADD231SSZr_Int:
7822 case X86::VFNMADD231SSZr_Int:
7823 case X86::VFMSUB132SSZr_Int:
7824 case X86::VFNMSUB132SSZr_Int:
7825 case X86::VFMSUB213SSZr_Int:
7826 case X86::VFNMSUB213SSZr_Int:
7827 case X86::VFMSUB231SSZr_Int:
7828 case X86::VFNMSUB231SSZr_Int:
7829 case X86::VFMADD132SSZrk_Int:
7830 case X86::VFNMADD132SSZrk_Int:
7831 case X86::VFMADD213SSZrk_Int:
7832 case X86::VFNMADD213SSZrk_Int:
7833 case X86::VFMADD231SSZrk_Int:
7834 case X86::VFNMADD231SSZrk_Int:
7835 case X86::VFMSUB132SSZrk_Int:
7836 case X86::VFNMSUB132SSZrk_Int:
7837 case X86::VFMSUB213SSZrk_Int:
7838 case X86::VFNMSUB213SSZrk_Int:
7839 case X86::VFMSUB231SSZrk_Int:
7840 case X86::VFNMSUB231SSZrk_Int:
7841 case X86::VFMADD132SSZrkz_Int:
7842 case X86::VFNMADD132SSZrkz_Int:
7843 case X86::VFMADD213SSZrkz_Int:
7844 case X86::VFNMADD213SSZrkz_Int:
7845 case X86::VFMADD231SSZrkz_Int:
7846 case X86::VFNMADD231SSZrkz_Int:
7847 case X86::VFMSUB132SSZrkz_Int:
7848 case X86::VFNMSUB132SSZrkz_Int:
7849 case X86::VFMSUB213SSZrkz_Int:
7850 case X86::VFNMSUB213SSZrkz_Int:
7851 case X86::VFMSUB231SSZrkz_Int:
7852 case X86::VFNMSUB231SSZrkz_Int:
7853 case X86::VFIXUPIMMSSZrri:
7854 case X86::VFIXUPIMMSSZrrik:
7855 case X86::VFIXUPIMMSSZrrikz:
7856 case X86::VFPCLASSSSZri:
7857 case X86::VFPCLASSSSZrik:
7858 case X86::VGETEXPSSZr:
7859 case X86::VGETEXPSSZrk:
7860 case X86::VGETEXPSSZrkz:
7861 case X86::VGETMANTSSZrri:
7862 case X86::VGETMANTSSZrrik:
7863 case X86::VGETMANTSSZrrikz:
7864 case X86::VRANGESSZrri:
7865 case X86::VRANGESSZrrik:
7866 case X86::VRANGESSZrrikz:
7867 case X86::VRCP14SSZrr:
7868 case X86::VRCP14SSZrrk:
7869 case X86::VRCP14SSZrrkz:
7870 case X86::VRCP28SSZr:
7871 case X86::VRCP28SSZrk:
7872 case X86::VRCP28SSZrkz:
7873 case X86::VREDUCESSZrri:
7874 case X86::VREDUCESSZrrik:
7875 case X86::VREDUCESSZrrikz:
7876 case X86::VRNDSCALESSZrri_Int:
7877 case X86::VRNDSCALESSZrrik_Int:
7878 case X86::VRNDSCALESSZrrikz_Int:
7879 case X86::VRSQRT14SSZrr:
7880 case X86::VRSQRT14SSZrrk:
7881 case X86::VRSQRT14SSZrrkz:
7882 case X86::VRSQRT28SSZr:
7883 case X86::VRSQRT28SSZrk:
7884 case X86::VRSQRT28SSZrkz:
7885 case X86::VSCALEFSSZrr:
7886 case X86::VSCALEFSSZrrk:
7887 case X86::VSCALEFSSZrrkz:
7888 return false;
7889 default:
7890 return true;
7891 }
7892 }
7893
7894 if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
7895 Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
7896 Opc == X86::VMOVSDZrm_alt) &&
7897 RegSize > 64) {
7898 // These instructions only load 64 bits, we can't fold them if the
7899 // destination register is wider than 64 bits (8 bytes), and its user
7900 // instruction isn't scalar (SD).
7901 switch (UserOpc) {
7902 case X86::CVTSD2SSrr_Int:
7903 case X86::VCVTSD2SSrr_Int:
7904 case X86::VCVTSD2SSZrr_Int:
7905 case X86::VCVTSD2SSZrrk_Int:
7906 case X86::VCVTSD2SSZrrkz_Int:
7907 case X86::CVTSD2SIrr_Int:
7908 case X86::CVTSD2SI64rr_Int:
7909 case X86::VCVTSD2SIrr_Int:
7910 case X86::VCVTSD2SI64rr_Int:
7911 case X86::VCVTSD2SIZrr_Int:
7912 case X86::VCVTSD2SI64Zrr_Int:
7913 case X86::CVTTSD2SIrr_Int:
7914 case X86::CVTTSD2SI64rr_Int:
7915 case X86::VCVTTSD2SIrr_Int:
7916 case X86::VCVTTSD2SI64rr_Int:
7917 case X86::VCVTTSD2SIZrr_Int:
7918 case X86::VCVTTSD2SI64Zrr_Int:
7919 case X86::VCVTSD2USIZrr_Int:
7920 case X86::VCVTSD2USI64Zrr_Int:
7921 case X86::VCVTTSD2USIZrr_Int:
7922 case X86::VCVTTSD2USI64Zrr_Int:
7923 case X86::ROUNDSDri_Int:
7924 case X86::VROUNDSDri_Int:
7925 case X86::COMISDrr_Int:
7926 case X86::VCOMISDrr_Int:
7927 case X86::VCOMISDZrr_Int:
7928 case X86::UCOMISDrr_Int:
7929 case X86::VUCOMISDrr_Int:
7930 case X86::VUCOMISDZrr_Int:
7931 case X86::ADDSDrr_Int:
7932 case X86::VADDSDrr_Int:
7933 case X86::VADDSDZrr_Int:
7934 case X86::CMPSDrri_Int:
7935 case X86::VCMPSDrri_Int:
7936 case X86::VCMPSDZrri_Int:
7937 case X86::DIVSDrr_Int:
7938 case X86::VDIVSDrr_Int:
7939 case X86::VDIVSDZrr_Int:
7940 case X86::MAXSDrr_Int:
7941 case X86::VMAXSDrr_Int:
7942 case X86::VMAXSDZrr_Int:
7943 case X86::MINSDrr_Int:
7944 case X86::VMINSDrr_Int:
7945 case X86::VMINSDZrr_Int:
7946 case X86::MULSDrr_Int:
7947 case X86::VMULSDrr_Int:
7948 case X86::VMULSDZrr_Int:
7949 case X86::SQRTSDr_Int:
7950 case X86::VSQRTSDr_Int:
7951 case X86::VSQRTSDZr_Int:
7952 case X86::SUBSDrr_Int:
7953 case X86::VSUBSDrr_Int:
7954 case X86::VSUBSDZrr_Int:
7955 case X86::VADDSDZrrk_Int:
7956 case X86::VADDSDZrrkz_Int:
7957 case X86::VCMPSDZrrik_Int:
7958 case X86::VDIVSDZrrk_Int:
7959 case X86::VDIVSDZrrkz_Int:
7960 case X86::VMAXSDZrrk_Int:
7961 case X86::VMAXSDZrrkz_Int:
7962 case X86::VMINSDZrrk_Int:
7963 case X86::VMINSDZrrkz_Int:
7964 case X86::VMULSDZrrk_Int:
7965 case X86::VMULSDZrrkz_Int:
7966 case X86::VSQRTSDZrk_Int:
7967 case X86::VSQRTSDZrkz_Int:
7968 case X86::VSUBSDZrrk_Int:
7969 case X86::VSUBSDZrrkz_Int:
7970 case X86::VFMADDSD4rr_Int:
7971 case X86::VFNMADDSD4rr_Int:
7972 case X86::VFMSUBSD4rr_Int:
7973 case X86::VFNMSUBSD4rr_Int:
7974 case X86::VFMADD132SDr_Int:
7975 case X86::VFNMADD132SDr_Int:
7976 case X86::VFMADD213SDr_Int:
7977 case X86::VFNMADD213SDr_Int:
7978 case X86::VFMADD231SDr_Int:
7979 case X86::VFNMADD231SDr_Int:
7980 case X86::VFMSUB132SDr_Int:
7981 case X86::VFNMSUB132SDr_Int:
7982 case X86::VFMSUB213SDr_Int:
7983 case X86::VFNMSUB213SDr_Int:
7984 case X86::VFMSUB231SDr_Int:
7985 case X86::VFNMSUB231SDr_Int:
7986 case X86::VFMADD132SDZr_Int:
7987 case X86::VFNMADD132SDZr_Int:
7988 case X86::VFMADD213SDZr_Int:
7989 case X86::VFNMADD213SDZr_Int:
7990 case X86::VFMADD231SDZr_Int:
7991 case X86::VFNMADD231SDZr_Int:
7992 case X86::VFMSUB132SDZr_Int:
7993 case X86::VFNMSUB132SDZr_Int:
7994 case X86::VFMSUB213SDZr_Int:
7995 case X86::VFNMSUB213SDZr_Int:
7996 case X86::VFMSUB231SDZr_Int:
7997 case X86::VFNMSUB231SDZr_Int:
7998 case X86::VFMADD132SDZrk_Int:
7999 case X86::VFNMADD132SDZrk_Int:
8000 case X86::VFMADD213SDZrk_Int:
8001 case X86::VFNMADD213SDZrk_Int:
8002 case X86::VFMADD231SDZrk_Int:
8003 case X86::VFNMADD231SDZrk_Int:
8004 case X86::VFMSUB132SDZrk_Int:
8005 case X86::VFNMSUB132SDZrk_Int:
8006 case X86::VFMSUB213SDZrk_Int:
8007 case X86::VFNMSUB213SDZrk_Int:
8008 case X86::VFMSUB231SDZrk_Int:
8009 case X86::VFNMSUB231SDZrk_Int:
8010 case X86::VFMADD132SDZrkz_Int:
8011 case X86::VFNMADD132SDZrkz_Int:
8012 case X86::VFMADD213SDZrkz_Int:
8013 case X86::VFNMADD213SDZrkz_Int:
8014 case X86::VFMADD231SDZrkz_Int:
8015 case X86::VFNMADD231SDZrkz_Int:
8016 case X86::VFMSUB132SDZrkz_Int:
8017 case X86::VFNMSUB132SDZrkz_Int:
8018 case X86::VFMSUB213SDZrkz_Int:
8019 case X86::VFNMSUB213SDZrkz_Int:
8020 case X86::VFMSUB231SDZrkz_Int:
8021 case X86::VFNMSUB231SDZrkz_Int:
8022 case X86::VFIXUPIMMSDZrri:
8023 case X86::VFIXUPIMMSDZrrik:
8024 case X86::VFIXUPIMMSDZrrikz:
8025 case X86::VFPCLASSSDZri:
8026 case X86::VFPCLASSSDZrik:
8027 case X86::VGETEXPSDZr:
8028 case X86::VGETEXPSDZrk:
8029 case X86::VGETEXPSDZrkz:
8030 case X86::VGETMANTSDZrri:
8031 case X86::VGETMANTSDZrrik:
8032 case X86::VGETMANTSDZrrikz:
8033 case X86::VRANGESDZrri:
8034 case X86::VRANGESDZrrik:
8035 case X86::VRANGESDZrrikz:
8036 case X86::VRCP14SDZrr:
8037 case X86::VRCP14SDZrrk:
8038 case X86::VRCP14SDZrrkz:
8039 case X86::VRCP28SDZr:
8040 case X86::VRCP28SDZrk:
8041 case X86::VRCP28SDZrkz:
8042 case X86::VREDUCESDZrri:
8043 case X86::VREDUCESDZrrik:
8044 case X86::VREDUCESDZrrikz:
8045 case X86::VRNDSCALESDZrri_Int:
8046 case X86::VRNDSCALESDZrrik_Int:
8047 case X86::VRNDSCALESDZrrikz_Int:
8048 case X86::VRSQRT14SDZrr:
8049 case X86::VRSQRT14SDZrrk:
8050 case X86::VRSQRT14SDZrrkz:
8051 case X86::VRSQRT28SDZr:
8052 case X86::VRSQRT28SDZrk:
8053 case X86::VRSQRT28SDZrkz:
8054 case X86::VSCALEFSDZrr:
8055 case X86::VSCALEFSDZrrk:
8056 case X86::VSCALEFSDZrrkz:
8057 return false;
8058 default:
8059 return true;
8060 }
8061 }
8062
8063 if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) {
8064 // These instructions only load 16 bits, we can't fold them if the
8065 // destination register is wider than 16 bits (2 bytes), and its user
8066 // instruction isn't scalar (SH).
8067 switch (UserOpc) {
8068 case X86::VADDSHZrr_Int:
8069 case X86::VCMPSHZrri_Int:
8070 case X86::VDIVSHZrr_Int:
8071 case X86::VMAXSHZrr_Int:
8072 case X86::VMINSHZrr_Int:
8073 case X86::VMULSHZrr_Int:
8074 case X86::VSUBSHZrr_Int:
8075 case X86::VADDSHZrrk_Int:
8076 case X86::VADDSHZrrkz_Int:
8077 case X86::VCMPSHZrrik_Int:
8078 case X86::VDIVSHZrrk_Int:
8079 case X86::VDIVSHZrrkz_Int:
8080 case X86::VMAXSHZrrk_Int:
8081 case X86::VMAXSHZrrkz_Int:
8082 case X86::VMINSHZrrk_Int:
8083 case X86::VMINSHZrrkz_Int:
8084 case X86::VMULSHZrrk_Int:
8085 case X86::VMULSHZrrkz_Int:
8086 case X86::VSUBSHZrrk_Int:
8087 case X86::VSUBSHZrrkz_Int:
8088 case X86::VFMADD132SHZr_Int:
8089 case X86::VFNMADD132SHZr_Int:
8090 case X86::VFMADD213SHZr_Int:
8091 case X86::VFNMADD213SHZr_Int:
8092 case X86::VFMADD231SHZr_Int:
8093 case X86::VFNMADD231SHZr_Int:
8094 case X86::VFMSUB132SHZr_Int:
8095 case X86::VFNMSUB132SHZr_Int:
8096 case X86::VFMSUB213SHZr_Int:
8097 case X86::VFNMSUB213SHZr_Int:
8098 case X86::VFMSUB231SHZr_Int:
8099 case X86::VFNMSUB231SHZr_Int:
8100 case X86::VFMADD132SHZrk_Int:
8101 case X86::VFNMADD132SHZrk_Int:
8102 case X86::VFMADD213SHZrk_Int:
8103 case X86::VFNMADD213SHZrk_Int:
8104 case X86::VFMADD231SHZrk_Int:
8105 case X86::VFNMADD231SHZrk_Int:
8106 case X86::VFMSUB132SHZrk_Int:
8107 case X86::VFNMSUB132SHZrk_Int:
8108 case X86::VFMSUB213SHZrk_Int:
8109 case X86::VFNMSUB213SHZrk_Int:
8110 case X86::VFMSUB231SHZrk_Int:
8111 case X86::VFNMSUB231SHZrk_Int:
8112 case X86::VFMADD132SHZrkz_Int:
8113 case X86::VFNMADD132SHZrkz_Int:
8114 case X86::VFMADD213SHZrkz_Int:
8115 case X86::VFNMADD213SHZrkz_Int:
8116 case X86::VFMADD231SHZrkz_Int:
8117 case X86::VFNMADD231SHZrkz_Int:
8118 case X86::VFMSUB132SHZrkz_Int:
8119 case X86::VFNMSUB132SHZrkz_Int:
8120 case X86::VFMSUB213SHZrkz_Int:
8121 case X86::VFNMSUB213SHZrkz_Int:
8122 case X86::VFMSUB231SHZrkz_Int:
8123 case X86::VFNMSUB231SHZrkz_Int:
8124 return false;
8125 default:
8126 return true;
8127 }
8128 }
8129
8130 return false;
8131}
8132
8135 MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
8136 LiveIntervals *LIS) const {
8137
8138 // If LoadMI is a masked load, check MI having the same mask.
8139 const MCInstrDesc &MCID = get(LoadMI.getOpcode());
8140 unsigned NumOps = MCID.getNumOperands();
8141 if (NumOps >= 3) {
8142 Register MaskReg;
8143 const MachineOperand &Op1 = LoadMI.getOperand(1);
8144 const MachineOperand &Op2 = LoadMI.getOperand(2);
8145
8146 auto IsVKWMClass = [](const TargetRegisterClass *RC) {
8147 return RC == &X86::VK2WMRegClass || RC == &X86::VK4WMRegClass ||
8148 RC == &X86::VK8WMRegClass || RC == &X86::VK16WMRegClass ||
8149 RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass;
8150 };
8151
8152 if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1)))
8153 MaskReg = Op1.getReg();
8154 else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2)))
8155 MaskReg = Op2.getReg();
8156
8157 if (MaskReg) {
8158 bool HasSameMask = false;
8159 for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) {
8160 const MachineOperand &Op = MI.getOperand(I);
8161 if (Op.isReg() && Op.getReg() == MaskReg) {
8162 HasSameMask = true;
8163 break;
8164 }
8165 }
8166 if (!HasSameMask)
8167 return nullptr;
8168 }
8169 }
8170
8171 // TODO: Support the case where LoadMI loads a wide register, but MI
8172 // only uses a subreg.
8173 for (auto Op : Ops) {
8174 if (MI.getOperand(Op).getSubReg())
8175 return nullptr;
8176 }
8177
8178 // If loading from a FrameIndex, fold directly from the FrameIndex.
8179 int FrameIndex;
8180 if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
8181 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8182 return nullptr;
8183 return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
8184 }
8185
8186 // Check switch flag
8187 if (NoFusing)
8188 return nullptr;
8189
8190 // Avoid partial and undef register update stalls unless optimizing for size.
8191 if (!MF.getFunction().hasOptSize() &&
8192 (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
8194 return nullptr;
8195
8196 // Do not fold a NDD instruction and a memory instruction with relocation to
8197 // avoid emit APX relocation when the flag is disabled for backward
8198 // compatibility.
8199 uint64_t TSFlags = MI.getDesc().TSFlags;
8201 X86II::hasNewDataDest(TSFlags))
8202 return nullptr;
8203
8204 // Determine the alignment of the load.
8205 Align Alignment;
8206 unsigned LoadOpc = LoadMI.getOpcode();
8207 if (LoadMI.hasOneMemOperand())
8208 Alignment = (*LoadMI.memoperands_begin())->getAlign();
8209 else
8210 switch (LoadOpc) {
8211 case X86::AVX512_512_SET0:
8212 case X86::AVX512_512_SETALLONES:
8213 Alignment = Align(64);
8214 break;
8215 case X86::AVX2_SETALLONES:
8216 case X86::AVX1_SETALLONES:
8217 case X86::AVX_SET0:
8218 case X86::AVX512_256_SET0:
8219 case X86::AVX512_256_SETALLONES:
8220 Alignment = Align(32);
8221 break;
8222 case X86::V_SET0:
8223 case X86::V_SETALLONES:
8224 case X86::AVX512_128_SET0:
8225 case X86::FsFLD0F128:
8226 case X86::AVX512_FsFLD0F128:
8227 case X86::AVX512_128_SETALLONES:
8228 Alignment = Align(16);
8229 break;
8230 case X86::MMX_SET0:
8231 case X86::FsFLD0SD:
8232 case X86::AVX512_FsFLD0SD:
8233 Alignment = Align(8);
8234 break;
8235 case X86::FsFLD0SS:
8236 case X86::AVX512_FsFLD0SS:
8237 Alignment = Align(4);
8238 break;
8239 case X86::FsFLD0SH:
8240 case X86::AVX512_FsFLD0SH:
8241 Alignment = Align(2);
8242 break;
8243 default:
8244 return nullptr;
8245 }
8246 if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
8247 unsigned NewOpc = 0;
8248 switch (MI.getOpcode()) {
8249 default:
8250 return nullptr;
8251 case X86::TEST8rr:
8252 NewOpc = X86::CMP8ri;
8253 break;
8254 case X86::TEST16rr:
8255 NewOpc = X86::CMP16ri;
8256 break;
8257 case X86::TEST32rr:
8258 NewOpc = X86::CMP32ri;
8259 break;
8260 case X86::TEST64rr:
8261 NewOpc = X86::CMP64ri32;
8262 break;
8263 }
8264 // Change to CMPXXri r, 0 first.
8265 MI.setDesc(get(NewOpc));
8266 MI.getOperand(1).ChangeToImmediate(0);
8267 } else if (Ops.size() != 1)
8268 return nullptr;
8269
8270 // Make sure the subregisters match.
8271 // Otherwise we risk changing the size of the load.
8272 if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
8273 return nullptr;
8274
8276 switch (LoadOpc) {
8277 case X86::MMX_SET0:
8278 case X86::V_SET0:
8279 case X86::V_SETALLONES:
8280 case X86::AVX2_SETALLONES:
8281 case X86::AVX1_SETALLONES:
8282 case X86::AVX_SET0:
8283 case X86::AVX512_128_SET0:
8284 case X86::AVX512_256_SET0:
8285 case X86::AVX512_512_SET0:
8286 case X86::AVX512_128_SETALLONES:
8287 case X86::AVX512_256_SETALLONES:
8288 case X86::AVX512_512_SETALLONES:
8289 case X86::FsFLD0SH:
8290 case X86::AVX512_FsFLD0SH:
8291 case X86::FsFLD0SD:
8292 case X86::AVX512_FsFLD0SD:
8293 case X86::FsFLD0SS:
8294 case X86::AVX512_FsFLD0SS:
8295 case X86::FsFLD0F128:
8296 case X86::AVX512_FsFLD0F128: {
8297 // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
8298 // Create a constant-pool entry and operands to load from it.
8299
8300 // Large code model can't fold loads this way.
8302 return nullptr;
8303
8304 // x86-32 PIC requires a PIC base register for constant pools.
8305 unsigned PICBase = 0;
8306 // Since we're using Small or Kernel code model, we can always use
8307 // RIP-relative addressing for a smaller encoding.
8308 if (Subtarget.is64Bit()) {
8309 PICBase = X86::RIP;
8310 } else if (MF.getTarget().isPositionIndependent()) {
8311 // FIXME: PICBase = getGlobalBaseReg(&MF);
8312 // This doesn't work for several reasons.
8313 // 1. GlobalBaseReg may have been spilled.
8314 // 2. It may not be live at MI.
8315 return nullptr;
8316 }
8317
8318 // Create a constant-pool entry.
8320 Type *Ty;
8321 bool IsAllOnes = false;
8322 switch (LoadOpc) {
8323 case X86::FsFLD0SS:
8324 case X86::AVX512_FsFLD0SS:
8326 break;
8327 case X86::FsFLD0SD:
8328 case X86::AVX512_FsFLD0SD:
8330 break;
8331 case X86::FsFLD0F128:
8332 case X86::AVX512_FsFLD0F128:
8334 break;
8335 case X86::FsFLD0SH:
8336 case X86::AVX512_FsFLD0SH:
8338 break;
8339 case X86::AVX512_512_SETALLONES:
8340 IsAllOnes = true;
8341 [[fallthrough]];
8342 case X86::AVX512_512_SET0:
8344 16);
8345 break;
8346 case X86::AVX1_SETALLONES:
8347 case X86::AVX2_SETALLONES:
8348 case X86::AVX512_256_SETALLONES:
8349 IsAllOnes = true;
8350 [[fallthrough]];
8351 case X86::AVX512_256_SET0:
8352 case X86::AVX_SET0:
8354 8);
8355
8356 break;
8357 case X86::MMX_SET0:
8359 2);
8360 break;
8361 case X86::V_SETALLONES:
8362 case X86::AVX512_128_SETALLONES:
8363 IsAllOnes = true;
8364 [[fallthrough]];
8365 case X86::V_SET0:
8366 case X86::AVX512_128_SET0:
8368 4);
8369 break;
8370 }
8371
8372 const Constant *C =
8374 unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
8375
8376 // Create operands to load from the constant pool entry.
8377 MOs.push_back(MachineOperand::CreateReg(PICBase, false));
8379 MOs.push_back(MachineOperand::CreateReg(0, false));
8381 MOs.push_back(MachineOperand::CreateReg(0, false));
8382 break;
8383 }
8384 case X86::VPBROADCASTBZ128rm:
8385 case X86::VPBROADCASTBZ256rm:
8386 case X86::VPBROADCASTBZrm:
8387 case X86::VBROADCASTF32X2Z256rm:
8388 case X86::VBROADCASTF32X2Zrm:
8389 case X86::VBROADCASTI32X2Z128rm:
8390 case X86::VBROADCASTI32X2Z256rm:
8391 case X86::VBROADCASTI32X2Zrm:
8392 // No instructions currently fuse with 8bits or 32bits x 2.
8393 return nullptr;
8394
8395#define FOLD_BROADCAST(SIZE) \
8396 MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, \
8397 LoadMI.operands_begin() + NumOps); \
8398 return foldMemoryBroadcast(MF, MI, Ops[0], MOs, InsertPt, /*Size=*/SIZE, \
8399 /*AllowCommute=*/true);
8400 case X86::VPBROADCASTWZ128rm:
8401 case X86::VPBROADCASTWZ256rm:
8402 case X86::VPBROADCASTWZrm:
8403 FOLD_BROADCAST(16);
8404 case X86::VPBROADCASTDZ128rm:
8405 case X86::VPBROADCASTDZ256rm:
8406 case X86::VPBROADCASTDZrm:
8407 case X86::VBROADCASTSSZ128rm:
8408 case X86::VBROADCASTSSZ256rm:
8409 case X86::VBROADCASTSSZrm:
8410 FOLD_BROADCAST(32);
8411 case X86::VPBROADCASTQZ128rm:
8412 case X86::VPBROADCASTQZ256rm:
8413 case X86::VPBROADCASTQZrm:
8414 case X86::VBROADCASTSDZ256rm:
8415 case X86::VBROADCASTSDZrm:
8416 FOLD_BROADCAST(64);
8417 default: {
8418 if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8419 return nullptr;
8420
8421 // Folding a normal load. Just copy the load's address operands.
8423 LoadMI.operands_begin() + NumOps);
8424 break;
8425 }
8426 }
8427 return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
8428 /*Size=*/0, Alignment, /*AllowCommute=*/true);
8429}
8430
8432X86InstrInfo::foldMemoryBroadcast(MachineFunction &MF, MachineInstr &MI,
8433 unsigned OpNum, ArrayRef<MachineOperand> MOs,
8435 unsigned BitsSize, bool AllowCommute) const {
8436
8437 if (auto *I = lookupBroadcastFoldTable(MI.getOpcode(), OpNum))
8438 return matchBroadcastSize(*I, BitsSize)
8439 ? fuseInst(MF, I->DstOp, OpNum, MOs, InsertPt, MI, *this)
8440 : nullptr;
8441
8442 if (AllowCommute) {
8443 // If the instruction and target operand are commutable, commute the
8444 // instruction and try again.
8445 unsigned CommuteOpIdx2 = commuteOperandsForFold(MI, OpNum);
8446 if (CommuteOpIdx2 == OpNum) {
8447 printFailMsgforFold(MI, OpNum);
8448 return nullptr;
8449 }
8450 MachineInstr *NewMI =
8451 foldMemoryBroadcast(MF, MI, CommuteOpIdx2, MOs, InsertPt, BitsSize,
8452 /*AllowCommute=*/false);
8453 if (NewMI)
8454 return NewMI;
8455 // Folding failed again - undo the commute before returning.
8456 commuteInstruction(MI, false, OpNum, CommuteOpIdx2);
8457 }
8458
8459 printFailMsgforFold(MI, OpNum);
8460 return nullptr;
8461}
8462
8466
8467 for (MachineMemOperand *MMO : MMOs) {
8468 if (!MMO->isLoad())
8469 continue;
8470
8471 if (!MMO->isStore()) {
8472 // Reuse the MMO.
8473 LoadMMOs.push_back(MMO);
8474 } else {
8475 // Clone the MMO and unset the store flag.
8476 LoadMMOs.push_back(MF.getMachineMemOperand(
8477 MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
8478 }
8479 }
8480
8481 return LoadMMOs;
8482}
8483
8487
8488 for (MachineMemOperand *MMO : MMOs) {
8489 if (!MMO->isStore())
8490 continue;
8491
8492 if (!MMO->isLoad()) {
8493 // Reuse the MMO.
8494 StoreMMOs.push_back(MMO);
8495 } else {
8496 // Clone the MMO and unset the load flag.
8497 StoreMMOs.push_back(MF.getMachineMemOperand(
8498 MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
8499 }
8500 }
8501
8502 return StoreMMOs;
8503}
8504
8506 const TargetRegisterClass *RC,
8507 const X86Subtarget &STI) {
8508 assert(STI.hasAVX512() && "Expected at least AVX512!");
8509 unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
8510 assert((SpillSize == 64 || STI.hasVLX()) &&
8511 "Can't broadcast less than 64 bytes without AVX512VL!");
8512
8513#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64) \
8514 case TYPE: \
8515 switch (SpillSize) { \
8516 default: \
8517 llvm_unreachable("Unknown spill size"); \
8518 case 16: \
8519 return X86::OP16; \
8520 case 32: \
8521 return X86::OP32; \
8522 case 64: \
8523 return X86::OP64; \
8524 } \
8525 break;
8526
8527 switch (I->Flags & TB_BCAST_MASK) {
8528 default:
8529 llvm_unreachable("Unexpected broadcast type!");
8530 CASE_BCAST_TYPE_OPC(TB_BCAST_W, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8531 VPBROADCASTWZrm)
8532 CASE_BCAST_TYPE_OPC(TB_BCAST_D, VPBROADCASTDZ128rm, VPBROADCASTDZ256rm,
8533 VPBROADCASTDZrm)
8534 CASE_BCAST_TYPE_OPC(TB_BCAST_Q, VPBROADCASTQZ128rm, VPBROADCASTQZ256rm,
8535 VPBROADCASTQZrm)
8536 CASE_BCAST_TYPE_OPC(TB_BCAST_SH, VPBROADCASTWZ128rm, VPBROADCASTWZ256rm,
8537 VPBROADCASTWZrm)
8538 CASE_BCAST_TYPE_OPC(TB_BCAST_SS, VBROADCASTSSZ128rm, VBROADCASTSSZ256rm,
8539 VBROADCASTSSZrm)
8540 CASE_BCAST_TYPE_OPC(TB_BCAST_SD, VMOVDDUPZ128rm, VBROADCASTSDZ256rm,
8541 VBROADCASTSDZrm)
8542 }
8543}
8544
8546 MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad,
8547 bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
8548 const X86FoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
8549 if (I == nullptr)
8550 return false;
8551 unsigned Opc = I->DstOp;
8552 unsigned Index = I->Flags & TB_INDEX_MASK;
8553 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8554 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8555 if (UnfoldLoad && !FoldedLoad)
8556 return false;
8557 UnfoldLoad &= FoldedLoad;
8558 if (UnfoldStore && !FoldedStore)
8559 return false;
8560 UnfoldStore &= FoldedStore;
8561
8562 const MCInstrDesc &MCID = get(Opc);
8563
8564 const TargetRegisterClass *RC = getRegClass(MCID, Index);
8566 // TODO: Check if 32-byte or greater accesses are slow too?
8567 if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
8568 Subtarget.isUnalignedMem16Slow())
8569 // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
8570 // conservatively assume the address is unaligned. That's bad for
8571 // performance.
8572 return false;
8577 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
8578 MachineOperand &Op = MI.getOperand(i);
8579 if (i >= Index && i < Index + X86::AddrNumOperands)
8580 AddrOps.push_back(Op);
8581 else if (Op.isReg() && Op.isImplicit())
8582 ImpOps.push_back(Op);
8583 else if (i < Index)
8584 BeforeOps.push_back(Op);
8585 else if (i > Index)
8586 AfterOps.push_back(Op);
8587 }
8588
8589 // Emit the load or broadcast instruction.
8590 if (UnfoldLoad) {
8591 auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
8592
8593 unsigned Opc;
8594 if (I->Flags & TB_BCAST_MASK) {
8595 Opc = getBroadcastOpcode(I, RC, Subtarget);
8596 } else {
8597 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8598 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8599 Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
8600 }
8601
8602 DebugLoc DL;
8603 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
8604 for (const MachineOperand &AddrOp : AddrOps)
8605 MIB.add(AddrOp);
8606 MIB.setMemRefs(MMOs);
8607 NewMIs.push_back(MIB);
8608
8609 if (UnfoldStore) {
8610 // Address operands cannot be marked isKill.
8611 for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
8612 MachineOperand &MO = NewMIs[0]->getOperand(i);
8613 if (MO.isReg())
8614 MO.setIsKill(false);
8615 }
8616 }
8617 }
8618
8619 // Emit the data processing instruction.
8620 MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
8621 MachineInstrBuilder MIB(MF, DataMI);
8622
8623 if (FoldedStore)
8624 MIB.addReg(Reg, RegState::Define);
8625 for (MachineOperand &BeforeOp : BeforeOps)
8626 MIB.add(BeforeOp);
8627 if (FoldedLoad)
8628 MIB.addReg(Reg);
8629 for (MachineOperand &AfterOp : AfterOps)
8630 MIB.add(AfterOp);
8631 for (MachineOperand &ImpOp : ImpOps) {
8632 MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) |
8634 getKillRegState(ImpOp.isKill()) |
8635 getDeadRegState(ImpOp.isDead()) |
8636 getUndefRegState(ImpOp.isUndef()));
8637 }
8638 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8639 switch (DataMI->getOpcode()) {
8640 default:
8641 break;
8642 case X86::CMP64ri32:
8643 case X86::CMP32ri:
8644 case X86::CMP16ri:
8645 case X86::CMP8ri: {
8646 MachineOperand &MO0 = DataMI->getOperand(0);
8647 MachineOperand &MO1 = DataMI->getOperand(1);
8648 if (MO1.isImm() && MO1.getImm() == 0) {
8649 unsigned NewOpc;
8650 switch (DataMI->getOpcode()) {
8651 default:
8652 llvm_unreachable("Unreachable!");
8653 case X86::CMP64ri32:
8654 NewOpc = X86::TEST64rr;
8655 break;
8656 case X86::CMP32ri:
8657 NewOpc = X86::TEST32rr;
8658 break;
8659 case X86::CMP16ri:
8660 NewOpc = X86::TEST16rr;
8661 break;
8662 case X86::CMP8ri:
8663 NewOpc = X86::TEST8rr;
8664 break;
8665 }
8666 DataMI->setDesc(get(NewOpc));
8667 MO1.ChangeToRegister(MO0.getReg(), false);
8668 }
8669 }
8670 }
8671 NewMIs.push_back(DataMI);
8672
8673 // Emit the store instruction.
8674 if (UnfoldStore) {
8675 const TargetRegisterClass *DstRC = getRegClass(MCID, 0);
8676 auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
8677 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
8678 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8679 unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
8680 DebugLoc DL;
8681 MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
8682 for (const MachineOperand &AddrOp : AddrOps)
8683 MIB.add(AddrOp);
8684 MIB.addReg(Reg, RegState::Kill);
8685 MIB.setMemRefs(MMOs);
8686 NewMIs.push_back(MIB);
8687 }
8688
8689 return true;
8690}
8691
8693 SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const {
8694 if (!N->isMachineOpcode())
8695 return false;
8696
8697 const X86FoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
8698 if (I == nullptr)
8699 return false;
8700 unsigned Opc = I->DstOp;
8701 unsigned Index = I->Flags & TB_INDEX_MASK;
8702 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8703 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8704 const MCInstrDesc &MCID = get(Opc);
8707 const TargetRegisterClass *RC = getRegClass(MCID, Index);
8708 unsigned NumDefs = MCID.NumDefs;
8709 std::vector<SDValue> AddrOps;
8710 std::vector<SDValue> BeforeOps;
8711 std::vector<SDValue> AfterOps;
8712 SDLoc dl(N);
8713 unsigned NumOps = N->getNumOperands();
8714 for (unsigned i = 0; i != NumOps - 1; ++i) {
8715 SDValue Op = N->getOperand(i);
8716 if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands)
8717 AddrOps.push_back(Op);
8718 else if (i < Index - NumDefs)
8719 BeforeOps.push_back(Op);
8720 else if (i > Index - NumDefs)
8721 AfterOps.push_back(Op);
8722 }
8723 SDValue Chain = N->getOperand(NumOps - 1);
8724 AddrOps.push_back(Chain);
8725
8726 // Emit the load instruction.
8727 SDNode *Load = nullptr;
8728 if (FoldedLoad) {
8729 EVT VT = *TRI.legalclasstypes_begin(*RC);
8730 auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8731 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8732 Subtarget.isUnalignedMem16Slow())
8733 // Do not introduce a slow unaligned load.
8734 return false;
8735 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8736 // memory access is slow above.
8737
8738 unsigned Opc;
8739 if (I->Flags & TB_BCAST_MASK) {
8740 Opc = getBroadcastOpcode(I, RC, Subtarget);
8741 } else {
8742 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8743 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8744 Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
8745 }
8746
8747 Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
8748 NewNodes.push_back(Load);
8749
8750 // Preserve memory reference information.
8751 DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs);
8752 }
8753
8754 // Emit the data processing instruction.
8755 std::vector<EVT> VTs;
8756 const TargetRegisterClass *DstRC = nullptr;
8757 if (MCID.getNumDefs() > 0) {
8758 DstRC = getRegClass(MCID, 0);
8759 VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
8760 }
8761 for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
8762 EVT VT = N->getValueType(i);
8763 if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
8764 VTs.push_back(VT);
8765 }
8766 if (Load)
8767 BeforeOps.push_back(SDValue(Load, 0));
8768 llvm::append_range(BeforeOps, AfterOps);
8769 // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8770 switch (Opc) {
8771 default:
8772 break;
8773 case X86::CMP64ri32:
8774 case X86::CMP32ri:
8775 case X86::CMP16ri:
8776 case X86::CMP8ri:
8777 if (isNullConstant(BeforeOps[1])) {
8778 switch (Opc) {
8779 default:
8780 llvm_unreachable("Unreachable!");
8781 case X86::CMP64ri32:
8782 Opc = X86::TEST64rr;
8783 break;
8784 case X86::CMP32ri:
8785 Opc = X86::TEST32rr;
8786 break;
8787 case X86::CMP16ri:
8788 Opc = X86::TEST16rr;
8789 break;
8790 case X86::CMP8ri:
8791 Opc = X86::TEST8rr;
8792 break;
8793 }
8794 BeforeOps[1] = BeforeOps[0];
8795 }
8796 }
8797 SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
8798 NewNodes.push_back(NewNode);
8799
8800 // Emit the store instruction.
8801 if (FoldedStore) {
8802 AddrOps.pop_back();
8803 AddrOps.push_back(SDValue(NewNode, 0));
8804 AddrOps.push_back(Chain);
8805 auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8806 if (MMOs.empty() && RC == &X86::VR128RegClass &&
8807 Subtarget.isUnalignedMem16Slow())
8808 // Do not introduce a slow unaligned store.
8809 return false;
8810 // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8811 // memory access is slow above.
8812 unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8813 bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8814 SDNode *Store =
8815 DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
8816 dl, MVT::Other, AddrOps);
8817 NewNodes.push_back(Store);
8818
8819 // Preserve memory reference information.
8820 DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs);
8821 }
8822
8823 return true;
8824}
8825
8826unsigned
8828 bool UnfoldStore,
8829 unsigned *LoadRegIndex) const {
8831 if (I == nullptr)
8832 return 0;
8833 bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8834 bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8835 if (UnfoldLoad && !FoldedLoad)
8836 return 0;
8837 if (UnfoldStore && !FoldedStore)
8838 return 0;
8839 if (LoadRegIndex)
8840 *LoadRegIndex = I->Flags & TB_INDEX_MASK;
8841 return I->DstOp;
8842}
8843
8845 int64_t &Offset1,
8846 int64_t &Offset2) const {
8847 if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
8848 return false;
8849
8850 auto IsLoadOpcode = [&](unsigned Opcode) {
8851 switch (Opcode) {
8852 default:
8853 return false;
8854 case X86::MOV8rm:
8855 case X86::MOV16rm:
8856 case X86::MOV32rm:
8857 case X86::MOV64rm:
8858 case X86::LD_Fp32m:
8859 case X86::LD_Fp64m:
8860 case X86::LD_Fp80m:
8861 case X86::MOVSSrm:
8862 case X86::MOVSSrm_alt:
8863 case X86::MOVSDrm:
8864 case X86::MOVSDrm_alt:
8865 case X86::MMX_MOVD64rm:
8866 case X86::MMX_MOVQ64rm:
8867 case X86::MOVAPSrm:
8868 case X86::MOVUPSrm:
8869 case X86::MOVAPDrm:
8870 case X86::MOVUPDrm:
8871 case X86::MOVDQArm:
8872 case X86::MOVDQUrm:
8873 // AVX load instructions
8874 case X86::VMOVSSrm:
8875 case X86::VMOVSSrm_alt:
8876 case X86::VMOVSDrm:
8877 case X86::VMOVSDrm_alt:
8878 case X86::VMOVAPSrm:
8879 case X86::VMOVUPSrm:
8880 case X86::VMOVAPDrm:
8881 case X86::VMOVUPDrm:
8882 case X86::VMOVDQArm:
8883 case X86::VMOVDQUrm:
8884 case X86::VMOVAPSYrm:
8885 case X86::VMOVUPSYrm:
8886 case X86::VMOVAPDYrm:
8887 case X86::VMOVUPDYrm:
8888 case X86::VMOVDQAYrm:
8889 case X86::VMOVDQUYrm:
8890 // AVX512 load instructions
8891 case X86::VMOVSSZrm:
8892 case X86::VMOVSSZrm_alt:
8893 case X86::VMOVSDZrm:
8894 case X86::VMOVSDZrm_alt:
8895 case X86::VMOVAPSZ128rm:
8896 case X86::VMOVUPSZ128rm:
8897 case X86::VMOVAPSZ128rm_NOVLX:
8898 case X86::VMOVUPSZ128rm_NOVLX:
8899 case X86::VMOVAPDZ128rm:
8900 case X86::VMOVUPDZ128rm:
8901 case X86::VMOVDQU8Z128rm:
8902 case X86::VMOVDQU16Z128rm:
8903 case X86::VMOVDQA32Z128rm:
8904 case X86::VMOVDQU32Z128rm:
8905 case X86::VMOVDQA64Z128rm:
8906 case X86::VMOVDQU64Z128rm:
8907 case X86::VMOVAPSZ256rm:
8908 case X86::VMOVUPSZ256rm:
8909 case X86::VMOVAPSZ256rm_NOVLX:
8910 case X86::VMOVUPSZ256rm_NOVLX:
8911 case X86::VMOVAPDZ256rm:
8912 case X86::VMOVUPDZ256rm:
8913 case X86::VMOVDQU8Z256rm:
8914 case X86::VMOVDQU16Z256rm:
8915 case X86::VMOVDQA32Z256rm:
8916 case X86::VMOVDQU32Z256rm:
8917 case X86::VMOVDQA64Z256rm:
8918 case X86::VMOVDQU64Z256rm:
8919 case X86::VMOVAPSZrm:
8920 case X86::VMOVUPSZrm:
8921 case X86::VMOVAPDZrm:
8922 case X86::VMOVUPDZrm:
8923 case X86::VMOVDQU8Zrm:
8924 case X86::VMOVDQU16Zrm:
8925 case X86::VMOVDQA32Zrm:
8926 case X86::VMOVDQU32Zrm:
8927 case X86::VMOVDQA64Zrm:
8928 case X86::VMOVDQU64Zrm:
8929 case X86::KMOVBkm:
8930 case X86::KMOVBkm_EVEX:
8931 case X86::KMOVWkm:
8932 case X86::KMOVWkm_EVEX:
8933 case X86::KMOVDkm:
8934 case X86::KMOVDkm_EVEX:
8935 case X86::KMOVQkm:
8936 case X86::KMOVQkm_EVEX:
8937 return true;
8938 }
8939 };
8940
8941 if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
8942 !IsLoadOpcode(Load2->getMachineOpcode()))
8943 return false;
8944
8945 // Lambda to check if both the loads have the same value for an operand index.
8946 auto HasSameOp = [&](int I) {
8947 return Load1->getOperand(I) == Load2->getOperand(I);
8948 };
8949
8950 // All operands except the displacement should match.
8951 if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
8952 !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
8953 return false;
8954
8955 // Chain Operand must be the same.
8956 if (!HasSameOp(5))
8957 return false;
8958
8959 // Now let's examine if the displacements are constants.
8962 if (!Disp1 || !Disp2)
8963 return false;
8964
8965 Offset1 = Disp1->getSExtValue();
8966 Offset2 = Disp2->getSExtValue();
8967 return true;
8968}
8969
8971 int64_t Offset1, int64_t Offset2,
8972 unsigned NumLoads) const {
8973 assert(Offset2 > Offset1);
8974 if ((Offset2 - Offset1) / 8 > 64)
8975 return false;
8976
8977 unsigned Opc1 = Load1->getMachineOpcode();
8978 unsigned Opc2 = Load2->getMachineOpcode();
8979 if (Opc1 != Opc2)
8980 return false; // FIXME: overly conservative?
8981
8982 switch (Opc1) {
8983 default:
8984 break;
8985 case X86::LD_Fp32m:
8986 case X86::LD_Fp64m:
8987 case X86::LD_Fp80m:
8988 case X86::MMX_MOVD64rm:
8989 case X86::MMX_MOVQ64rm:
8990 return false;
8991 }
8992
8993 EVT VT = Load1->getValueType(0);
8994 switch (VT.getSimpleVT().SimpleTy) {
8995 default:
8996 // XMM registers. In 64-bit mode we can be a bit more aggressive since we
8997 // have 16 of them to play with.
8998 if (Subtarget.is64Bit()) {
8999 if (NumLoads >= 3)
9000 return false;
9001 } else if (NumLoads) {
9002 return false;
9003 }
9004 break;
9005 case MVT::i8:
9006 case MVT::i16:
9007 case MVT::i32:
9008 case MVT::i64:
9009 case MVT::f32:
9010 case MVT::f64:
9011 if (NumLoads)
9012 return false;
9013 break;
9014 }
9015
9016 return true;
9017}
9018
9020 const MachineBasicBlock *MBB,
9021 const MachineFunction &MF) const {
9022
9023 // ENDBR instructions should not be scheduled around.
9024 unsigned Opcode = MI.getOpcode();
9025 if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
9026 Opcode == X86::PLDTILECFGV)
9027 return true;
9028
9029 // Frame setup and destroy can't be scheduled around.
9030 if (MI.getFlag(MachineInstr::FrameSetup) ||
9032 return true;
9033
9035}
9036
9039 assert(Cond.size() == 1 && "Invalid X86 branch condition!");
9040 X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
9041 Cond[0].setImm(GetOppositeBranchCondition(CC));
9042 return false;
9043}
9044
9046 const TargetRegisterClass *RC) const {
9047 // FIXME: Return false for x87 stack register classes for now. We can't
9048 // allow any loads of these registers before FpGet_ST0_80.
9049 return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
9050 RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
9051 RC == &X86::RFP80RegClass);
9052}
9053
9054/// Return a virtual register initialized with the
9055/// the global base register value. Output instructions required to
9056/// initialize the register in the function entry block, if necessary.
9057///
9058/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
9059///
9062 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
9063 if (GlobalBaseReg)
9064 return GlobalBaseReg;
9065
9066 // Create the register. The code to initialize it is inserted
9067 // later, by the CGBR pass (below).
9068 MachineRegisterInfo &RegInfo = MF->getRegInfo();
9069 GlobalBaseReg = RegInfo.createVirtualRegister(
9070 Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
9071 X86FI->setGlobalBaseReg(GlobalBaseReg);
9072 return GlobalBaseReg;
9073}
9074
9075// FIXME: Some shuffle and unpack instructions have equivalents in different
9076// domains, but they require a bit more work than just switching opcodes.
9077
9078static const uint16_t *lookup(unsigned opcode, unsigned domain,
9079 ArrayRef<uint16_t[3]> Table) {
9080 for (const uint16_t(&Row)[3] : Table)
9081 if (Row[domain - 1] == opcode)
9082 return Row;
9083 return nullptr;
9084}
9085
9086static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
9087 ArrayRef<uint16_t[4]> Table) {
9088 // If this is the integer domain make sure to check both integer columns.
9089 for (const uint16_t(&Row)[4] : Table)
9090 if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
9091 return Row;
9092 return nullptr;
9093}
9094
9095// Helper to attempt to widen/narrow blend masks.
9096static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
9097 unsigned NewWidth, unsigned *pNewMask = nullptr) {
9098 assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
9099 "Illegal blend mask scale");
9100 unsigned NewMask = 0;
9101
9102 if ((OldWidth % NewWidth) == 0) {
9103 unsigned Scale = OldWidth / NewWidth;
9104 unsigned SubMask = (1u << Scale) - 1;
9105 for (unsigned i = 0; i != NewWidth; ++i) {
9106 unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
9107 if (Sub == SubMask)
9108 NewMask |= (1u << i);
9109 else if (Sub != 0x0)
9110 return false;
9111 }
9112 } else {
9113 unsigned Scale = NewWidth / OldWidth;
9114 unsigned SubMask = (1u << Scale) - 1;
9115 for (unsigned i = 0; i != OldWidth; ++i) {
9116 if (OldMask & (1 << i)) {
9117 NewMask |= (SubMask << (i * Scale));
9118 }
9119 }
9120 }
9121
9122 if (pNewMask)
9123 *pNewMask = NewMask;
9124 return true;
9125}
9126
9128 unsigned Opcode = MI.getOpcode();
9129 unsigned NumOperands = MI.getDesc().getNumOperands();
9130
9131 auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
9132 uint16_t validDomains = 0;
9133 if (MI.getOperand(NumOperands - 1).isImm()) {
9134 unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
9135 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
9136 validDomains |= 0x2; // PackedSingle
9137 if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
9138 validDomains |= 0x4; // PackedDouble
9139 if (!Is256 || Subtarget.hasAVX2())
9140 validDomains |= 0x8; // PackedInt
9141 }
9142 return validDomains;
9143 };
9144
9145 switch (Opcode) {
9146 case X86::BLENDPDrmi:
9147 case X86::BLENDPDrri:
9148 case X86::VBLENDPDrmi:
9149 case X86::VBLENDPDrri:
9150 return GetBlendDomains(2, false);
9151 case X86::VBLENDPDYrmi:
9152 case X86::VBLENDPDYrri:
9153 return GetBlendDomains(4, true);
9154 case X86::BLENDPSrmi:
9155 case X86::BLENDPSrri:
9156 case X86::VBLENDPSrmi:
9157 case X86::VBLENDPSrri:
9158 case X86::VPBLENDDrmi:
9159 case X86::VPBLENDDrri:
9160 return GetBlendDomains(4, false);
9161 case X86::VBLENDPSYrmi:
9162 case X86::VBLENDPSYrri:
9163 case X86::VPBLENDDYrmi:
9164 case X86::VPBLENDDYrri:
9165 return GetBlendDomains(8, true);
9166 case X86::PBLENDWrmi:
9167 case X86::PBLENDWrri:
9168 case X86::VPBLENDWrmi:
9169 case X86::VPBLENDWrri:
9170 // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
9171 case X86::VPBLENDWYrmi:
9172 case X86::VPBLENDWYrri:
9173 return GetBlendDomains(8, false);
9174 case X86::VPANDDZ128rr:
9175 case X86::VPANDDZ128rm:
9176 case X86::VPANDDZ256rr:
9177 case X86::VPANDDZ256rm:
9178 case X86::VPANDQZ128rr:
9179 case X86::VPANDQZ128rm:
9180 case X86::VPANDQZ256rr:
9181 case X86::VPANDQZ256rm:
9182 case X86::VPANDNDZ128rr:
9183 case X86::VPANDNDZ128rm:
9184 case X86::VPANDNDZ256rr:
9185 case X86::VPANDNDZ256rm:
9186 case X86::VPANDNQZ128rr:
9187 case X86::VPANDNQZ128rm:
9188 case X86::VPANDNQZ256rr:
9189 case X86::VPANDNQZ256rm:
9190 case X86::VPORDZ128rr:
9191 case X86::VPORDZ128rm:
9192 case X86::VPORDZ256rr:
9193 case X86::VPORDZ256rm:
9194 case X86::VPORQZ128rr:
9195 case X86::VPORQZ128rm:
9196 case X86::VPORQZ256rr:
9197 case X86::VPORQZ256rm:
9198 case X86::VPXORDZ128rr:
9199 case X86::VPXORDZ128rm:
9200 case X86::VPXORDZ256rr:
9201 case X86::VPXORDZ256rm:
9202 case X86::VPXORQZ128rr:
9203 case X86::VPXORQZ128rm:
9204 case X86::VPXORQZ256rr:
9205 case X86::VPXORQZ256rm:
9206 // If we don't have DQI see if we can still switch from an EVEX integer
9207 // instruction to a VEX floating point instruction.
9208 if (Subtarget.hasDQI())
9209 return 0;
9210
9211 if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
9212 return 0;
9213 if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
9214 return 0;
9215 // Register forms will have 3 operands. Memory form will have more.
9216 if (NumOperands == 3 &&
9217 RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
9218 return 0;
9219
9220 // All domains are valid.
9221 return 0xe;
9222 case X86::MOVHLPSrr:
9223 // We can swap domains when both inputs are the same register.
9224 // FIXME: This doesn't catch all the cases we would like. If the input
9225 // register isn't KILLed by the instruction, the two address instruction
9226 // pass puts a COPY on one input. The other input uses the original
9227 // register. This prevents the same physical register from being used by
9228 // both inputs.
9229 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9230 MI.getOperand(0).getSubReg() == 0 &&
9231 MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0)
9232 return 0x6;
9233 return 0;
9234 case X86::SHUFPDrri:
9235 return 0x6;
9236 }
9237 return 0;
9238}
9239
9240#include "X86ReplaceableInstrs.def"
9241
9243 unsigned Domain) const {
9244 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9245 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9246 assert(dom && "Not an SSE instruction");
9247
9248 unsigned Opcode = MI.getOpcode();
9249 unsigned NumOperands = MI.getDesc().getNumOperands();
9250
9251 auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
9252 if (MI.getOperand(NumOperands - 1).isImm()) {
9253 unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
9254 Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
9255 unsigned NewImm = Imm;
9256
9257 const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
9258 if (!table)
9259 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9260
9261 if (Domain == 1) { // PackedSingle
9262 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9263 } else if (Domain == 2) { // PackedDouble
9264 AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
9265 } else if (Domain == 3) { // PackedInt
9266 if (Subtarget.hasAVX2()) {
9267 // If we are already VPBLENDW use that, else use VPBLENDD.
9268 if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
9269 table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
9270 AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
9271 }
9272 } else {
9273 assert(!Is256 && "128-bit vector expected");
9274 AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
9275 }
9276 }
9277
9278 assert(table && table[Domain - 1] && "Unknown domain op");
9279 MI.setDesc(get(table[Domain - 1]));
9280 MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
9281 }
9282 return true;
9283 };
9284
9285 switch (Opcode) {
9286 case X86::BLENDPDrmi:
9287 case X86::BLENDPDrri:
9288 case X86::VBLENDPDrmi:
9289 case X86::VBLENDPDrri:
9290 return SetBlendDomain(2, false);
9291 case X86::VBLENDPDYrmi:
9292 case X86::VBLENDPDYrri:
9293 return SetBlendDomain(4, true);
9294 case X86::BLENDPSrmi:
9295 case X86::BLENDPSrri:
9296 case X86::VBLENDPSrmi:
9297 case X86::VBLENDPSrri:
9298 case X86::VPBLENDDrmi:
9299 case X86::VPBLENDDrri:
9300 return SetBlendDomain(4, false);
9301 case X86::VBLENDPSYrmi:
9302 case X86::VBLENDPSYrri:
9303 case X86::VPBLENDDYrmi:
9304 case X86::VPBLENDDYrri:
9305 return SetBlendDomain(8, true);
9306 case X86::PBLENDWrmi:
9307 case X86::PBLENDWrri:
9308 case X86::VPBLENDWrmi:
9309 case X86::VPBLENDWrri:
9310 return SetBlendDomain(8, false);
9311 case X86::VPBLENDWYrmi:
9312 case X86::VPBLENDWYrri:
9313 return SetBlendDomain(16, true);
9314 case X86::VPANDDZ128rr:
9315 case X86::VPANDDZ128rm:
9316 case X86::VPANDDZ256rr:
9317 case X86::VPANDDZ256rm:
9318 case X86::VPANDQZ128rr:
9319 case X86::VPANDQZ128rm:
9320 case X86::VPANDQZ256rr:
9321 case X86::VPANDQZ256rm:
9322 case X86::VPANDNDZ128rr:
9323 case X86::VPANDNDZ128rm:
9324 case X86::VPANDNDZ256rr:
9325 case X86::VPANDNDZ256rm:
9326 case X86::VPANDNQZ128rr:
9327 case X86::VPANDNQZ128rm:
9328 case X86::VPANDNQZ256rr:
9329 case X86::VPANDNQZ256rm:
9330 case X86::VPORDZ128rr:
9331 case X86::VPORDZ128rm:
9332 case X86::VPORDZ256rr:
9333 case X86::VPORDZ256rm:
9334 case X86::VPORQZ128rr:
9335 case X86::VPORQZ128rm:
9336 case X86::VPORQZ256rr:
9337 case X86::VPORQZ256rm:
9338 case X86::VPXORDZ128rr:
9339 case X86::VPXORDZ128rm:
9340 case X86::VPXORDZ256rr:
9341 case X86::VPXORDZ256rm:
9342 case X86::VPXORQZ128rr:
9343 case X86::VPXORQZ128rm:
9344 case X86::VPXORQZ256rr:
9345 case X86::VPXORQZ256rm: {
9346 // Without DQI, convert EVEX instructions to VEX instructions.
9347 if (Subtarget.hasDQI())
9348 return false;
9349
9350 const uint16_t *table =
9351 lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs);
9352 assert(table && "Instruction not found in table?");
9353 // Don't change integer Q instructions to D instructions and
9354 // use D intructions if we started with a PS instruction.
9355 if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9356 Domain = 4;
9357 MI.setDesc(get(table[Domain - 1]));
9358 return true;
9359 }
9360 case X86::UNPCKHPDrr:
9361 case X86::MOVHLPSrr:
9362 // We just need to commute the instruction which will switch the domains.
9363 if (Domain != dom && Domain != 3 &&
9364 MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9365 MI.getOperand(0).getSubReg() == 0 &&
9366 MI.getOperand(1).getSubReg() == 0 &&
9367 MI.getOperand(2).getSubReg() == 0) {
9368 commuteInstruction(MI, false);
9369 return true;
9370 }
9371 // We must always return true for MOVHLPSrr.
9372 if (Opcode == X86::MOVHLPSrr)
9373 return true;
9374 break;
9375 case X86::SHUFPDrri: {
9376 if (Domain == 1) {
9377 unsigned Imm = MI.getOperand(3).getImm();
9378 unsigned NewImm = 0x44;
9379 if (Imm & 1)
9380 NewImm |= 0x0a;
9381 if (Imm & 2)
9382 NewImm |= 0xa0;
9383 MI.getOperand(3).setImm(NewImm);
9384 MI.setDesc(get(X86::SHUFPSrri));
9385 }
9386 return true;
9387 }
9388 }
9389 return false;
9390}
9391
9392std::pair<uint16_t, uint16_t>
9394 uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9395 unsigned opcode = MI.getOpcode();
9396 uint16_t validDomains = 0;
9397 if (domain) {
9398 // Attempt to match for custom instructions.
9399 validDomains = getExecutionDomainCustom(MI);
9400 if (validDomains)
9401 return std::make_pair(domain, validDomains);
9402
9403 if (lookup(opcode, domain, ReplaceableInstrs)) {
9404 validDomains = 0xe;
9405 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
9406 validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
9407 } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
9408 validDomains = 0x6;
9409 } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
9410 // Insert/extract instructions should only effect domain if AVX2
9411 // is enabled.
9412 if (!Subtarget.hasAVX2())
9413 return std::make_pair(0, 0);
9414 validDomains = 0xe;
9415 } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
9416 validDomains = 0xe;
9417 } else if (Subtarget.hasDQI() &&
9418 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
9419 validDomains = 0xe;
9420 } else if (Subtarget.hasDQI()) {
9421 if (const uint16_t *table =
9422 lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) {
9423 if (domain == 1 || (domain == 3 && table[3] == opcode))
9424 validDomains = 0xa;
9425 else
9426 validDomains = 0xc;
9427 }
9428 }
9429 }
9430 return std::make_pair(domain, validDomains);
9431}
9432
9434 assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9435 uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9436 assert(dom && "Not an SSE instruction");
9437
9438 // Attempt to match for custom instructions.
9440 return;
9441
9442 const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
9443 if (!table) { // try the other table
9444 assert((Subtarget.hasAVX2() || Domain < 3) &&
9445 "256-bit vector operations only available in AVX2");
9446 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
9447 }
9448 if (!table) { // try the FP table
9449 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
9450 assert((!table || Domain < 3) &&
9451 "Can only select PackedSingle or PackedDouble");
9452 }
9453 if (!table) { // try the other table
9454 assert(Subtarget.hasAVX2() &&
9455 "256-bit insert/extract only available in AVX2");
9456 table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
9457 }
9458 if (!table) { // try the AVX512 table
9459 assert(Subtarget.hasAVX512() && "Requires AVX-512");
9460 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
9461 // Don't change integer Q instructions to D instructions.
9462 if (table && Domain == 3 && table[3] == MI.getOpcode())
9463 Domain = 4;
9464 }
9465 if (!table) { // try the AVX512DQ table
9466 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9467 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
9468 // Don't change integer Q instructions to D instructions and
9469 // use D instructions if we started with a PS instruction.
9470 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9471 Domain = 4;
9472 }
9473 if (!table) { // try the AVX512DQMasked table
9474 assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9475 table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
9476 if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9477 Domain = 4;
9478 }
9479 assert(table && "Cannot change domain");
9480 MI.setDesc(get(table[Domain - 1]));
9481}
9482
9488
9489/// Return the noop instruction to use for a noop.
9491 MCInst Nop;
9492 Nop.setOpcode(X86::NOOP);
9493 return Nop;
9494}
9495
9497 switch (opc) {
9498 default:
9499 return false;
9500 case X86::DIVPDrm:
9501 case X86::DIVPDrr:
9502 case X86::DIVPSrm:
9503 case X86::DIVPSrr:
9504 case X86::DIVSDrm:
9505 case X86::DIVSDrm_Int:
9506 case X86::DIVSDrr:
9507 case X86::DIVSDrr_Int:
9508 case X86::DIVSSrm:
9509 case X86::DIVSSrm_Int:
9510 case X86::DIVSSrr:
9511 case X86::DIVSSrr_Int:
9512 case X86::SQRTPDm:
9513 case X86::SQRTPDr:
9514 case X86::SQRTPSm:
9515 case X86::SQRTPSr:
9516 case X86::SQRTSDm:
9517 case X86::SQRTSDm_Int:
9518 case X86::SQRTSDr:
9519 case X86::SQRTSDr_Int:
9520 case X86::SQRTSSm:
9521 case X86::SQRTSSm_Int:
9522 case X86::SQRTSSr:
9523 case X86::SQRTSSr_Int:
9524 // AVX instructions with high latency
9525 case X86::VDIVPDrm:
9526 case X86::VDIVPDrr:
9527 case X86::VDIVPDYrm:
9528 case X86::VDIVPDYrr:
9529 case X86::VDIVPSrm:
9530 case X86::VDIVPSrr:
9531 case X86::VDIVPSYrm:
9532 case X86::VDIVPSYrr:
9533 case X86::VDIVSDrm:
9534 case X86::VDIVSDrm_Int:
9535 case X86::VDIVSDrr:
9536 case X86::VDIVSDrr_Int:
9537 case X86::VDIVSSrm:
9538 case X86::VDIVSSrm_Int:
9539 case X86::VDIVSSrr:
9540 case X86::VDIVSSrr_Int:
9541 case X86::VSQRTPDm:
9542 case X86::VSQRTPDr:
9543 case X86::VSQRTPDYm:
9544 case X86::VSQRTPDYr:
9545 case X86::VSQRTPSm:
9546 case X86::VSQRTPSr:
9547 case X86::VSQRTPSYm:
9548 case X86::VSQRTPSYr:
9549 case X86::VSQRTSDm:
9550 case X86::VSQRTSDm_Int:
9551 case X86::VSQRTSDr:
9552 case X86::VSQRTSDr_Int:
9553 case X86::VSQRTSSm:
9554 case X86::VSQRTSSm_Int:
9555 case X86::VSQRTSSr:
9556 case X86::VSQRTSSr_Int:
9557 // AVX512 instructions with high latency
9558 case X86::VDIVPDZ128rm:
9559 case X86::VDIVPDZ128rmb:
9560 case X86::VDIVPDZ128rmbk:
9561 case X86::VDIVPDZ128rmbkz:
9562 case X86::VDIVPDZ128rmk:
9563 case X86::VDIVPDZ128rmkz:
9564 case X86::VDIVPDZ128rr:
9565 case X86::VDIVPDZ128rrk:
9566 case X86::VDIVPDZ128rrkz:
9567 case X86::VDIVPDZ256rm:
9568 case X86::VDIVPDZ256rmb:
9569 case X86::VDIVPDZ256rmbk:
9570 case X86::VDIVPDZ256rmbkz:
9571 case X86::VDIVPDZ256rmk:
9572 case X86::VDIVPDZ256rmkz:
9573 case X86::VDIVPDZ256rr:
9574 case X86::VDIVPDZ256rrk:
9575 case X86::VDIVPDZ256rrkz:
9576 case X86::VDIVPDZrrb:
9577 case X86::VDIVPDZrrbk:
9578 case X86::VDIVPDZrrbkz:
9579 case X86::VDIVPDZrm:
9580 case X86::VDIVPDZrmb:
9581 case X86::VDIVPDZrmbk:
9582 case X86::VDIVPDZrmbkz:
9583 case X86::VDIVPDZrmk:
9584 case X86::VDIVPDZrmkz:
9585 case X86::VDIVPDZrr:
9586 case X86::VDIVPDZrrk:
9587 case X86::VDIVPDZrrkz:
9588 case X86::VDIVPSZ128rm:
9589 case X86::VDIVPSZ128rmb:
9590 case X86::VDIVPSZ128rmbk:
9591 case X86::VDIVPSZ128rmbkz:
9592 case X86::VDIVPSZ128rmk:
9593 case X86::VDIVPSZ128rmkz:
9594 case X86::VDIVPSZ128rr:
9595 case X86::VDIVPSZ128rrk:
9596 case X86::VDIVPSZ128rrkz:
9597 case X86::VDIVPSZ256rm:
9598 case X86::VDIVPSZ256rmb:
9599 case X86::VDIVPSZ256rmbk:
9600 case X86::VDIVPSZ256rmbkz:
9601 case X86::VDIVPSZ256rmk:
9602 case X86::VDIVPSZ256rmkz:
9603 case X86::VDIVPSZ256rr:
9604 case X86::VDIVPSZ256rrk:
9605 case X86::VDIVPSZ256rrkz:
9606 case X86::VDIVPSZrrb:
9607 case X86::VDIVPSZrrbk:
9608 case X86::VDIVPSZrrbkz:
9609 case X86::VDIVPSZrm:
9610 case X86::VDIVPSZrmb:
9611 case X86::VDIVPSZrmbk:
9612 case X86::VDIVPSZrmbkz:
9613 case X86::VDIVPSZrmk:
9614 case X86::VDIVPSZrmkz:
9615 case X86::VDIVPSZrr:
9616 case X86::VDIVPSZrrk:
9617 case X86::VDIVPSZrrkz:
9618 case X86::VDIVSDZrm:
9619 case X86::VDIVSDZrr:
9620 case X86::VDIVSDZrm_Int:
9621 case X86::VDIVSDZrmk_Int:
9622 case X86::VDIVSDZrmkz_Int:
9623 case X86::VDIVSDZrr_Int:
9624 case X86::VDIVSDZrrk_Int:
9625 case X86::VDIVSDZrrkz_Int:
9626 case X86::VDIVSDZrrb_Int:
9627 case X86::VDIVSDZrrbk_Int:
9628 case X86::VDIVSDZrrbkz_Int:
9629 case X86::VDIVSSZrm:
9630 case X86::VDIVSSZrr:
9631 case X86::VDIVSSZrm_Int:
9632 case X86::VDIVSSZrmk_Int:
9633 case X86::VDIVSSZrmkz_Int:
9634 case X86::VDIVSSZrr_Int:
9635 case X86::VDIVSSZrrk_Int:
9636 case X86::VDIVSSZrrkz_Int:
9637 case X86::VDIVSSZrrb_Int:
9638 case X86::VDIVSSZrrbk_Int:
9639 case X86::VDIVSSZrrbkz_Int:
9640 case X86::VSQRTPDZ128m:
9641 case X86::VSQRTPDZ128mb:
9642 case X86::VSQRTPDZ128mbk:
9643 case X86::VSQRTPDZ128mbkz:
9644 case X86::VSQRTPDZ128mk:
9645 case X86::VSQRTPDZ128mkz:
9646 case X86::VSQRTPDZ128r:
9647 case X86::VSQRTPDZ128rk:
9648 case X86::VSQRTPDZ128rkz:
9649 case X86::VSQRTPDZ256m:
9650 case X86::VSQRTPDZ256mb:
9651 case X86::VSQRTPDZ256mbk:
9652 case X86::VSQRTPDZ256mbkz:
9653 case X86::VSQRTPDZ256mk:
9654 case X86::VSQRTPDZ256mkz:
9655 case X86::VSQRTPDZ256r:
9656 case X86::VSQRTPDZ256rk:
9657 case X86::VSQRTPDZ256rkz:
9658 case X86::VSQRTPDZm:
9659 case X86::VSQRTPDZmb:
9660 case X86::VSQRTPDZmbk:
9661 case X86::VSQRTPDZmbkz:
9662 case X86::VSQRTPDZmk:
9663 case X86::VSQRTPDZmkz:
9664 case X86::VSQRTPDZr:
9665 case X86::VSQRTPDZrb:
9666 case X86::VSQRTPDZrbk:
9667 case X86::VSQRTPDZrbkz:
9668 case X86::VSQRTPDZrk:
9669 case X86::VSQRTPDZrkz:
9670 case X86::VSQRTPSZ128m:
9671 case X86::VSQRTPSZ128mb:
9672 case X86::VSQRTPSZ128mbk:
9673 case X86::VSQRTPSZ128mbkz:
9674 case X86::VSQRTPSZ128mk:
9675 case X86::VSQRTPSZ128mkz:
9676 case X86::VSQRTPSZ128r:
9677 case X86::VSQRTPSZ128rk:
9678 case X86::VSQRTPSZ128rkz:
9679 case X86::VSQRTPSZ256m:
9680 case X86::VSQRTPSZ256mb:
9681 case X86::VSQRTPSZ256mbk:
9682 case X86::VSQRTPSZ256mbkz:
9683 case X86::VSQRTPSZ256mk:
9684 case X86::VSQRTPSZ256mkz:
9685 case X86::VSQRTPSZ256r:
9686 case X86::VSQRTPSZ256rk:
9687 case X86::VSQRTPSZ256rkz:
9688 case X86::VSQRTPSZm:
9689 case X86::VSQRTPSZmb:
9690 case X86::VSQRTPSZmbk:
9691 case X86::VSQRTPSZmbkz:
9692 case X86::VSQRTPSZmk:
9693 case X86::VSQRTPSZmkz:
9694 case X86::VSQRTPSZr:
9695 case X86::VSQRTPSZrb:
9696 case X86::VSQRTPSZrbk:
9697 case X86::VSQRTPSZrbkz:
9698 case X86::VSQRTPSZrk:
9699 case X86::VSQRTPSZrkz:
9700 case X86::VSQRTSDZm:
9701 case X86::VSQRTSDZm_Int:
9702 case X86::VSQRTSDZmk_Int:
9703 case X86::VSQRTSDZmkz_Int:
9704 case X86::VSQRTSDZr:
9705 case X86::VSQRTSDZr_Int:
9706 case X86::VSQRTSDZrk_Int:
9707 case X86::VSQRTSDZrkz_Int:
9708 case X86::VSQRTSDZrb_Int:
9709 case X86::VSQRTSDZrbk_Int:
9710 case X86::VSQRTSDZrbkz_Int:
9711 case X86::VSQRTSSZm:
9712 case X86::VSQRTSSZm_Int:
9713 case X86::VSQRTSSZmk_Int:
9714 case X86::VSQRTSSZmkz_Int:
9715 case X86::VSQRTSSZr:
9716 case X86::VSQRTSSZr_Int:
9717 case X86::VSQRTSSZrk_Int:
9718 case X86::VSQRTSSZrkz_Int:
9719 case X86::VSQRTSSZrb_Int:
9720 case X86::VSQRTSSZrbk_Int:
9721 case X86::VSQRTSSZrbkz_Int:
9722
9723 case X86::VGATHERDPDYrm:
9724 case X86::VGATHERDPDZ128rm:
9725 case X86::VGATHERDPDZ256rm:
9726 case X86::VGATHERDPDZrm:
9727 case X86::VGATHERDPDrm:
9728 case X86::VGATHERDPSYrm:
9729 case X86::VGATHERDPSZ128rm:
9730 case X86::VGATHERDPSZ256rm:
9731 case X86::VGATHERDPSZrm:
9732 case X86::VGATHERDPSrm:
9733 case X86::VGATHERPF0DPDm:
9734 case X86::VGATHERPF0DPSm:
9735 case X86::VGATHERPF0QPDm:
9736 case X86::VGATHERPF0QPSm:
9737 case X86::VGATHERPF1DPDm:
9738 case X86::VGATHERPF1DPSm:
9739 case X86::VGATHERPF1QPDm:
9740 case X86::VGATHERPF1QPSm:
9741 case X86::VGATHERQPDYrm:
9742 case X86::VGATHERQPDZ128rm:
9743 case X86::VGATHERQPDZ256rm:
9744 case X86::VGATHERQPDZrm:
9745 case X86::VGATHERQPDrm:
9746 case X86::VGATHERQPSYrm:
9747 case X86::VGATHERQPSZ128rm:
9748 case X86::VGATHERQPSZ256rm:
9749 case X86::VGATHERQPSZrm:
9750 case X86::VGATHERQPSrm:
9751 case X86::VPGATHERDDYrm:
9752 case X86::VPGATHERDDZ128rm:
9753 case X86::VPGATHERDDZ256rm:
9754 case X86::VPGATHERDDZrm:
9755 case X86::VPGATHERDDrm:
9756 case X86::VPGATHERDQYrm:
9757 case X86::VPGATHERDQZ128rm:
9758 case X86::VPGATHERDQZ256rm:
9759 case X86::VPGATHERDQZrm:
9760 case X86::VPGATHERDQrm:
9761 case X86::VPGATHERQDYrm:
9762 case X86::VPGATHERQDZ128rm:
9763 case X86::VPGATHERQDZ256rm:
9764 case X86::VPGATHERQDZrm:
9765 case X86::VPGATHERQDrm:
9766 case X86::VPGATHERQQYrm:
9767 case X86::VPGATHERQQZ128rm:
9768 case X86::VPGATHERQQZ256rm:
9769 case X86::VPGATHERQQZrm:
9770 case X86::VPGATHERQQrm:
9771 case X86::VSCATTERDPDZ128mr:
9772 case X86::VSCATTERDPDZ256mr:
9773 case X86::VSCATTERDPDZmr:
9774 case X86::VSCATTERDPSZ128mr:
9775 case X86::VSCATTERDPSZ256mr:
9776 case X86::VSCATTERDPSZmr:
9777 case X86::VSCATTERPF0DPDm:
9778 case X86::VSCATTERPF0DPSm:
9779 case X86::VSCATTERPF0QPDm:
9780 case X86::VSCATTERPF0QPSm:
9781 case X86::VSCATTERPF1DPDm:
9782 case X86::VSCATTERPF1DPSm:
9783 case X86::VSCATTERPF1QPDm:
9784 case X86::VSCATTERPF1QPSm:
9785 case X86::VSCATTERQPDZ128mr:
9786 case X86::VSCATTERQPDZ256mr:
9787 case X86::VSCATTERQPDZmr:
9788 case X86::VSCATTERQPSZ128mr:
9789 case X86::VSCATTERQPSZ256mr:
9790 case X86::VSCATTERQPSZmr:
9791 case X86::VPSCATTERDDZ128mr:
9792 case X86::VPSCATTERDDZ256mr:
9793 case X86::VPSCATTERDDZmr:
9794 case X86::VPSCATTERDQZ128mr:
9795 case X86::VPSCATTERDQZ256mr:
9796 case X86::VPSCATTERDQZmr:
9797 case X86::VPSCATTERQDZ128mr:
9798 case X86::VPSCATTERQDZ256mr:
9799 case X86::VPSCATTERQDZmr:
9800 case X86::VPSCATTERQQZ128mr:
9801 case X86::VPSCATTERQQZ256mr:
9802 case X86::VPSCATTERQQZmr:
9803 return true;
9804 }
9805}
9806
9808 const MachineRegisterInfo *MRI,
9809 const MachineInstr &DefMI,
9810 unsigned DefIdx,
9811 const MachineInstr &UseMI,
9812 unsigned UseIdx) const {
9813 return isHighLatencyDef(DefMI.getOpcode());
9814}
9815
9817 const MachineBasicBlock *MBB) const {
9818 assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
9819 Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
9820
9821 // Integer binary math/logic instructions have a third source operand:
9822 // the EFLAGS register. That operand must be both defined here and never
9823 // used; ie, it must be dead. If the EFLAGS operand is live, then we can
9824 // not change anything because rearranging the operands could affect other
9825 // instructions that depend on the exact status flags (zero, sign, etc.)
9826 // that are set by using these particular operands with this operation.
9827 const MachineOperand *FlagDef =
9828 Inst.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
9829 assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?");
9830 if (FlagDef && !FlagDef->isDead())
9831 return false;
9832
9834}
9835
9836// TODO: There are many more machine instruction opcodes to match:
9837// 1. Other data types (integer, vectors)
9838// 2. Other math / logic operations (xor, or)
9839// 3. Other forms of the same operation (intrinsics and other variants)
9841 bool Invert) const {
9842 if (Invert)
9843 return false;
9844 switch (Inst.getOpcode()) {
9845 CASE_ND(ADD8rr)
9846 CASE_ND(ADD16rr)
9847 CASE_ND(ADD32rr)
9848 CASE_ND(ADD64rr)
9849 CASE_ND(AND8rr)
9850 CASE_ND(AND16rr)
9851 CASE_ND(AND32rr)
9852 CASE_ND(AND64rr)
9853 CASE_ND(OR8rr)
9854 CASE_ND(OR16rr)
9855 CASE_ND(OR32rr)
9856 CASE_ND(OR64rr)
9857 CASE_ND(XOR8rr)
9858 CASE_ND(XOR16rr)
9859 CASE_ND(XOR32rr)
9860 CASE_ND(XOR64rr)
9861 CASE_ND(IMUL16rr)
9862 CASE_ND(IMUL32rr)
9863 CASE_ND(IMUL64rr)
9864 case X86::PANDrr:
9865 case X86::PORrr:
9866 case X86::PXORrr:
9867 case X86::ANDPDrr:
9868 case X86::ANDPSrr:
9869 case X86::ORPDrr:
9870 case X86::ORPSrr:
9871 case X86::XORPDrr:
9872 case X86::XORPSrr:
9873 case X86::PADDBrr:
9874 case X86::PADDWrr:
9875 case X86::PADDDrr:
9876 case X86::PADDQrr:
9877 case X86::PMULLWrr:
9878 case X86::PMULLDrr:
9879 case X86::PMAXSBrr:
9880 case X86::PMAXSDrr:
9881 case X86::PMAXSWrr:
9882 case X86::PMAXUBrr:
9883 case X86::PMAXUDrr:
9884 case X86::PMAXUWrr:
9885 case X86::PMINSBrr:
9886 case X86::PMINSDrr:
9887 case X86::PMINSWrr:
9888 case X86::PMINUBrr:
9889 case X86::PMINUDrr:
9890 case X86::PMINUWrr:
9891 case X86::VPANDrr:
9892 case X86::VPANDYrr:
9893 case X86::VPANDDZ128rr:
9894 case X86::VPANDDZ256rr:
9895 case X86::VPANDDZrr:
9896 case X86::VPANDQZ128rr:
9897 case X86::VPANDQZ256rr:
9898 case X86::VPANDQZrr:
9899 case X86::VPORrr:
9900 case X86::VPORYrr:
9901 case X86::VPORDZ128rr:
9902 case X86::VPORDZ256rr:
9903 case X86::VPORDZrr:
9904 case X86::VPORQZ128rr:
9905 case X86::VPORQZ256rr:
9906 case X86::VPORQZrr:
9907 case X86::VPXORrr:
9908 case X86::VPXORYrr:
9909 case X86::VPXORDZ128rr:
9910 case X86::VPXORDZ256rr:
9911 case X86::VPXORDZrr:
9912 case X86::VPXORQZ128rr:
9913 case X86::VPXORQZ256rr:
9914 case X86::VPXORQZrr:
9915 case X86::VANDPDrr:
9916 case X86::VANDPSrr:
9917 case X86::VANDPDYrr:
9918 case X86::VANDPSYrr:
9919 case X86::VANDPDZ128rr:
9920 case X86::VANDPSZ128rr:
9921 case X86::VANDPDZ256rr:
9922 case X86::VANDPSZ256rr:
9923 case X86::VANDPDZrr:
9924 case X86::VANDPSZrr:
9925 case X86::VORPDrr:
9926 case X86::VORPSrr:
9927 case X86::VORPDYrr:
9928 case X86::VORPSYrr:
9929 case X86::VORPDZ128rr:
9930 case X86::VORPSZ128rr:
9931 case X86::VORPDZ256rr:
9932 case X86::VORPSZ256rr:
9933 case X86::VORPDZrr:
9934 case X86::VORPSZrr:
9935 case X86::VXORPDrr:
9936 case X86::VXORPSrr:
9937 case X86::VXORPDYrr:
9938 case X86::VXORPSYrr:
9939 case X86::VXORPDZ128rr:
9940 case X86::VXORPSZ128rr:
9941 case X86::VXORPDZ256rr:
9942 case X86::VXORPSZ256rr:
9943 case X86::VXORPDZrr:
9944 case X86::VXORPSZrr:
9945 case X86::KADDBkk:
9946 case X86::KADDWkk:
9947 case X86::KADDDkk:
9948 case X86::KADDQkk:
9949 case X86::KANDBkk:
9950 case X86::KANDWkk:
9951 case X86::KANDDkk:
9952 case X86::KANDQkk:
9953 case X86::KORBkk:
9954 case X86::KORWkk:
9955 case X86::KORDkk:
9956 case X86::KORQkk:
9957 case X86::KXORBkk:
9958 case X86::KXORWkk:
9959 case X86::KXORDkk:
9960 case X86::KXORQkk:
9961 case X86::VPADDBrr:
9962 case X86::VPADDWrr:
9963 case X86::VPADDDrr:
9964 case X86::VPADDQrr:
9965 case X86::VPADDBYrr:
9966 case X86::VPADDWYrr:
9967 case X86::VPADDDYrr:
9968 case X86::VPADDQYrr:
9969 case X86::VPADDBZ128rr:
9970 case X86::VPADDWZ128rr:
9971 case X86::VPADDDZ128rr:
9972 case X86::VPADDQZ128rr:
9973 case X86::VPADDBZ256rr:
9974 case X86::VPADDWZ256rr:
9975 case X86::VPADDDZ256rr:
9976 case X86::VPADDQZ256rr:
9977 case X86::VPADDBZrr:
9978 case X86::VPADDWZrr:
9979 case X86::VPADDDZrr:
9980 case X86::VPADDQZrr:
9981 case X86::VPMULLWrr:
9982 case X86::VPMULLWYrr:
9983 case X86::VPMULLWZ128rr:
9984 case X86::VPMULLWZ256rr:
9985 case X86::VPMULLWZrr:
9986 case X86::VPMULLDrr:
9987 case X86::VPMULLDYrr:
9988 case X86::VPMULLDZ128rr:
9989 case X86::VPMULLDZ256rr:
9990 case X86::VPMULLDZrr:
9991 case X86::VPMULLQZ128rr:
9992 case X86::VPMULLQZ256rr:
9993 case X86::VPMULLQZrr:
9994 case X86::VPMAXSBrr:
9995 case X86::VPMAXSBYrr:
9996 case X86::VPMAXSBZ128rr:
9997 case X86::VPMAXSBZ256rr:
9998 case X86::VPMAXSBZrr:
9999 case X86::VPMAXSDrr:
10000 case X86::VPMAXSDYrr:
10001 case X86::VPMAXSDZ128rr:
10002 case X86::VPMAXSDZ256rr:
10003 case X86::VPMAXSDZrr:
10004 case X86::VPMAXSQZ128rr:
10005 case X86::VPMAXSQZ256rr:
10006 case X86::VPMAXSQZrr:
10007 case X86::VPMAXSWrr:
10008 case X86::VPMAXSWYrr:
10009 case X86::VPMAXSWZ128rr:
10010 case X86::VPMAXSWZ256rr:
10011 case X86::VPMAXSWZrr:
10012 case X86::VPMAXUBrr:
10013 case X86::VPMAXUBYrr:
10014 case X86::VPMAXUBZ128rr:
10015 case X86::VPMAXUBZ256rr:
10016 case X86::VPMAXUBZrr:
10017 case X86::VPMAXUDrr:
10018 case X86::VPMAXUDYrr:
10019 case X86::VPMAXUDZ128rr:
10020 case X86::VPMAXUDZ256rr:
10021 case X86::VPMAXUDZrr:
10022 case X86::VPMAXUQZ128rr:
10023 case X86::VPMAXUQZ256rr:
10024 case X86::VPMAXUQZrr:
10025 case X86::VPMAXUWrr:
10026 case X86::VPMAXUWYrr:
10027 case X86::VPMAXUWZ128rr:
10028 case X86::VPMAXUWZ256rr:
10029 case X86::VPMAXUWZrr:
10030 case X86::VPMINSBrr:
10031 case X86::VPMINSBYrr:
10032 case X86::VPMINSBZ128rr:
10033 case X86::VPMINSBZ256rr:
10034 case X86::VPMINSBZrr:
10035 case X86::VPMINSDrr:
10036 case X86::VPMINSDYrr:
10037 case X86::VPMINSDZ128rr:
10038 case X86::VPMINSDZ256rr:
10039 case X86::VPMINSDZrr:
10040 case X86::VPMINSQZ128rr:
10041 case X86::VPMINSQZ256rr:
10042 case X86::VPMINSQZrr:
10043 case X86::VPMINSWrr:
10044 case X86::VPMINSWYrr:
10045 case X86::VPMINSWZ128rr:
10046 case X86::VPMINSWZ256rr:
10047 case X86::VPMINSWZrr:
10048 case X86::VPMINUBrr:
10049 case X86::VPMINUBYrr:
10050 case X86::VPMINUBZ128rr:
10051 case X86::VPMINUBZ256rr:
10052 case X86::VPMINUBZrr:
10053 case X86::VPMINUDrr:
10054 case X86::VPMINUDYrr:
10055 case X86::VPMINUDZ128rr:
10056 case X86::VPMINUDZ256rr:
10057 case X86::VPMINUDZrr:
10058 case X86::VPMINUQZ128rr:
10059 case X86::VPMINUQZ256rr:
10060 case X86::VPMINUQZrr:
10061 case X86::VPMINUWrr:
10062 case X86::VPMINUWYrr:
10063 case X86::VPMINUWZ128rr:
10064 case X86::VPMINUWZ256rr:
10065 case X86::VPMINUWZrr:
10066 // Normal min/max instructions are not commutative because of NaN and signed
10067 // zero semantics, but these are. Thus, there's no need to check for global
10068 // relaxed math; the instructions themselves have the properties we need.
10069 case X86::MAXCPDrr:
10070 case X86::MAXCPSrr:
10071 case X86::MAXCSDrr:
10072 case X86::MAXCSSrr:
10073 case X86::MINCPDrr:
10074 case X86::MINCPSrr:
10075 case X86::MINCSDrr:
10076 case X86::MINCSSrr:
10077 case X86::VMAXCPDrr:
10078 case X86::VMAXCPSrr:
10079 case X86::VMAXCPDYrr:
10080 case X86::VMAXCPSYrr:
10081 case X86::VMAXCPDZ128rr:
10082 case X86::VMAXCPSZ128rr:
10083 case X86::VMAXCPDZ256rr:
10084 case X86::VMAXCPSZ256rr:
10085 case X86::VMAXCPDZrr:
10086 case X86::VMAXCPSZrr:
10087 case X86::VMAXCSDrr:
10088 case X86::VMAXCSSrr:
10089 case X86::VMAXCSDZrr:
10090 case X86::VMAXCSSZrr:
10091 case X86::VMINCPDrr:
10092 case X86::VMINCPSrr:
10093 case X86::VMINCPDYrr:
10094 case X86::VMINCPSYrr:
10095 case X86::VMINCPDZ128rr:
10096 case X86::VMINCPSZ128rr:
10097 case X86::VMINCPDZ256rr:
10098 case X86::VMINCPSZ256rr:
10099 case X86::VMINCPDZrr:
10100 case X86::VMINCPSZrr:
10101 case X86::VMINCSDrr:
10102 case X86::VMINCSSrr:
10103 case X86::VMINCSDZrr:
10104 case X86::VMINCSSZrr:
10105 case X86::VMAXCPHZ128rr:
10106 case X86::VMAXCPHZ256rr:
10107 case X86::VMAXCPHZrr:
10108 case X86::VMAXCSHZrr:
10109 case X86::VMINCPHZ128rr:
10110 case X86::VMINCPHZ256rr:
10111 case X86::VMINCPHZrr:
10112 case X86::VMINCSHZrr:
10113 return true;
10114 case X86::ADDPDrr:
10115 case X86::ADDPSrr:
10116 case X86::ADDSDrr:
10117 case X86::ADDSSrr:
10118 case X86::MULPDrr:
10119 case X86::MULPSrr:
10120 case X86::MULSDrr:
10121 case X86::MULSSrr:
10122 case X86::VADDPDrr:
10123 case X86::VADDPSrr:
10124 case X86::VADDPDYrr:
10125 case X86::VADDPSYrr:
10126 case X86::VADDPDZ128rr:
10127 case X86::VADDPSZ128rr:
10128 case X86::VADDPDZ256rr:
10129 case X86::VADDPSZ256rr:
10130 case X86::VADDPDZrr:
10131 case X86::VADDPSZrr:
10132 case X86::VADDSDrr:
10133 case X86::VADDSSrr:
10134 case X86::VADDSDZrr:
10135 case X86::VADDSSZrr:
10136 case X86::VMULPDrr:
10137 case X86::VMULPSrr:
10138 case X86::VMULPDYrr:
10139 case X86::VMULPSYrr:
10140 case X86::VMULPDZ128rr:
10141 case X86::VMULPSZ128rr:
10142 case X86::VMULPDZ256rr:
10143 case X86::VMULPSZ256rr:
10144 case X86::VMULPDZrr:
10145 case X86::VMULPSZrr:
10146 case X86::VMULSDrr:
10147 case X86::VMULSSrr:
10148 case X86::VMULSDZrr:
10149 case X86::VMULSSZrr:
10150 case X86::VADDPHZ128rr:
10151 case X86::VADDPHZ256rr:
10152 case X86::VADDPHZrr:
10153 case X86::VADDSHZrr:
10154 case X86::VMULPHZ128rr:
10155 case X86::VMULPHZ256rr:
10156 case X86::VMULPHZrr:
10157 case X86::VMULSHZrr:
10160 default:
10161 return false;
10162 }
10163}
10164
10165/// If \p DescribedReg overlaps with the MOVrr instruction's destination
10166/// register then, if possible, describe the value in terms of the source
10167/// register.
10168static std::optional<ParamLoadedValue>
10170 const TargetRegisterInfo *TRI) {
10171 Register DestReg = MI.getOperand(0).getReg();
10172 Register SrcReg = MI.getOperand(1).getReg();
10173
10174 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10175
10176 // If the described register is the destination, just return the source.
10177 if (DestReg == DescribedReg)
10178 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10179
10180 // If the described register is a sub-register of the destination register,
10181 // then pick out the source register's corresponding sub-register.
10182 if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
10183 Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
10184 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
10185 }
10186
10187 // The remaining case to consider is when the described register is a
10188 // super-register of the destination register. MOV8rr and MOV16rr does not
10189 // write to any of the other bytes in the register, meaning that we'd have to
10190 // describe the value using a combination of the source register and the
10191 // non-overlapping bits in the described register, which is not currently
10192 // possible.
10193 if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
10194 !TRI->isSuperRegister(DestReg, DescribedReg))
10195 return std::nullopt;
10196
10197 assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
10198 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
10199}
10200
10201std::optional<ParamLoadedValue>
10203 const MachineOperand *Op = nullptr;
10204 DIExpression *Expr = nullptr;
10205
10207
10208 switch (MI.getOpcode()) {
10209 case X86::LEA32r:
10210 case X86::LEA64r:
10211 case X86::LEA64_32r: {
10212 // We may need to describe a 64-bit parameter with a 32-bit LEA.
10213 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10214 return std::nullopt;
10215
10216 // Operand 4 could be global address. For now we do not support
10217 // such situation.
10218 if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
10219 return std::nullopt;
10220
10221 const MachineOperand &Op1 = MI.getOperand(1);
10222 const MachineOperand &Op2 = MI.getOperand(3);
10223 assert(Op2.isReg() &&
10224 (Op2.getReg() == X86::NoRegister || Op2.getReg().isPhysical()));
10225
10226 // Omit situations like:
10227 // %rsi = lea %rsi, 4, ...
10228 if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
10229 Op2.getReg() == MI.getOperand(0).getReg())
10230 return std::nullopt;
10231 else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
10232 TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
10233 (Op2.getReg() != X86::NoRegister &&
10234 TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
10235 return std::nullopt;
10236
10237 int64_t Coef = MI.getOperand(2).getImm();
10238 int64_t Offset = MI.getOperand(4).getImm();
10240
10241 if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
10242 Op = &Op1;
10243 } else if (Op1.isFI())
10244 Op = &Op1;
10245
10246 if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
10247 Ops.push_back(dwarf::DW_OP_constu);
10248 Ops.push_back(Coef + 1);
10249 Ops.push_back(dwarf::DW_OP_mul);
10250 } else {
10251 if (Op && Op2.getReg() != X86::NoRegister) {
10252 int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
10253 if (dwarfReg < 0)
10254 return std::nullopt;
10255 else if (dwarfReg < 32) {
10256 Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
10257 Ops.push_back(0);
10258 } else {
10259 Ops.push_back(dwarf::DW_OP_bregx);
10260 Ops.push_back(dwarfReg);
10261 Ops.push_back(0);
10262 }
10263 } else if (!Op) {
10264 assert(Op2.getReg() != X86::NoRegister);
10265 Op = &Op2;
10266 }
10267
10268 if (Coef > 1) {
10269 assert(Op2.getReg() != X86::NoRegister);
10270 Ops.push_back(dwarf::DW_OP_constu);
10271 Ops.push_back(Coef);
10272 Ops.push_back(dwarf::DW_OP_mul);
10273 }
10274
10275 if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
10276 Op2.getReg() != X86::NoRegister) {
10277 Ops.push_back(dwarf::DW_OP_plus);
10278 }
10279 }
10280
10282 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
10283
10284 return ParamLoadedValue(*Op, Expr);
10285 }
10286 case X86::MOV8ri:
10287 case X86::MOV16ri:
10288 // TODO: Handle MOV8ri and MOV16ri.
10289 return std::nullopt;
10290 case X86::MOV32ri:
10291 case X86::MOV64ri:
10292 case X86::MOV64ri32:
10293 // MOV32ri may be used for producing zero-extended 32-bit immediates in
10294 // 64-bit parameters, so we need to consider super-registers.
10295 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10296 return std::nullopt;
10297 return ParamLoadedValue(MI.getOperand(1), Expr);
10298 case X86::MOV8rr:
10299 case X86::MOV16rr:
10300 case X86::MOV32rr:
10301 case X86::MOV64rr:
10302 return describeMOVrrLoadedValue(MI, Reg, TRI);
10303 case X86::XOR32rr: {
10304 // 64-bit parameters are zero-materialized using XOR32rr, so also consider
10305 // super-registers.
10306 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10307 return std::nullopt;
10308 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
10310 return std::nullopt;
10311 }
10312 case X86::MOVSX64rr32: {
10313 // We may need to describe the lower 32 bits of the MOVSX; for example, in
10314 // cases like this:
10315 //
10316 // $ebx = [...]
10317 // $rdi = MOVSX64rr32 $ebx
10318 // $esi = MOV32rr $edi
10319 if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
10320 return std::nullopt;
10321
10322 Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10323
10324 // If the described register is the destination register we need to
10325 // sign-extend the source register from 32 bits. The other case we handle
10326 // is when the described register is the 32-bit sub-register of the
10327 // destination register, in case we just need to return the source
10328 // register.
10329 if (Reg == MI.getOperand(0).getReg())
10330 Expr = DIExpression::appendExt(Expr, 32, 64, true);
10331 else
10332 assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) &&
10333 "Unhandled sub-register case for MOVSX64rr32");
10334
10335 return ParamLoadedValue(MI.getOperand(1), Expr);
10336 }
10337 default:
10338 assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
10340 }
10341}
10342
10343/// This is an architecture-specific helper function of reassociateOps.
10344/// Set special operand attributes for new instructions after reassociation.
10346 MachineInstr &OldMI2,
10347 MachineInstr &NewMI1,
10348 MachineInstr &NewMI2) const {
10349 // Integer instructions may define an implicit EFLAGS dest register operand.
10350 MachineOperand *OldFlagDef1 =
10351 OldMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10352 MachineOperand *OldFlagDef2 =
10353 OldMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10354
10355 assert(!OldFlagDef1 == !OldFlagDef2 &&
10356 "Unexpected instruction type for reassociation");
10357
10358 if (!OldFlagDef1 || !OldFlagDef2)
10359 return;
10360
10361 assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
10362 "Must have dead EFLAGS operand in reassociable instruction");
10363
10364 MachineOperand *NewFlagDef1 =
10365 NewMI1.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10366 MachineOperand *NewFlagDef2 =
10367 NewMI2.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
10368
10369 assert(NewFlagDef1 && NewFlagDef2 &&
10370 "Unexpected operand in reassociable instruction");
10371
10372 // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
10373 // of this pass or other passes. The EFLAGS operands must be dead in these new
10374 // instructions because the EFLAGS operands in the original instructions must
10375 // be dead in order for reassociation to occur.
10376 NewFlagDef1->setIsDead();
10377 NewFlagDef2->setIsDead();
10378}
10379
10380std::pair<unsigned, unsigned>
10382 return std::make_pair(TF, 0u);
10383}
10384
10387 using namespace X86II;
10388 static const std::pair<unsigned, const char *> TargetFlags[] = {
10389 {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
10390 {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
10391 {MO_GOT, "x86-got"},
10392 {MO_GOTOFF, "x86-gotoff"},
10393 {MO_GOTPCREL, "x86-gotpcrel"},
10394 {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"},
10395 {MO_PLT, "x86-plt"},
10396 {MO_TLSGD, "x86-tlsgd"},
10397 {MO_TLSLD, "x86-tlsld"},
10398 {MO_TLSLDM, "x86-tlsldm"},
10399 {MO_GOTTPOFF, "x86-gottpoff"},
10400 {MO_INDNTPOFF, "x86-indntpoff"},
10401 {MO_TPOFF, "x86-tpoff"},
10402 {MO_DTPOFF, "x86-dtpoff"},
10403 {MO_NTPOFF, "x86-ntpoff"},
10404 {MO_GOTNTPOFF, "x86-gotntpoff"},
10405 {MO_DLLIMPORT, "x86-dllimport"},
10406 {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
10407 {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
10408 {MO_TLVP, "x86-tlvp"},
10409 {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
10410 {MO_SECREL, "x86-secrel"},
10411 {MO_COFFSTUB, "x86-coffstub"}};
10412 return ArrayRef(TargetFlags);
10413}
10414
10415namespace {
10416/// Create Global Base Reg pass. This initializes the PIC
10417/// global base register for x86-32.
10418struct CGBR : public MachineFunctionPass {
10419 static char ID;
10420 CGBR() : MachineFunctionPass(ID) {}
10421
10422 bool runOnMachineFunction(MachineFunction &MF) override {
10423 const X86TargetMachine *TM =
10424 static_cast<const X86TargetMachine *>(&MF.getTarget());
10425 const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
10426
10427 // Only emit a global base reg in PIC mode.
10428 if (!TM->isPositionIndependent())
10429 return false;
10430
10432 Register GlobalBaseReg = X86FI->getGlobalBaseReg();
10433
10434 // If we didn't need a GlobalBaseReg, don't insert code.
10435 if (GlobalBaseReg == 0)
10436 return false;
10437
10438 // Insert the set of GlobalBaseReg into the first MBB of the function
10439 MachineBasicBlock &FirstMBB = MF.front();
10441 DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
10443 const X86InstrInfo *TII = STI.getInstrInfo();
10444
10445 Register PC;
10446 if (STI.isPICStyleGOT())
10447 PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
10448 else
10449 PC = GlobalBaseReg;
10450
10451 if (STI.is64Bit()) {
10452 if (TM->getCodeModel() == CodeModel::Large) {
10453 // In the large code model, we are aiming for this code, though the
10454 // register allocation may vary:
10455 // leaq .LN$pb(%rip), %rax
10456 // movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
10457 // addq %rcx, %rax
10458 // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
10459 Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10460 Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10461 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
10462 .addReg(X86::RIP)
10463 .addImm(0)
10464 .addReg(0)
10466 .addReg(0);
10467 std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
10468 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
10469 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10471 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
10472 .addReg(PBReg, RegState::Kill)
10473 .addReg(GOTReg, RegState::Kill);
10474 } else {
10475 // In other code models, use a RIP-relative LEA to materialize the
10476 // GOT.
10477 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
10478 .addReg(X86::RIP)
10479 .addImm(0)
10480 .addReg(0)
10481 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
10482 .addReg(0);
10483 }
10484 } else {
10485 // Operand of MovePCtoStack is completely ignored by asm printer. It's
10486 // only used in JIT code emission as displacement to pc.
10487 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
10488
10489 // If we're using vanilla 'GOT' PIC style, we should use relative
10490 // addressing not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
10491 if (STI.isPICStyleGOT()) {
10492 // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel],
10493 // %some_register
10494 BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
10495 .addReg(PC)
10496 .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10498 }
10499 }
10500
10501 return true;
10502 }
10503
10504 StringRef getPassName() const override {
10505 return "X86 PIC Global Base Reg Initialization";
10506 }
10507
10508 void getAnalysisUsage(AnalysisUsage &AU) const override {
10509 AU.setPreservesCFG();
10511 }
10512};
10513} // namespace
10514
10515char CGBR::ID = 0;
10517
10518namespace {
10519struct LDTLSCleanup : public MachineFunctionPass {
10520 static char ID;
10521 LDTLSCleanup() : MachineFunctionPass(ID) {}
10522
10523 bool runOnMachineFunction(MachineFunction &MF) override {
10524 if (skipFunction(MF.getFunction()))
10525 return false;
10526
10527 X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
10528 if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
10529 // No point folding accesses if there isn't at least two.
10530 return false;
10531 }
10532
10533 MachineDominatorTree *DT =
10534 &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
10535 return VisitNode(DT->getRootNode(), Register());
10536 }
10537
10538 // Visit the dominator subtree rooted at Node in pre-order.
10539 // If TLSBaseAddrReg is non-null, then use that to replace any
10540 // TLS_base_addr instructions. Otherwise, create the register
10541 // when the first such instruction is seen, and then use it
10542 // as we encounter more instructions.
10543 bool VisitNode(MachineDomTreeNode *Node, Register TLSBaseAddrReg) {
10544 MachineBasicBlock *BB = Node->getBlock();
10545 bool Changed = false;
10546
10547 // Traverse the current block.
10548 for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
10549 ++I) {
10550 switch (I->getOpcode()) {
10551 case X86::TLS_base_addr32:
10552 case X86::TLS_base_addr64:
10553 if (TLSBaseAddrReg)
10554 I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
10555 else
10556 I = SetRegister(*I, &TLSBaseAddrReg);
10557 Changed = true;
10558 break;
10559 default:
10560 break;
10561 }
10562 }
10563
10564 // Visit the children of this block in the dominator tree.
10565 for (auto &I : *Node) {
10566 Changed |= VisitNode(I, TLSBaseAddrReg);
10567 }
10568
10569 return Changed;
10570 }
10571
10572 // Replace the TLS_base_addr instruction I with a copy from
10573 // TLSBaseAddrReg, returning the new instruction.
10574 MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
10575 Register TLSBaseAddrReg) {
10576 MachineFunction *MF = I.getParent()->getParent();
10577 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10578 const bool is64Bit = STI.is64Bit();
10579 const X86InstrInfo *TII = STI.getInstrInfo();
10580
10581 // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
10582 MachineInstr *Copy =
10583 BuildMI(*I.getParent(), I, I.getDebugLoc(),
10584 TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
10585 .addReg(TLSBaseAddrReg);
10586
10587 // Erase the TLS_base_addr instruction.
10588 I.eraseFromParent();
10589
10590 return Copy;
10591 }
10592
10593 // Create a virtual register in *TLSBaseAddrReg, and populate it by
10594 // inserting a copy instruction after I. Returns the new instruction.
10595 MachineInstr *SetRegister(MachineInstr &I, Register *TLSBaseAddrReg) {
10596 MachineFunction *MF = I.getParent()->getParent();
10597 const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10598 const bool is64Bit = STI.is64Bit();
10599 const X86InstrInfo *TII = STI.getInstrInfo();
10600
10601 // Create a virtual register for the TLS base address.
10602 MachineRegisterInfo &RegInfo = MF->getRegInfo();
10603 *TLSBaseAddrReg = RegInfo.createVirtualRegister(
10604 is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass);
10605
10606 // Insert a copy from RAX/EAX to TLSBaseAddrReg.
10607 MachineInstr *Next = I.getNextNode();
10608 MachineInstr *Copy = BuildMI(*I.getParent(), Next, I.getDebugLoc(),
10609 TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
10610 .addReg(is64Bit ? X86::RAX : X86::EAX);
10611
10612 return Copy;
10613 }
10614
10615 StringRef getPassName() const override {
10616 return "Local Dynamic TLS Access Clean-up";
10617 }
10618
10619 void getAnalysisUsage(AnalysisUsage &AU) const override {
10620 AU.setPreservesCFG();
10621 AU.addRequired<MachineDominatorTreeWrapperPass>();
10623 }
10624};
10625} // namespace
10626
10627char LDTLSCleanup::ID = 0;
10629 return new LDTLSCleanup();
10630}
10631
/// Constants defining how certain sequences should be outlined.
///
/// \p MachineOutlinerDefault implies that the function is called with a call
/// instruction, and a return must be emitted for the outlined function frame.
///
/// That is,
///
/// I1                                 OUTLINED_FUNCTION:
/// I2 --> call OUTLINED_FUNCTION      I1
/// I3                                 I2
///                                    I3
///                                    ret
///
/// * Call construction overhead: 1 (call instruction)
/// * Frame construction overhead: 1 (return instruction)
///
/// \p MachineOutlinerTailCall implies that the function is being tail called.
/// A jump is emitted instead of a call, and the return is already present in
/// the outlined sequence. That is,
///
/// I1                                 OUTLINED_FUNCTION:
/// I2 --> jmp OUTLINED_FUNCTION       I1
/// ret                                I2
///                                    ret
///
/// * Call construction overhead: 1 (jump instruction)
/// * Frame construction overhead: 0 (don't need to return)
///
enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall };

10662std::optional<std::unique_ptr<outliner::OutlinedFunction>>
10664 const MachineModuleInfo &MMI,
10665 std::vector<outliner::Candidate> &RepeatedSequenceLocs,
10666 unsigned MinRepeats) const {
10667 unsigned SequenceSize = 0;
10668 for (auto &MI : RepeatedSequenceLocs[0]) {
10669 // FIXME: x86 doesn't implement getInstSizeInBytes, so
10670 // we can't tell the cost. Just assume each instruction
10671 // is one byte.
10672 if (MI.isDebugInstr() || MI.isKill())
10673 continue;
10674 SequenceSize += 1;
10675 }
10676
10677 // We check to see if CFI Instructions are present, and if they are
10678 // we find the number of CFI Instructions in the candidates.
10679 unsigned CFICount = 0;
10680 for (auto &I : RepeatedSequenceLocs[0]) {
10681 if (I.isCFIInstruction())
10682 CFICount++;
10683 }
10684
10685 // We compare the number of found CFI Instructions to the number of CFI
10686 // instructions in the parent function for each candidate. We must check this
10687 // since if we outline one of the CFI instructions in a function, we have to
10688 // outline them all for correctness. If we do not, the address offsets will be
10689 // incorrect between the two sections of the program.
10690 for (outliner::Candidate &C : RepeatedSequenceLocs) {
10691 std::vector<MCCFIInstruction> CFIInstructions =
10692 C.getMF()->getFrameInstructions();
10693
10694 if (CFICount > 0 && CFICount != CFIInstructions.size())
10695 return std::nullopt;
10696 }
10697
10698 // FIXME: Use real size in bytes for call and ret instructions.
10699 if (RepeatedSequenceLocs[0].back().isTerminator()) {
10700 for (outliner::Candidate &C : RepeatedSequenceLocs)
10701 C.setCallInfo(MachineOutlinerTailCall, 1);
10702
10703 return std::make_unique<outliner::OutlinedFunction>(
10704 RepeatedSequenceLocs, SequenceSize,
10705 0, // Number of bytes to emit frame.
10706 MachineOutlinerTailCall // Type of frame.
10707 );
10708 }
10709
10710 if (CFICount > 0)
10711 return std::nullopt;
10712
10713 for (outliner::Candidate &C : RepeatedSequenceLocs)
10714 C.setCallInfo(MachineOutlinerDefault, 1);
10715
10716 return std::make_unique<outliner::OutlinedFunction>(
10717 RepeatedSequenceLocs, SequenceSize, 1, MachineOutlinerDefault);
10718}
10719
10721 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10722 const Function &F = MF.getFunction();
10723
10724 // Does the function use a red zone? If it does, then we can't risk messing
10725 // with the stack.
10726 if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
10727 // It could have a red zone. If it does, then we don't want to touch it.
10729 if (!X86FI || X86FI->getUsesRedZone())
10730 return false;
10731 }
10732
10733 // If we *don't* want to outline from things that could potentially be deduped
10734 // then return false.
10735 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10736 return false;
10737
10738 // This function is viable for outlining, so return true.
10739 return true;
10740}
10741
10745 unsigned Flags) const {
10746 MachineInstr &MI = *MIT;
10747
10748 // Is this a terminator for a basic block?
10749 if (MI.isTerminator())
10750 // TargetInstrInfo::getOutliningType has already filtered out anything
10751 // that would break this, so we can allow it here.
10753
10754 // Don't outline anything that modifies or reads from the stack pointer.
10755 //
10756 // FIXME: There are instructions which are being manually built without
10757 // explicit uses/defs so we also have to check the MCInstrDesc. We should be
10758 // able to remove the extra checks once those are fixed up. For example,
10759 // sometimes we might get something like %rax = POP64r 1. This won't be
10760 // caught by modifiesRegister or readsRegister even though the instruction
10761 // really ought to be formed so that modifiesRegister/readsRegister would
10762 // catch it.
10763 if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
10764 MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
10765 MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
10767
10768 // Outlined calls change the instruction pointer, so don't read from it.
10769 if (MI.readsRegister(X86::RIP, &RI) ||
10770 MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
10771 MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
10773
10774 // Don't outline CFI instructions.
10775 if (MI.isCFIInstruction())
10777
10779}
10780
10783 const outliner::OutlinedFunction &OF) const {
10784 // If we're a tail call, we already have a return, so don't do anything.
10785 if (OF.FrameConstructionID == MachineOutlinerTailCall)
10786 return;
10787
10788 // We're a normal call, so our sequence doesn't have a return instruction.
10789 // Add it in.
10790 MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
10791 MBB.insert(MBB.end(), retq);
10792}
10793
10797 // Is it a tail call?
10798 if (C.CallConstructionID == MachineOutlinerTailCall) {
10799 // Yes, just insert a JMP.
10800 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
10801 .addGlobalAddress(M.getNamedValue(MF.getName())));
10802 } else {
10803 // No, insert a call.
10804 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
10805 .addGlobalAddress(M.getNamedValue(MF.getName())));
10806 }
10807
10808 return It;
10809}
10810
10813 DebugLoc &DL,
10814 bool AllowSideEffects) const {
10815 const MachineFunction &MF = *MBB.getParent();
10816 const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
10818
10819 if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
10820 // FIXME: Should we ignore MMX registers?
10821 return;
10822
10823 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10824 // Convert register to the 32-bit version. Both 'movl' and 'xorl' clear the
10825 // upper bits of a 64-bit register automagically.
10826 Reg = getX86SubSuperRegister(Reg, 32);
10827
10828 if (!AllowSideEffects)
10829 // XOR affects flags, so use a MOV instead.
10830 BuildMI(MBB, Iter, DL, get(X86::MOV32ri), Reg).addImm(0);
10831 else
10832 BuildMI(MBB, Iter, DL, get(X86::XOR32rr), Reg)
10833 .addReg(Reg, RegState::Undef)
10834 .addReg(Reg, RegState::Undef);
10835 } else if (X86::VR128RegClass.contains(Reg)) {
10836 // XMM#
10837 if (!ST.hasSSE1())
10838 return;
10839
10840 BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
10841 } else if (X86::VR256RegClass.contains(Reg)) {
10842 // YMM#
10843 if (!ST.hasAVX())
10844 return;
10845
10846 BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
10847 } else if (X86::VR512RegClass.contains(Reg)) {
10848 // ZMM#
10849 if (!ST.hasAVX512())
10850 return;
10851
10852 BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
10853 } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
10854 X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
10855 X86::VK16RegClass.contains(Reg)) {
10856 if (!ST.hasVLX())
10857 return;
10858
10859 unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
10860 BuildMI(MBB, Iter, DL, get(Op), Reg);
10861 }
10862}
10863
10865 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
10866 bool DoRegPressureReduce) const {
10867 unsigned Opc = Root.getOpcode();
10868 switch (Opc) {
10869 case X86::VPDPWSSDrr:
10870 case X86::VPDPWSSDrm:
10871 case X86::VPDPWSSDYrr:
10872 case X86::VPDPWSSDYrm: {
10873 if (!Subtarget.hasFastDPWSSD()) {
10875 return true;
10876 }
10877 break;
10878 }
10879 case X86::VPDPWSSDZ128rr:
10880 case X86::VPDPWSSDZ128rm:
10881 case X86::VPDPWSSDZ256rr:
10882 case X86::VPDPWSSDZ256rm:
10883 case X86::VPDPWSSDZrr:
10884 case X86::VPDPWSSDZrm: {
10885 if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
10887 return true;
10888 }
10889 break;
10890 }
10891 }
10893 Patterns, DoRegPressureReduce);
10894}
10895
10896static void
10900 DenseMap<Register, unsigned> &InstrIdxForVirtReg) {
10901 MachineFunction *MF = Root.getMF();
10903
10904 unsigned Opc = Root.getOpcode();
10905 unsigned AddOpc = 0;
10906 unsigned MaddOpc = 0;
10907 switch (Opc) {
10908 default:
10909 assert(false && "It should not reach here");
10910 break;
10911 // vpdpwssd xmm2,xmm3,xmm1
10912 // -->
10913 // vpmaddwd xmm3,xmm3,xmm1
10914 // vpaddd xmm2,xmm2,xmm3
10915 case X86::VPDPWSSDrr:
10916 MaddOpc = X86::VPMADDWDrr;
10917 AddOpc = X86::VPADDDrr;
10918 break;
10919 case X86::VPDPWSSDrm:
10920 MaddOpc = X86::VPMADDWDrm;
10921 AddOpc = X86::VPADDDrr;
10922 break;
10923 case X86::VPDPWSSDZ128rr:
10924 MaddOpc = X86::VPMADDWDZ128rr;
10925 AddOpc = X86::VPADDDZ128rr;
10926 break;
10927 case X86::VPDPWSSDZ128rm:
10928 MaddOpc = X86::VPMADDWDZ128rm;
10929 AddOpc = X86::VPADDDZ128rr;
10930 break;
10931 // vpdpwssd ymm2,ymm3,ymm1
10932 // -->
10933 // vpmaddwd ymm3,ymm3,ymm1
10934 // vpaddd ymm2,ymm2,ymm3
10935 case X86::VPDPWSSDYrr:
10936 MaddOpc = X86::VPMADDWDYrr;
10937 AddOpc = X86::VPADDDYrr;
10938 break;
10939 case X86::VPDPWSSDYrm:
10940 MaddOpc = X86::VPMADDWDYrm;
10941 AddOpc = X86::VPADDDYrr;
10942 break;
10943 case X86::VPDPWSSDZ256rr:
10944 MaddOpc = X86::VPMADDWDZ256rr;
10945 AddOpc = X86::VPADDDZ256rr;
10946 break;
10947 case X86::VPDPWSSDZ256rm:
10948 MaddOpc = X86::VPMADDWDZ256rm;
10949 AddOpc = X86::VPADDDZ256rr;
10950 break;
10951 // vpdpwssd zmm2,zmm3,zmm1
10952 // -->
10953 // vpmaddwd zmm3,zmm3,zmm1
10954 // vpaddd zmm2,zmm2,zmm3
10955 case X86::VPDPWSSDZrr:
10956 MaddOpc = X86::VPMADDWDZrr;
10957 AddOpc = X86::VPADDDZrr;
10958 break;
10959 case X86::VPDPWSSDZrm:
10960 MaddOpc = X86::VPMADDWDZrm;
10961 AddOpc = X86::VPADDDZrr;
10962 break;
10963 }
10964 // Create vpmaddwd.
10965 const TargetRegisterClass *RC =
10966 RegInfo.getRegClass(Root.getOperand(0).getReg());
10967 Register NewReg = RegInfo.createVirtualRegister(RC);
10968 MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
10969 Madd->setDesc(TII.get(MaddOpc));
10970 Madd->untieRegOperand(1);
10971 Madd->removeOperand(1);
10972 Madd->getOperand(0).setReg(NewReg);
10973 InstrIdxForVirtReg.insert(std::make_pair(NewReg, 0));
10974 // Create vpaddd.
10975 Register DstReg = Root.getOperand(0).getReg();
10976 bool IsKill = Root.getOperand(1).isKill();
10977 MachineInstr *Add =
10978 BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
10979 .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
10980 .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
10981 InsInstrs.push_back(Madd);
10982 InsInstrs.push_back(Add);
10983 DelInstrs.push_back(&Root);
10984}
10985
10987 MachineInstr &Root, unsigned Pattern,
10990 DenseMap<Register, unsigned> &InstrIdxForVirtReg) const {
10991 switch (Pattern) {
10992 default:
10993 // Reassociate instructions.
10995 DelInstrs, InstrIdxForVirtReg);
10996 return;
10998 genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
10999 InstrIdxForVirtReg);
11000 return;
11001 }
11002}
11003
11004// See also: X86DAGToDAGISel::SelectInlineAsmMemoryOperand().
11006 int FI) const {
11009 M.Base.FrameIndex = FI;
11010 M.getFullAddress(Ops);
11011}
11012
11013#define GET_INSTRINFO_HELPERS
11014#include "X86GenInstrInfo.inc"
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
return SDValue()
static bool isFrameStoreOpcode(int Opcode)
static bool isFrameLoadOpcode(int Opcode)
MachineOutlinerClass
Constants defining how certain sequences should be outlined.
@ MachineOutlinerTailCall
Emit a save, restore, call, and return.
@ MachineOutlinerDefault
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
DXIL Forward Handle Accesses
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
static SDValue isNOT(SDValue V, SelectionDAG &DAG)
static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
Expand a single-def pseudo instruction to a two-addr instruction with two undef reads of the register...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
bool IsDead
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
Provides some synthesis utilities to produce sequences of values.
static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
#define LLVM_DEBUG(...)
Definition Debug.h:114
#define FROM_TO(FROM, TO)
cl::opt< bool > X86EnableAPXForRelocation
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static bool isLEA(unsigned Opcode)
static void addOperands(MachineInstrBuilder &MIB, ArrayRef< MachineOperand > MOs, int PtrOffset=0)
static std::optional< ParamLoadedValue > describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg, const TargetRegisterInfo *TRI)
If DescribedReg overlaps with the MOVrr instruction's destination register then, if possible,...
static cl::opt< unsigned > PartialRegUpdateClearance("partial-reg-update-clearance", cl::desc("Clearance between two register writes " "for inserting XOR to avoid partial " "register update"), cl::init(64), cl::Hidden)
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI)
static unsigned CopyToFromAsymmetricReg(Register DestReg, Register SrcReg, const X86Subtarget &Subtarget)
static bool isConvertibleLEA(MachineInstr *MI)
static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, const X86Subtarget &Subtarget)
static bool isAMXOpcode(unsigned Opc)
static int getJumpTableIndexFromReg(const MachineRegisterInfo &MRI, Register Reg)
static void updateOperandRegConstraints(MachineFunction &MF, MachineInstr &NewMI, const TargetInstrInfo &TII)
static int getJumpTableIndexFromAddr(const MachineInstr &MI)
static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth, unsigned NewWidth, unsigned *pNewMask=nullptr)
static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne)
static unsigned getNewOpcFromTable(ArrayRef< X86TableEntry > Table, unsigned Opc)
static unsigned getStoreRegOpcode(Register SrcReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
#define FOLD_BROADCAST(SIZE)
static cl::opt< unsigned > UndefRegClearance("undef-reg-clearance", cl::desc("How many idle instructions we would like before " "certain undef register reads"), cl::init(128), cl::Hidden)
#define CASE_BCAST_TYPE_OPC(TYPE, OP16, OP32, OP64)
static bool isTruncatedShiftCountForLEA(unsigned ShAmt)
Check whether the given shift count is appropriate, i.e. whether it can be represented by a LEA instruction.
static cl::opt< bool > ReMatPICStubLoad("remat-pic-stub-load", cl::desc("Re-materialize load from stub in PIC mode"), cl::init(false), cl::Hidden)
static SmallVector< MachineMemOperand *, 2 > extractLoadMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static MachineInstr * fuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII)
static void printFailMsgforFold(const MachineInstr &MI, unsigned Idx)
static bool canConvert2Copy(unsigned Opc)
static cl::opt< bool > NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions"), cl::Hidden)
static bool expandNOVLXStore(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx)
static bool isX87Reg(Register Reg)
Return true if the Reg is X87 register.
static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc, Register Reg)
Expand a single-def pseudo instruction to a two-addr instruction with two k0 reads.
#define VPERM_CASES_BROADCAST(Suffix)
static std::pair< X86::CondCode, unsigned > isUseDefConvertible(const MachineInstr &MI)
Check whether the use can be converted to remove a comparison against zero.
static bool findRedundantFlagInstr(MachineInstr &CmpInstr, MachineInstr &CmpValDefInstr, const MachineRegisterInfo *MRI, MachineInstr **AndInstr, const TargetRegisterInfo *TRI, const X86Subtarget &ST, bool &NoSignFlag, bool &ClearsOverflowFlag)
static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc)
static unsigned getLoadRegOpcode(Register DestReg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI)
static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold=false)
static MachineInstr * makeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI)
#define GET_ND_IF_ENABLED(OPC)
static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI, const TargetInstrInfo &TII, bool HasAVX)
static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget, bool ForLoadFold=false)
Return true for all instructions that only update the first 32 or 64-bits of the destination register...
#define CASE_NF(OP)
static const uint16_t * lookupAVX512(unsigned opcode, unsigned domain, ArrayRef< uint16_t[4]> Table)
static unsigned getLoadStoreRegOpcode(Register Reg, const TargetRegisterClass *RC, bool IsStackAligned, const X86Subtarget &STI, bool Load)
#define VPERM_CASES(Suffix)
#define FROM_TO_SIZE(A, B, S)
static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, bool &ClearsOverflowFlag)
Check whether the definition can be converted to remove a comparison against zero.
static MachineInstr * fuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef< MachineOperand > MOs, MachineBasicBlock::iterator InsertPt, MachineInstr &MI, const TargetInstrInfo &TII, int PtrOffset=0)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode)
static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII)
static MachineBasicBlock * getFallThroughMBB(MachineBasicBlock *MBB, MachineBasicBlock *TBB)
static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, const MachineInstr &UserMI, const MachineFunction &MF)
Check if LoadMI is a partial register load that we can't fold into MI because the latter uses content...
static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI)
static bool isHReg(Register Reg)
Test if the given register is a physical h register.
static cl::opt< bool > PrintFailedFusing("print-failed-fuse-candidates", cl::desc("Print instructions that the allocator wants to" " fuse, but the X86 backend currently can't"), cl::Hidden)
static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const TargetRegisterInfo *TRI, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx)
static void genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg)
#define CASE_ND(OP)
static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1, unsigned SrcOpIdx2)
This determines which of three possible cases of a three source commute the source indexes correspond...
static unsigned getTruncatedShiftCount(const MachineInstr &MI, unsigned ShiftAmtOperandIdx)
Check whether the shift count for a machine operand is non-zero.
static SmallVector< MachineMemOperand *, 2 > extractStoreMMOs(ArrayRef< MachineMemOperand * > MMOs, MachineFunction &MF)
static unsigned getBroadcastOpcode(const X86FoldTableEntry *I, const TargetRegisterClass *RC, const X86Subtarget &STI)
static unsigned convertALUrr2ALUri(unsigned Opc)
Convert an ALUrr opcode to corresponding ALUri opcode.
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI)
Return true if the register is a PIC base, i.e., defined by X86::MOVPC32r.
static bool isCommutableVPERMV3Instruction(unsigned Opcode)
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:681
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ULT
1 1 0 0 True if unordered or less than
Definition InstrTypes.h:690
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:687
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_UGT
1 0 1 0 True if unordered or greater than
Definition InstrTypes.h:688
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:704
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:692
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DWARF expression.
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
static LLVM_ABI DIExpression * appendExt(const DIExpression *Expr, unsigned FromSize, unsigned ToSize, bool Signed)
Append a zero- or sign-extension to Expr.
A debug info location.
Definition DebugLoc.h:124
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
DomTreeNodeBase< NodeT > * getRootNode()
getRootNode - This returns the entry node for the CFG of the function.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:706
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:703
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
LiveInterval - This class represents the liveness of a register, or stack slot.
SlotIndex InsertMachineInstrInMaps(MachineInstr &MI)
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
A set of physical registers with utility functions to track liveness when walking backward/forward th...
const Segment * getSegmentContaining(SlotIndex Idx) const
Return the segment that contains the specified index, or null if there is none.
LLVM_ABI void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
LLVM_ABI VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
static LocationSize precise(uint64_t Value)
bool usesWindowsCFI() const
Definition MCAsmInfo.h:652
static MCCFIInstruction createAdjustCfaOffset(MCSymbol *L, int64_t Adjustment, SMLoc Loc={})
.cfi_adjust_cfa_offset Same as .cfi_def_cfa_offset, but Offset is a relative value that is added/subt...
Definition MCDwarf.h:608
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
void setOpcode(unsigned Op)
Definition MCInst.h:201
Describe properties that are true of each instruction in the target description file.
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition MCInstrDesc.h:86
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
Set of metadata that should be preserved when using BuildMI().
SimpleValueType SimpleTy
MachineInstrBundleIterator< const MachineInstr > const_iterator
void push_back(MachineInstr *MI)
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
@ LQR_Dead
Register is known to be fully dead.
This class is a data container for one entry in a MachineConstantPool.
union llvm::MachineConstantPoolEntry::@004270020304201266316354007027341142157160323045 Val
The constant itself.
bool isMachineConstantPoolEntry() const
isMachineConstantPoolEntry - Return true if the MachineConstantPoolEntry is indeed a target specific ...
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MCSymbol * getPICBaseSymbol() const
getPICBaseSymbol - Return a function-local symbol to represent the PIC base.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
bool needsFrameMoves() const
True if this function needs frame moves for debug or exceptions.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineBasicBlock & front() const
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
mop_iterator operands_begin()
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool isImplicitDef() const
const MachineBasicBlock * getParent() const
void dropDebugNumber()
Drop any variable location debugging information associated with this instruction.
LLVM_ABI void setPreInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just prior to the instruction itself.
LLVM_ABI void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
bool getFlag(MIFlag Flag) const
Return whether an MI flag is set.
unsigned getNumOperands() const
Retuns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool modifiesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI unsigned getNumExplicitDefs() const
Returns the number of non-implicit definitions.
LLVM_ABI void eraseFromBundle()
Unlink 'this' from its basic block and delete it.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
LLVM_ABI bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
void setFlag(MIFlag Flag)
Set a MI flag.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
unsigned getNumDefs() const
Returns the total number of definitions.
void setDebugLoc(DebugLoc DL)
Replace current source information with new such.
MachineOperand * findRegisterDefOperand(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false)
Wrapper for findRegisterDefOperandIdx, it returns a pointer to the MachineOperand rather than an inde...
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This class contains meta information specific to a module.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImplicit(bool Val=true)
void setImm(int64_t immVal)
int64_t getImm() const
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
bool isCPI() const
isCPI - Tests if this is a MO_ConstantPoolIndex operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
bool isJTI() const
isJTI - Tests if this is a MO_JumpTableIndex operand.
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
static MachineOperand CreateCPI(unsigned Idx, int Offset, unsigned TargetFlags=0)
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
MachineFunction & getMachineFunction() const
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
SlotIndex getBaseIndex() const
Returns the base index for associated with this index.
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Information about stack frame layout on the target.
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
virtual const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum) const
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
virtual bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const
Returns true iff the routine could find two commutable operands in the given machine instruction.
virtual bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const
Return true when \P Inst has reassociable operands in the same \P MBB.
virtual void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstIdxForVirtReg) const
When getMachineCombinerPatterns() finds patterns, this function generates the instructions that could...
virtual std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const
Produce the expression describing the MI loading a value into the physical register Reg.
virtual bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const
Return true when there is potentially a faster code sequence for an instruction chain ending in Root.
virtual bool isReMaterializableImpl(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const
Test if the given instruction should be considered a scheduling boundary.
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
bool isPositionIndependent() const
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
virtual const TargetFrameLowering * getFrameLowering() const
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getZero()
Definition TypeSize.h:349
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
static LLVM_ABI Type * getFP128Ty(LLVMContext &C)
Definition Type.cpp:289
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:285
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:284
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:282
SlotIndex def
The index of the defining instruction.
LLVM Value Representation.
Definition Value.h:75
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst, MachineInstr::MIFlag Flag=MachineInstr::NoFlags) const
Wraps up getting a CFI index and building a MachineInstr for it.
void getFrameIndexOperands(SmallVectorImpl< MachineOperand > &Ops, int FI) const override
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
Check if there exists an earlier instruction that operates on the same source operands and sets eflag...
bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl< unsigned > &Patterns, bool DoRegPressureReduce) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
Overrides the isSchedulingBoundary from Codegen/TargetInstrInfo.cpp to make it capable of identifying...
MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, outliner::Candidate &C) const override
void replaceBranchWithTailCall(MachineBasicBlock &MBB, SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const override
bool canInsertSelect(const MachineBasicBlock &, ArrayRef< MachineOperand > Cond, Register, Register, Register, int &, int &, int &) const override
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex=nullptr) const override
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override
Returns true iff the routine could find two commutable operands in the given machine instruction.
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override
X86InstrInfo(const X86Subtarget &STI)
static bool isDataInvariantLoad(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value l...
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned CommuteOpIdx1, unsigned CommuteOpIdx2) const override
bool isFunctionSafeToOutlineFrom(MachineFunction &MF, bool OutlineFromLinkOnceODRs) const override
const X86RegisterInfo & getRegisterInfo() const
getRegisterInfo - TargetInstrInfo is a superset of MRegister info.
bool hasCommutePreference(MachineInstr &MI, bool &Commute) const override
Returns true if we have preference on the operands order in MI, the commute decision is returned in C...
bool hasLiveCondCodeDef(MachineInstr &MI) const
True if MI has a condition code def, e.g.
std::optional< ParamLoadedValue > describeLoadedValue(const MachineInstr &MI, Register Reg) const override
bool canMakeTailCallConditional(SmallVectorImpl< MachineOperand > &Cond, const MachineInstr &TailCall) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const override
bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, Register Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl< MachineInstr * > &NewMIs) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
convertToThreeAddress - This method must be implemented by targets that set the M_CONVERTIBLE_TO_3_AD...
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool expandPostRAPseudo(MachineInstr &MI) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isAssociativeAndCommutative(const MachineInstr &Inst, bool Invert) const override
MCInst getNop() const override
Return the noop instruction to use for a noop.
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig) const override
outliner::InstrType getOutliningTypeImpl(const MachineModuleInfo &MMI, MachineBasicBlock::iterator &MIT, unsigned Flags) const override
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override
This is a used by the pre-regalloc scheduler to determine (in conjunction with areLoadsFromSameBasePt...
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
Fold a load or store of the specified stack slot into the specified machine instruction for the speci...
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override
std::optional< ExtAddrMode > getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override
Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isStoreToStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum) const override
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
bool isUnconditionalTailCall(const MachineInstr &MI) const override
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
std::optional< std::unique_ptr< outliner::OutlinedFunction > > getOutliningCandidateInfo(const MachineModuleInfo &MMI, std::vector< outliner::Candidate > &RepeatedSequenceLocs, unsigned MinRepeats) const override
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, Register &NewSrc, unsigned &NewSrcSubReg, bool &isKill, MachineOperand &ImplicitOp, LiveVariables *LV, LiveIntervals *LIS) const
Given an operand within a MachineInstr, insert preceding code to put it into the right format for a p...
Register isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override
isLoadFromStackSlotPostFE - Check for post-frame ptr elimination stack locations as well.
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
bool setExecutionDomainCustom(MachineInstr &MI, unsigned Domain) const
int getSPAdjust(const MachineInstr &MI) const override
getSPAdjust - This returns the stack pointer adjustment made by this instruction.
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isReMaterializableImpl(const MachineInstr &MI) const override
Register getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the the global base register value.
int getJumpTableIndex(const MachineInstr &MI) const override
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const override
This is an architecture-specific helper function of reassociateOps.
std::pair< uint16_t, uint16_t > getExecutionDomain(const MachineInstr &MI) const override
bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, unsigned &SubIdx) const override
isCoalescableExtInstr - Return true if the instruction is a "coalescable" extension instruction.
void loadStoreTileReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Opc, Register Reg, int FrameIdx, bool isKill=false) const
void genAlternativeCodeSequence(MachineInstr &Root, unsigned Pattern, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, DenseMap< Register, unsigned > &InstrIdxForVirtReg) const override
When getMachineCombinerPatterns() finds potential patterns, this function generates the instructions ...
bool hasReassociableOperands(const MachineInstr &Inst, const MachineBasicBlock *MBB) const override
bool analyzeBranchPredicate(MachineBasicBlock &MBB, TargetInstrInfo::MachineBranchPredicate &MBP, bool AllowModify=false) const override
static bool isDataInvariant(MachineInstr &MI)
Returns true if the instruction has no behavior (specified or otherwise) that is based on the value o...
unsigned getUndefRegClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before certain undef register...
void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
void buildClearRegister(Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator Iter, DebugLoc &DL, bool AllowSideEffects=true) const override
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
int64_t getFrameAdjustment(const MachineInstr &I) const
Returns the stack pointer adjustment that happens inside the frame setup..destroy sequence (e....
bool hasHighOperandLatency(const TargetSchedModel &SchedModel, const MachineRegisterInfo *MRI, const MachineInstr &DefMI, unsigned DefIdx, const MachineInstr &UseMI, unsigned UseIdx) const override
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override
uint16_t getExecutionDomainCustom(const MachineInstr &MI) const
bool isHighLatencyDef(int opc) const override
void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const override
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const override
foldImmediate - 'Reg' is known to be defined by a move immediate instruction, try to fold the immedia...
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2, const X86InstrFMA3Group &FMA3Group) const
Returns an adjusted FMA opcode that must be used in FMA instruction that performs the same computatio...
bool preservesZeroValueInReg(const MachineInstr *MI, const Register NullValueReg, const TargetRegisterInfo *TRI) const override
unsigned getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override
Inform the BreakFalseDeps pass how many idle instructions we would like before a partial register upd...
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
const TargetRegisterClass * constrainRegClassToNonRex2(const TargetRegisterClass *RC) const
bool isPICStyleGOT() const
const X86InstrInfo * getInstrInfo() const override
bool hasAVX512() const
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX() const
const X86FrameLowering * getFrameLowering() const override
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ X86
Windows x64, Windows Itanium (IA-64)
Definition MCAsmInfo.h:50
X86II - This namespace holds all of the target specific flags that instruction info tracks.
bool isKMergeMasked(uint64_t TSFlags)
bool hasNewDataDest(uint64_t TSFlags)
@ MO_GOT_ABSOLUTE_ADDRESS
MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a relocation of: SYMBOL_LABEL + [.
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
@ MO_PIC_BASE_OFFSET
MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the immediate should get the value of th...
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
@ EVEX
EVEX - Specifies that this instruction use EVEX form which provides syntax support up to 32 512-bit r...
@ SSEDomainShift
Execution domain for SSE instructions.
bool canUseApxExtendedReg(const MCInstrDesc &Desc)
bool isPseudo(uint64_t TSFlags)
bool isKMasked(uint64_t TSFlags)
int getMemoryOperandNo(uint64_t TSFlags)
unsigned getOperandBias(const MCInstrDesc &Desc)
Compute whether all of the def operands are repeated in the uses and therefore should be skipped.
Define some predicates that are used for node matching.
CondCode getCondFromBranch(const MachineInstr &MI)
CondCode getCondFromCFCMov(const MachineInstr &MI)
@ LAST_VALID_COND
Definition X86BaseInfo.h:94
CondCode getCondFromMI(const MachineInstr &MI)
Return the condition code of the instruction.
int getFirstAddrOperandIdx(const MachineInstr &MI)
Return the index of the instruction's first address operand, if it has a memory reference,...
@ AddrNumOperands
Definition X86BaseInfo.h:36
unsigned getSwappedVCMPImm(unsigned Imm)
Get the VCMP immediate if the opcodes are swapped.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
unsigned getSwappedVPCOMImm(unsigned Imm)
Get the VPCOM immediate if the opcodes are swapped.
bool isX87Instruction(MachineInstr &MI)
Check if the instruction is X87 instruction.
unsigned getNonNDVariant(unsigned Opc)
unsigned getVPCMPImmForCond(ISD::CondCode CC)
Get the VPCMP immediate for the given condition.
std::pair< CondCode, bool > getX86ConditionCode(CmpInst::Predicate Predicate)
Return a pair of condition code for the given predicate and whether the instruction operands should b...
CondCode getCondFromSETCC(const MachineInstr &MI)
unsigned getSwappedVPCMPImm(unsigned Imm)
Get the VPCMP immediate if the opcodes are swapped.
CondCode getCondFromCCMP(const MachineInstr &MI)
int getCCMPCondFlagsFromCondCode(CondCode CC)
int getCondSrcNoFromDesc(const MCInstrDesc &MCID)
Return the source operand # for condition code by MCID.
const Constant * getConstantFromPool(const MachineInstr &MI, unsigned OpNo)
Find any constant pool entry associated with a specific instruction operand.
unsigned getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand=false, bool HasNDD=false)
Return a cmov opcode for the given register size in bytes, and operand type.
unsigned getNFVariant(unsigned Opc)
unsigned getVectorRegisterWidth(const MCOperandInfo &Info)
Get the width of the vector register operand.
CondCode getCondFromCMov(const MachineInstr &MI)
initializer< Ty > init(const Ty &Val)
InstrType
Represents how an instruction should be mapped by the outliner.
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
static bool isAddMemInstrWithRelocation(const MachineInstr &MI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
static bool isMem(const MachineInstr &MI, unsigned Op)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isAligned(Align Lhs, uint64_t SizeInBytes)
Checks that SizeInBytes is a multiple of the alignment.
Definition Alignment.h:134
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
FunctionPass * createX86GlobalBaseRegPass()
This pass initializes a global base register for PIC on x86-32.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
static const MachineInstrBuilder & addRegReg(const MachineInstrBuilder &MIB, Register Reg1, bool isKill1, unsigned SubReg1, Register Reg2, bool isKill2, unsigned SubReg2)
addRegReg - This function is used to add a memory reference of the form: [Reg + Reg].
unsigned getDeadRegState(bool B)
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
FunctionPass * createCleanupLocalDynamicTLSPass()
This pass combines multiple accesses to local-dynamic TLS variables so that the TLS base address for ...
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
const X86FoldTableEntry * lookupBroadcastFoldTable(unsigned RegOp, unsigned OpNum)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
const X86InstrFMA3Group * getFMA3Group(unsigned Opcode, uint64_t TSFlags)
Returns a reference to a group of FMA3 opcodes to where the given Opcode is included.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
const X86FoldTableEntry * lookupTwoAddrFoldTable(unsigned RegOp)
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1920
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
DomTreeNodeBase< MachineBasicBlock > MachineDomTreeNode
static bool isMemInstrWithGOTPCREL(const MachineInstr &MI)
static const MachineInstrBuilder & addOffset(const MachineInstrBuilder &MIB, int Offset)
unsigned getUndefRegState(bool B)
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
unsigned getDefRegState(bool B)
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1994
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
const X86FoldTableEntry * lookupUnfoldTable(unsigned MemOp)
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool matchBroadcastSize(const X86FoldTableEntry &Entry, unsigned BroadcastBits)
std::pair< MachineOperand, DIExpression * > ParamLoadedValue
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
const X86FoldTableEntry * lookupFoldTable(unsigned RegOp, unsigned OpNum)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, Register Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
Used to describe addressing mode similar to ExtAddrMode in CodeGenPrepare.
This represents a simple continuous liveness interval for a value.
std::vector< MachineInstr * > Kills
Kills - List of MachineInstruction's which are the last use of this virtual register (kill it) in the...
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
X86AddressMode - This struct holds a generalized full x86 address mode.
enum llvm::X86AddressMode::@202116273335065351270200035056227005202106004277 BaseType
This class is used to group {132, 213, 231} forms of FMA opcodes together.
unsigned get213Opcode() const
Returns the 213 form of FMA opcode.
unsigned get231Opcode() const
Returns the 231 form of FMA opcode.
bool isIntrinsic() const
Returns true iff the group of FMA opcodes holds intrinsic opcodes.
unsigned get132Opcode() const
Returns the 132 form of FMA opcode.
An individual sequence of instructions to be replaced with a call to an outlined function.
The information necessary to create an outlined function for some class of candidate.