LLVM 23.0.0git
X86FixupInstTuning.cpp
Go to the documentation of this file.
1//===-- X86FixupInstTuning.cpp - replace instructions -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file does a tuning pass replacing slower machine instructions
10// with faster ones. We do this here, as opposed to during normal ISel, as
11// attempting to get the "right" instruction can break patterns. This pass
12// is not meant to search for special cases where an instruction can be transformed
13// to another, it is only meant to do transformations where the old instruction
14// is always replaceable with the new instructions. For example:
15//
16// `vpermq ymm` -> `vshufd ymm`
17// -- BAD, not always valid (lane cross/non-repeated mask)
18//
19// `vpermilps ymm` -> `vshufd ymm`
20// -- GOOD, always replaceable
21//
22//===----------------------------------------------------------------------===//
23
24#include "X86.h"
25#include "X86InstrInfo.h"
26#include "X86RegisterInfo.h"
27#include "X86Subtarget.h"
28#include "llvm/ADT/Statistic.h"
33#include "llvm/IR/Analysis.h"
34
35using namespace llvm;
36
37#define DEBUG_TYPE "x86-fixup-inst-tuning"
38
39STATISTIC(NumInstChanges, "Number of instructions changes");
40
41namespace {
42class X86FixupInstTuningImpl {
43public:
44 bool runOnMachineFunction(MachineFunction &MF);
45
46private:
47 bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
49
50 const X86InstrInfo *TII = nullptr;
51 const X86Subtarget *ST = nullptr;
52 const MCSchedModel *SM = nullptr;
53 const X86RegisterInfo *TRI = nullptr;
54};
55
56class X86FixupInstTuningLegacy : public MachineFunctionPass {
57public:
58 static char ID;
59
60 X86FixupInstTuningLegacy() : MachineFunctionPass(ID) {}
61
62 StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; }
63
64 bool runOnMachineFunction(MachineFunction &MF) override;
65 bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
67
68 // This pass runs after regalloc and doesn't support VReg operands.
69 MachineFunctionProperties getRequiredProperties() const override {
70 return MachineFunctionProperties().setNoVRegs();
71 }
72};
73} // end anonymous namespace
74
75char X86FixupInstTuningLegacy ::ID = 0;
76
77INITIALIZE_PASS(X86FixupInstTuningLegacy, DEBUG_TYPE, DEBUG_TYPE, false, false)
78
80 return new X86FixupInstTuningLegacy();
81}
82
// Three-way style helper for optional metrics: yields `NewVal < CurVal`
// when both metrics are known and differ, and `std::nullopt` when either
// is missing or they are equal (i.e. the metric cannot break the tie).
template <typename T>
static std::optional<bool> CmpOptionals(T NewVal, T CurVal) {
  if (!NewVal.has_value() || !CurVal.has_value())
    return std::nullopt;
  if (*NewVal == *CurVal)
    return std::nullopt;
  return *NewVal < *CurVal;
}
90
91bool X86FixupInstTuningImpl::processInstruction(
94 MachineInstr &MI = *I;
95 unsigned Opc = MI.getOpcode();
96 unsigned NumOperands = MI.getDesc().getNumOperands();
97 bool OptSize = MF.getFunction().hasOptSize();
98
99 auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
100 // We already checked that SchedModel exists in `NewOpcPreferable`.
102 *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
103 };
104
105 auto GetInstLat = [&](unsigned Opcode) -> std::optional<double> {
106 // We already checked that SchedModel exists in `NewOpcPreferable`.
108 *ST, *(SM->getSchedClassDesc(TII->get(Opcode).getSchedClass())));
109 };
110
111 auto GetInstSize = [&](unsigned Opcode) -> std::optional<unsigned> {
112 if (unsigned Size = TII->get(Opcode).getSize())
113 return Size;
114 // Zero size means we where unable to compute it.
115 return std::nullopt;
116 };
117
118 auto NewOpcPreferable = [&](unsigned NewOpc,
119 bool ReplaceInTie = true) -> bool {
120 std::optional<bool> Res;
121 if (SM->hasInstrSchedModel()) {
122 // Compare tput -> lat -> code size.
123 Res = CmpOptionals(GetInstTput(NewOpc), GetInstTput(Opc));
124 if (Res.has_value())
125 return *Res;
126
127 Res = CmpOptionals(GetInstLat(NewOpc), GetInstLat(Opc));
128 if (Res.has_value())
129 return *Res;
130 }
131
132 Res = CmpOptionals(GetInstSize(Opc), GetInstSize(NewOpc));
133 if (Res.has_value())
134 return *Res;
135
136 // We either have either were unable to get tput/lat/codesize or all values
137 // were equal. Return specified option for a tie.
138 return ReplaceInTie;
139 };
140
141 // `vpermilpd r, i` -> `vshufpd r, r, i`
142 // `vpermilpd r, i, k` -> `vshufpd r, r, i, k`
143 // `vshufpd` is always as fast or faster than `vpermilpd` and takes
144 // 1 less byte of code size for VEX and EVEX encoding.
145 auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool {
146 if (!NewOpcPreferable(NewOpc))
147 return false;
148 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
149 {
150 unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
151 MI.removeOperand(NumOperands - 1);
152 MI.addOperand(MI.getOperand(NumOperands - 2));
153 MI.setDesc(TII->get(NewOpc));
154 MI.addOperand(MachineOperand::CreateImm(MaskImm));
155 }
156 LLVM_DEBUG(dbgs() << " With: " << MI);
157 return true;
158 };
159
160 // `vpermilps r, i` -> `vshufps r, r, i`
161 // `vpermilps r, i, k` -> `vshufps r, r, i, k`
162 // `vshufps` is always as fast or faster than `vpermilps` and takes
163 // 1 less byte of code size for VEX and EVEX encoding.
164 auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
165 if (!NewOpcPreferable(NewOpc))
166 return false;
167 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
168 {
169 unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
170 MI.removeOperand(NumOperands - 1);
171 MI.addOperand(MI.getOperand(NumOperands - 2));
172 MI.setDesc(TII->get(NewOpc));
173 MI.addOperand(MachineOperand::CreateImm(MaskImm));
174 }
175 LLVM_DEBUG(dbgs() << " With: " << MI);
176 return true;
177 };
178
179 // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles.
180 // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less
181 // byte of code size.
182 auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool {
183 // TODO: Might be work adding bypass delay if -Os/-Oz is enabled as
184 // `vpshufd` saves a byte of code size.
185 if (!ST->hasNoDomainDelayShuffle() ||
186 !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
187 return false;
188 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
189 {
190 MI.setDesc(TII->get(NewOpc));
191 }
192 LLVM_DEBUG(dbgs() << " With: " << MI);
193 return true;
194 };
195
196 // `vunpcklpd/vmovlhps r, r` -> `vunpcklqdq r, r`/`vshufpd r, r, 0x00`
197 // `vunpckhpd/vmovlhps r, r` -> `vunpckhqdq r, r`/`vshufpd r, r, 0xff`
198 // `vunpcklpd r, r, k` -> `vunpcklqdq r, r, k`/`vshufpd r, r, k, 0x00`
199 // `vunpckhpd r, r, k` -> `vunpckhqdq r, r, k`/`vshufpd r, r, k, 0xff`
200 // `vunpcklpd r, m` -> `vunpcklqdq r, m, k`
201 // `vunpckhpd r, m` -> `vunpckhqdq r, m, k`
202 // `vunpcklpd r, m, k` -> `vunpcklqdq r, m, k`
203 // `vunpckhpd r, m, k` -> `vunpckhqdq r, m, k`
204 // 1) If no bypass delay and `vunpck{l|h}qdq` faster than `vunpck{l|h}pd`
205 // -> `vunpck{l|h}qdq`
206 // 2) If `vshufpd` faster than `vunpck{l|h}pd`
207 // -> `vshufpd`
208 //
209 // `vunpcklps` -> `vunpckldq` (for all operand types if no bypass delay)
210 auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
211 if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
212 return false;
213 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
214 {
215 MI.setDesc(TII->get(NewOpc));
216 MI.addOperand(MachineOperand::CreateImm(MaskImm));
217 }
218 LLVM_DEBUG(dbgs() << " With: " << MI);
219 return true;
220 };
221
222 auto ProcessUNPCKToIntDomain = [&](unsigned NewOpc) -> bool {
223 // TODO it may be worth it to set ReplaceInTie to `true` as there is no real
224 // downside to the integer unpck, but if someone doesn't specify exact
225 // target we won't find it faster.
226 if (!ST->hasNoDomainDelayShuffle() ||
227 !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
228 return false;
229 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
230 {
231 MI.setDesc(TII->get(NewOpc));
232 }
233 LLVM_DEBUG(dbgs() << " With: " << MI);
234 return true;
235 };
236
237 auto ProcessUNPCKLPDrr = [&](unsigned NewOpcIntDomain,
238 unsigned NewOpc) -> bool {
239 if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
240 return true;
241 return ProcessUNPCK(NewOpc, 0x00);
242 };
243 auto ProcessUNPCKHPDrr = [&](unsigned NewOpcIntDomain,
244 unsigned NewOpc) -> bool {
245 if (ProcessUNPCKToIntDomain(NewOpcIntDomain))
246 return true;
247 return ProcessUNPCK(NewOpc, 0xff);
248 };
249
250 auto ProcessUNPCKPDrm = [&](unsigned NewOpcIntDomain) -> bool {
251 return ProcessUNPCKToIntDomain(NewOpcIntDomain);
252 };
253
254 auto ProcessUNPCKPS = [&](unsigned NewOpc) -> bool {
255 return ProcessUNPCKToIntDomain(NewOpc);
256 };
257
258 auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool {
259 if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc))
260 return false;
261 // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits.
262 APInt MaskW =
263 APInt(8, MI.getOperand(NumOperands - 1).getImm(), /*IsSigned=*/false,
264 /*implicitTrunc=*/true);
265 APInt MaskD = APIntOps::ScaleBitMask(MaskW, 4, /*MatchAllBits=*/true);
266 if (MaskW != APIntOps::ScaleBitMask(MaskD, 8, /*MatchAllBits=*/true))
267 return false;
268 APInt NewMaskD = APInt::getSplat(NumElts, MaskD);
269 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
270 {
271 MI.setDesc(TII->get(MovOpc));
272 MI.removeOperand(NumOperands - 1);
273 MI.addOperand(MachineOperand::CreateImm(NewMaskD.getZExtValue()));
274 }
275 LLVM_DEBUG(dbgs() << " With: " << MI);
276 return true;
277 };
278
279 auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
280 unsigned MovImm) -> bool {
281 if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
282 return false;
283 if (!OptSize && !NewOpcPreferable(MovOpc))
284 return false;
285 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
286 {
287 MI.setDesc(TII->get(MovOpc));
288 MI.removeOperand(NumOperands - 1);
289 }
290 LLVM_DEBUG(dbgs() << " With: " << MI);
291 return true;
292 };
293
294 // Is ADD(X,X) more efficient than SHL(X,1)?
295 auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool {
296 if (MI.getOperand(NumOperands - 1).getImm() != 1)
297 return false;
298 if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true))
299 return false;
300 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
301 {
302 MI.setDesc(TII->get(AddOpc));
303 MI.removeOperand(NumOperands - 1);
304 MI.addOperand(MI.getOperand(NumOperands - 2));
305 }
306 LLVM_DEBUG(dbgs() << " With: " << MI);
307 return false;
308 };
309
310 // `vpermq ymm, ymm, 0x44` -> `vinserti128 ymm, ymm, xmm, 1`
311 // `vpermpd ymm, ymm, 0x44` -> `vinsertf128 ymm, ymm, xmm, 1`
312 // When the immediate is 0x44, VPERMQ/VPERMPD duplicates the lower 128-bit
313 // lane to both lanes. 0x44 = 0b01_00_01_00 means qwords[3:0] = {src[1],
314 // src[0], src[1], src[0]} This is equivalent to inserting the lower 128-bits
315 // into the upper 128-bit position.
316 auto ProcessVPERMQToVINSERT128 = [&](unsigned NewOpc) -> bool {
317 if (MI.getOperand(NumOperands - 1).getImm() != 0x44)
318 return false;
319 if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
320 return false;
321
322 // Get the XMM subregister of the source YMM register.
323 Register SrcReg = MI.getOperand(1).getReg();
324 Register XmmReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
325
326 LLVM_DEBUG(dbgs() << "Replacing: " << MI);
327 {
328 // Transform: VPERMQ $dst, $src, $0x44
329 // Into: VINSERTI128 $dst, $src, $xmm_src, $1
330 MI.setDesc(TII->get(NewOpc));
331 // Remove the immediate operand.
332 MI.removeOperand(NumOperands - 1);
333 // Add the XMM subregister operand.
334 MI.addOperand(MachineOperand::CreateReg(XmmReg, /*isDef=*/false,
335 /*isImp=*/false,
336 /*isKill=*/false));
337 // Add the immediate (1 = insert into high 128-bits).
338 MI.addOperand(MachineOperand::CreateImm(1));
339 }
340 LLVM_DEBUG(dbgs() << " With: " << MI);
341 return true;
342 };
343
344 switch (Opc) {
345 case X86::BLENDPDrri:
346 return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
347 case X86::VBLENDPDrri:
348 return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1);
349
350 case X86::BLENDPSrri:
351 return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) ||
352 ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3);
353 case X86::VBLENDPSrri:
354 return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
355 ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);
356
357 case X86::VPBLENDWrri:
358 // TODO: Add X86::VPBLENDWrmi handling
359 // TODO: Add X86::VPBLENDWYrri handling
360 // TODO: Add X86::VPBLENDWYrmi handling
361 return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
362
363 case X86::VPERMILPDri:
364 return ProcessVPERMILPDri(X86::VSHUFPDrri);
365 case X86::VPERMILPDYri:
366 return ProcessVPERMILPDri(X86::VSHUFPDYrri);
367 case X86::VPERMILPDZ128ri:
368 return ProcessVPERMILPDri(X86::VSHUFPDZ128rri);
369 case X86::VPERMILPDZ256ri:
370 return ProcessVPERMILPDri(X86::VSHUFPDZ256rri);
371 case X86::VPERMILPDZri:
372 return ProcessVPERMILPDri(X86::VSHUFPDZrri);
373 case X86::VPERMILPDZ128rikz:
374 return ProcessVPERMILPDri(X86::VSHUFPDZ128rrikz);
375 case X86::VPERMILPDZ256rikz:
376 return ProcessVPERMILPDri(X86::VSHUFPDZ256rrikz);
377 case X86::VPERMILPDZrikz:
378 return ProcessVPERMILPDri(X86::VSHUFPDZrrikz);
379 case X86::VPERMILPDZ128rik:
380 return ProcessVPERMILPDri(X86::VSHUFPDZ128rrik);
381 case X86::VPERMILPDZ256rik:
382 return ProcessVPERMILPDri(X86::VSHUFPDZ256rrik);
383 case X86::VPERMILPDZrik:
384 return ProcessVPERMILPDri(X86::VSHUFPDZrrik);
385
386 case X86::VPERMILPSri:
387 return ProcessVPERMILPSri(X86::VSHUFPSrri);
388 case X86::VPERMILPSYri:
389 return ProcessVPERMILPSri(X86::VSHUFPSYrri);
390 case X86::VPERMILPSZ128ri:
391 return ProcessVPERMILPSri(X86::VSHUFPSZ128rri);
392 case X86::VPERMILPSZ256ri:
393 return ProcessVPERMILPSri(X86::VSHUFPSZ256rri);
394 case X86::VPERMILPSZri:
395 return ProcessVPERMILPSri(X86::VSHUFPSZrri);
396 case X86::VPERMILPSZ128rikz:
397 return ProcessVPERMILPSri(X86::VSHUFPSZ128rrikz);
398 case X86::VPERMILPSZ256rikz:
399 return ProcessVPERMILPSri(X86::VSHUFPSZ256rrikz);
400 case X86::VPERMILPSZrikz:
401 return ProcessVPERMILPSri(X86::VSHUFPSZrrikz);
402 case X86::VPERMILPSZ128rik:
403 return ProcessVPERMILPSri(X86::VSHUFPSZ128rrik);
404 case X86::VPERMILPSZ256rik:
405 return ProcessVPERMILPSri(X86::VSHUFPSZ256rrik);
406 case X86::VPERMILPSZrik:
407 return ProcessVPERMILPSri(X86::VSHUFPSZrrik);
408 case X86::VPERMILPSmi:
409 return ProcessVPERMILPSmi(X86::VPSHUFDmi);
410 case X86::VPERMILPSYmi:
411 // TODO: See if there is a more generic way we can test if the replacement
412 // instruction is supported.
413 return ST->hasAVX2() ? ProcessVPERMILPSmi(X86::VPSHUFDYmi) : false;
414 case X86::VPERMILPSZ128mi:
415 return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi);
416 case X86::VPERMILPSZ256mi:
417 return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi);
418 case X86::VPERMILPSZmi:
419 return ProcessVPERMILPSmi(X86::VPSHUFDZmi);
420 case X86::VPERMILPSZ128mikz:
421 return ProcessVPERMILPSmi(X86::VPSHUFDZ128mikz);
422 case X86::VPERMILPSZ256mikz:
423 return ProcessVPERMILPSmi(X86::VPSHUFDZ256mikz);
424 case X86::VPERMILPSZmikz:
425 return ProcessVPERMILPSmi(X86::VPSHUFDZmikz);
426 case X86::VPERMILPSZ128mik:
427 return ProcessVPERMILPSmi(X86::VPSHUFDZ128mik);
428 case X86::VPERMILPSZ256mik:
429 return ProcessVPERMILPSmi(X86::VPSHUFDZ256mik);
430 case X86::VPERMILPSZmik:
431 return ProcessVPERMILPSmi(X86::VPSHUFDZmik);
432 case X86::VPERMQYri:
433 return ProcessVPERMQToVINSERT128(X86::VINSERTI128rri);
434 case X86::VPERMPDYri:
435 return ProcessVPERMQToVINSERT128(X86::VINSERTF128rri);
436 case X86::MOVLHPSrr:
437 case X86::UNPCKLPDrr:
438 return ProcessUNPCKLPDrr(X86::PUNPCKLQDQrr, X86::SHUFPDrri);
439 case X86::VMOVLHPSrr:
440 case X86::VUNPCKLPDrr:
441 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQrr, X86::VSHUFPDrri);
442 case X86::VUNPCKLPDYrr:
443 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQYrr, X86::VSHUFPDYrri);
444 // VMOVLHPS is always 128 bits.
445 case X86::VMOVLHPSZrr:
446 case X86::VUNPCKLPDZ128rr:
447 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rr, X86::VSHUFPDZ128rri);
448 case X86::VUNPCKLPDZ256rr:
449 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rr, X86::VSHUFPDZ256rri);
450 case X86::VUNPCKLPDZrr:
451 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrr, X86::VSHUFPDZrri);
452 case X86::VUNPCKLPDZ128rrk:
453 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrk, X86::VSHUFPDZ128rrik);
454 case X86::VUNPCKLPDZ256rrk:
455 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrk, X86::VSHUFPDZ256rrik);
456 case X86::VUNPCKLPDZrrk:
457 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrk, X86::VSHUFPDZrrik);
458 case X86::VUNPCKLPDZ128rrkz:
459 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
460 case X86::VUNPCKLPDZ256rrkz:
461 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
462 case X86::VUNPCKLPDZrrkz:
463 return ProcessUNPCKLPDrr(X86::VPUNPCKLQDQZrrkz, X86::VSHUFPDZrrikz);
464 case X86::UNPCKHPDrr:
465 return ProcessUNPCKHPDrr(X86::PUNPCKHQDQrr, X86::SHUFPDrri);
466 case X86::VUNPCKHPDrr:
467 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQrr, X86::VSHUFPDrri);
468 case X86::VUNPCKHPDYrr:
469 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQYrr, X86::VSHUFPDYrri);
470 case X86::VUNPCKHPDZ128rr:
471 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rr, X86::VSHUFPDZ128rri);
472 case X86::VUNPCKHPDZ256rr:
473 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rr, X86::VSHUFPDZ256rri);
474 case X86::VUNPCKHPDZrr:
475 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrr, X86::VSHUFPDZrri);
476 case X86::VUNPCKHPDZ128rrk:
477 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrk, X86::VSHUFPDZ128rrik);
478 case X86::VUNPCKHPDZ256rrk:
479 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrk, X86::VSHUFPDZ256rrik);
480 case X86::VUNPCKHPDZrrk:
481 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrk, X86::VSHUFPDZrrik);
482 case X86::VUNPCKHPDZ128rrkz:
483 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ128rrkz, X86::VSHUFPDZ128rrikz);
484 case X86::VUNPCKHPDZ256rrkz:
485 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZ256rrkz, X86::VSHUFPDZ256rrikz);
486 case X86::VUNPCKHPDZrrkz:
487 return ProcessUNPCKHPDrr(X86::VPUNPCKHQDQZrrkz, X86::VSHUFPDZrrikz);
488 case X86::UNPCKLPDrm:
489 return ProcessUNPCKPDrm(X86::PUNPCKLQDQrm);
490 case X86::VUNPCKLPDrm:
491 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQrm);
492 case X86::VUNPCKLPDYrm:
493 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQYrm);
494 case X86::VUNPCKLPDZ128rm:
495 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rm);
496 case X86::VUNPCKLPDZ256rm:
497 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rm);
498 case X86::VUNPCKLPDZrm:
499 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrm);
500 case X86::VUNPCKLPDZ128rmk:
501 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmk);
502 case X86::VUNPCKLPDZ256rmk:
503 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmk);
504 case X86::VUNPCKLPDZrmk:
505 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmk);
506 case X86::VUNPCKLPDZ128rmkz:
507 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ128rmkz);
508 case X86::VUNPCKLPDZ256rmkz:
509 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZ256rmkz);
510 case X86::VUNPCKLPDZrmkz:
511 return ProcessUNPCKPDrm(X86::VPUNPCKLQDQZrmkz);
512 case X86::UNPCKHPDrm:
513 return ProcessUNPCKPDrm(X86::PUNPCKHQDQrm);
514 case X86::VUNPCKHPDrm:
515 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQrm);
516 case X86::VUNPCKHPDYrm:
517 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQYrm);
518 case X86::VUNPCKHPDZ128rm:
519 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rm);
520 case X86::VUNPCKHPDZ256rm:
521 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rm);
522 case X86::VUNPCKHPDZrm:
523 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrm);
524 case X86::VUNPCKHPDZ128rmk:
525 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmk);
526 case X86::VUNPCKHPDZ256rmk:
527 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmk);
528 case X86::VUNPCKHPDZrmk:
529 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmk);
530 case X86::VUNPCKHPDZ128rmkz:
531 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ128rmkz);
532 case X86::VUNPCKHPDZ256rmkz:
533 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZ256rmkz);
534 case X86::VUNPCKHPDZrmkz:
535 return ProcessUNPCKPDrm(X86::VPUNPCKHQDQZrmkz);
536
537 case X86::UNPCKLPSrr:
538 return ProcessUNPCKPS(X86::PUNPCKLDQrr);
539 case X86::VUNPCKLPSrr:
540 return ProcessUNPCKPS(X86::VPUNPCKLDQrr);
541 case X86::VUNPCKLPSYrr:
542 return ProcessUNPCKPS(X86::VPUNPCKLDQYrr);
543 case X86::VUNPCKLPSZ128rr:
544 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rr);
545 case X86::VUNPCKLPSZ256rr:
546 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rr);
547 case X86::VUNPCKLPSZrr:
548 return ProcessUNPCKPS(X86::VPUNPCKLDQZrr);
549 case X86::VUNPCKLPSZ128rrk:
550 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrk);
551 case X86::VUNPCKLPSZ256rrk:
552 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrk);
553 case X86::VUNPCKLPSZrrk:
554 return ProcessUNPCKPS(X86::VPUNPCKLDQZrrk);
555 case X86::VUNPCKLPSZ128rrkz:
556 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rrkz);
557 case X86::VUNPCKLPSZ256rrkz:
558 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rrkz);
559 case X86::VUNPCKLPSZrrkz:
560 return ProcessUNPCKPS(X86::VPUNPCKLDQZrrkz);
561 case X86::UNPCKHPSrr:
562 return ProcessUNPCKPS(X86::PUNPCKHDQrr);
563 case X86::VUNPCKHPSrr:
564 return ProcessUNPCKPS(X86::VPUNPCKHDQrr);
565 case X86::VUNPCKHPSYrr:
566 return ProcessUNPCKPS(X86::VPUNPCKHDQYrr);
567 case X86::VUNPCKHPSZ128rr:
568 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rr);
569 case X86::VUNPCKHPSZ256rr:
570 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rr);
571 case X86::VUNPCKHPSZrr:
572 return ProcessUNPCKPS(X86::VPUNPCKHDQZrr);
573 case X86::VUNPCKHPSZ128rrk:
574 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrk);
575 case X86::VUNPCKHPSZ256rrk:
576 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrk);
577 case X86::VUNPCKHPSZrrk:
578 return ProcessUNPCKPS(X86::VPUNPCKHDQZrrk);
579 case X86::VUNPCKHPSZ128rrkz:
580 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rrkz);
581 case X86::VUNPCKHPSZ256rrkz:
582 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rrkz);
583 case X86::VUNPCKHPSZrrkz:
584 return ProcessUNPCKPS(X86::VPUNPCKHDQZrrkz);
585 case X86::UNPCKLPSrm:
586 return ProcessUNPCKPS(X86::PUNPCKLDQrm);
587 case X86::VUNPCKLPSrm:
588 return ProcessUNPCKPS(X86::VPUNPCKLDQrm);
589 case X86::VUNPCKLPSYrm:
590 return ProcessUNPCKPS(X86::VPUNPCKLDQYrm);
591 case X86::VUNPCKLPSZ128rm:
592 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rm);
593 case X86::VUNPCKLPSZ256rm:
594 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rm);
595 case X86::VUNPCKLPSZrm:
596 return ProcessUNPCKPS(X86::VPUNPCKLDQZrm);
597 case X86::VUNPCKLPSZ128rmk:
598 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmk);
599 case X86::VUNPCKLPSZ256rmk:
600 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmk);
601 case X86::VUNPCKLPSZrmk:
602 return ProcessUNPCKPS(X86::VPUNPCKLDQZrmk);
603 case X86::VUNPCKLPSZ128rmkz:
604 return ProcessUNPCKPS(X86::VPUNPCKLDQZ128rmkz);
605 case X86::VUNPCKLPSZ256rmkz:
606 return ProcessUNPCKPS(X86::VPUNPCKLDQZ256rmkz);
607 case X86::VUNPCKLPSZrmkz:
608 return ProcessUNPCKPS(X86::VPUNPCKLDQZrmkz);
609 case X86::UNPCKHPSrm:
610 return ProcessUNPCKPS(X86::PUNPCKHDQrm);
611 case X86::VUNPCKHPSrm:
612 return ProcessUNPCKPS(X86::VPUNPCKHDQrm);
613 case X86::VUNPCKHPSYrm:
614 return ProcessUNPCKPS(X86::VPUNPCKHDQYrm);
615 case X86::VUNPCKHPSZ128rm:
616 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rm);
617 case X86::VUNPCKHPSZ256rm:
618 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rm);
619 case X86::VUNPCKHPSZrm:
620 return ProcessUNPCKPS(X86::VPUNPCKHDQZrm);
621 case X86::VUNPCKHPSZ128rmk:
622 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmk);
623 case X86::VUNPCKHPSZ256rmk:
624 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmk);
625 case X86::VUNPCKHPSZrmk:
626 return ProcessUNPCKPS(X86::VPUNPCKHDQZrmk);
627 case X86::VUNPCKHPSZ128rmkz:
628 return ProcessUNPCKPS(X86::VPUNPCKHDQZ128rmkz);
629 case X86::VUNPCKHPSZ256rmkz:
630 return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz);
631 case X86::VUNPCKHPSZrmkz:
632 return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz);
633
634 case X86::PSLLWri:
635 return ProcessShiftLeftToAdd(X86::PADDWrr);
636 case X86::VPSLLWri:
637 return ProcessShiftLeftToAdd(X86::VPADDWrr);
638 case X86::VPSLLWYri:
639 return ProcessShiftLeftToAdd(X86::VPADDWYrr);
640 case X86::VPSLLWZ128ri:
641 return ProcessShiftLeftToAdd(X86::VPADDWZ128rr);
642 case X86::VPSLLWZ256ri:
643 return ProcessShiftLeftToAdd(X86::VPADDWZ256rr);
644 case X86::VPSLLWZri:
645 return ProcessShiftLeftToAdd(X86::VPADDWZrr);
646 case X86::PSLLDri:
647 return ProcessShiftLeftToAdd(X86::PADDDrr);
648 case X86::VPSLLDri:
649 return ProcessShiftLeftToAdd(X86::VPADDDrr);
650 case X86::VPSLLDYri:
651 return ProcessShiftLeftToAdd(X86::VPADDDYrr);
652 case X86::VPSLLDZ128ri:
653 return ProcessShiftLeftToAdd(X86::VPADDDZ128rr);
654 case X86::VPSLLDZ256ri:
655 return ProcessShiftLeftToAdd(X86::VPADDDZ256rr);
656 case X86::VPSLLDZri:
657 return ProcessShiftLeftToAdd(X86::VPADDDZrr);
658 case X86::PSLLQri:
659 return ProcessShiftLeftToAdd(X86::PADDQrr);
660 case X86::VPSLLQri:
661 return ProcessShiftLeftToAdd(X86::VPADDQrr);
662 case X86::VPSLLQYri:
663 return ProcessShiftLeftToAdd(X86::VPADDQYrr);
664 case X86::VPSLLQZ128ri:
665 return ProcessShiftLeftToAdd(X86::VPADDQZ128rr);
666 case X86::VPSLLQZ256ri:
667 return ProcessShiftLeftToAdd(X86::VPADDQZ256rr);
668 case X86::VPSLLQZri:
669 return ProcessShiftLeftToAdd(X86::VPADDQZrr);
670
671 default:
672 return false;
673 }
674}
675
676bool X86FixupInstTuningImpl::runOnMachineFunction(MachineFunction &MF) {
677 LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";);
678 bool Changed = false;
679 ST = &MF.getSubtarget<X86Subtarget>();
680 TII = ST->getInstrInfo();
681 TRI = ST->getRegisterInfo();
682 SM = &ST->getSchedModel();
683
684 for (MachineBasicBlock &MBB : MF) {
685 for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
686 if (processInstruction(MF, MBB, I)) {
687 ++NumInstChanges;
688 Changed = true;
689 }
690 }
691 }
692 LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";);
693 return Changed;
694}
695
696bool X86FixupInstTuningLegacy::runOnMachineFunction(MachineFunction &MF) {
697 X86FixupInstTuningImpl Impl;
698 return Impl.runOnMachineFunction(MF);
699}
700
701PreservedAnalyses
704 X86FixupInstTuningImpl Impl;
705 return Impl.runOnMachineFunction(MF)
709}
MachineBasicBlock & MBB
Function Alias Analysis false
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static std::optional< bool > CmpOptionals(T NewVal, T CurVal)
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:712
unsigned getSize(const MachineInstr &MI) const
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
static MachineOperand CreateImm(int64_t Val)
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
bool hasNoDomainDelayShuffle() const
const X86InstrInfo * getInstrInfo() const override
const X86RegisterInfo * getRegisterInfo() const override
bool hasAVX2() const
Changed
Pass manager infrastructure for declaring and invalidating analyses.
LLVM_ABI APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by.
Definition APInt.cpp:3020
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionPass * createX86FixupInstTuningLegacyPass()
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:258
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition MCSchedule.h:366
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition MCSchedule.h:340
static LLVM_ABI int computeInstrLatency(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Returns the latency value for the scheduling class.
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)