LLVM 23.0.0git
X86AvoidStoreForwardingBlocks.cpp
Go to the documentation of this file.
1//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// If a load follows a store and reloads data that the store has written to
10// memory, Intel microarchitectures can in many cases forward the data directly
11// from the store to the load, This "store forwarding" saves cycles by enabling
12// the load to directly obtain the data instead of accessing the data from
13// cache or memory.
14// A "store forward block" occurs in cases that a store cannot be forwarded to
15// the load. The most typical case of store forward block on Intel Core
16// microarchitecture that a small store cannot be forwarded to a large load.
17// The estimated penalty for a store forward block is ~13 cycles.
18//
19// This pass tries to recognize and handle cases where "store forward block"
20// is created by the compiler when lowering memcpy calls to a sequence
21// of a load and a store.
22//
23// The pass currently only handles cases where memcpy is lowered to
24// XMM/YMM registers, it tries to break the memcpy into smaller copies.
25// breaking the memcpy should be possible since there is no atomicity
26// guarantee for loads and stores to XMM/YMM.
27//
28// It could be better for performance to solve the problem by loading
29// to XMM/YMM then inserting the partial store before storing back from XMM/YMM
30// to memory, but this will result in a more conservative optimization since it
31// requires we prove that all memory accesses between the blocking store and the
32// load must alias/don't alias before we can move the store, whereas the
33// transformation done here is correct regardless to other memory accesses.
34//===----------------------------------------------------------------------===//
35
36#include "X86.h"
37#include "X86InstrInfo.h"
38#include "X86Subtarget.h"
47#include "llvm/IR/DebugLoc.h"
48#include "llvm/IR/Function.h"
50#include "llvm/MC/MCInstrDesc.h"
51
52using namespace llvm;
53
54#define DEBUG_TYPE "x86-avoid-sfb"
55
57 "x86-disable-avoid-SFB", cl::Hidden,
58 cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));
59
61 "x86-sfb-inspection-limit",
62 cl::desc("X86: Number of instructions backward to "
63 "inspect for store forwarding blocks."),
64 cl::init(20), cl::Hidden);
65
66namespace {
67
68using DisplacementSizeMap = std::map<int64_t, unsigned>;
69
70class X86AvoidSFBImpl {
71public:
72 X86AvoidSFBImpl(AliasAnalysis *AA) : AA(AA) {};
73 bool runOnMachineFunction(MachineFunction &MF);
74
75private:
76 MachineRegisterInfo *MRI = nullptr;
77 const X86InstrInfo *TII = nullptr;
78 const X86RegisterInfo *TRI = nullptr;
80 BlockedLoadsStoresPairs;
81 SmallVector<MachineInstr *, 2> ForRemoval;
82 AliasAnalysis *AA = nullptr;
83
84 /// Returns couples of Load then Store to memory which look
85 /// like a memcpy.
86 void findPotentiallylBlockedCopies(MachineFunction &MF);
87 /// Break the memcpy's load and store into smaller copies
88 /// such that each memory load that was blocked by a smaller store
89 /// would now be copied separately.
90 void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
91 const DisplacementSizeMap &BlockingStoresDispSizeMap);
92 /// Break a copy of size Size to smaller copies.
93 void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
94 MachineInstr *StoreInst, int64_t StDispImm, int64_t Offset);
95
96 void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
97 MachineInstr *StoreInst, unsigned NStoreOpcode,
98 int64_t StoreDisp, unsigned Size, int64_t Offset);
99
100 bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const;
101
102 unsigned getRegSizeInBytes(MachineInstr *Inst);
103};
104
105class X86AvoidSFBLegacy : public MachineFunctionPass {
106public:
107 static char ID;
108 X86AvoidSFBLegacy() : MachineFunctionPass(ID) {}
109
110 StringRef getPassName() const override {
111 return "X86 Avoid Store Forwarding Blocks";
112 }
113
114 bool runOnMachineFunction(MachineFunction &MF) override;
115
116 void getAnalysisUsage(AnalysisUsage &AU) const override {
118 AU.addRequired<AAResultsWrapperPass>();
119 }
120};
121
122} // end anonymous namespace
123
124char X86AvoidSFBLegacy::ID = 0;
125
126INITIALIZE_PASS_BEGIN(X86AvoidSFBLegacy, DEBUG_TYPE, "Machine code sinking",
127 false, false)
129INITIALIZE_PASS_END(X86AvoidSFBLegacy, DEBUG_TYPE, "Machine code sinking",
131
133 return new X86AvoidSFBLegacy();
134}
135
136static bool isXMMLoadOpcode(unsigned Opcode) {
137 return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
138 Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
139 Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
140 Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
141 Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
142 Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
143 Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
144 Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
145}
146static bool isYMMLoadOpcode(unsigned Opcode) {
147 return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
148 Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
149 Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
150 Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
151 Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
152 Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
153 Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
154}
155
156static bool isPotentialBlockedMemCpyLd(unsigned Opcode) {
157 return isXMMLoadOpcode(Opcode) || isYMMLoadOpcode(Opcode);
158}
159
160static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
161 switch (LdOpcode) {
162 case X86::MOVUPSrm:
163 case X86::MOVAPSrm:
164 return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
165 case X86::VMOVUPSrm:
166 case X86::VMOVAPSrm:
167 return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
168 case X86::VMOVUPDrm:
169 case X86::VMOVAPDrm:
170 return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
171 case X86::VMOVDQUrm:
172 case X86::VMOVDQArm:
173 return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
174 case X86::VMOVUPSZ128rm:
175 case X86::VMOVAPSZ128rm:
176 return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
177 case X86::VMOVUPDZ128rm:
178 case X86::VMOVAPDZ128rm:
179 return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
180 case X86::VMOVUPSYrm:
181 case X86::VMOVAPSYrm:
182 return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
183 case X86::VMOVUPDYrm:
184 case X86::VMOVAPDYrm:
185 return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
186 case X86::VMOVDQUYrm:
187 case X86::VMOVDQAYrm:
188 return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
189 case X86::VMOVUPSZ256rm:
190 case X86::VMOVAPSZ256rm:
191 return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
192 case X86::VMOVUPDZ256rm:
193 case X86::VMOVAPDZ256rm:
194 return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
195 case X86::VMOVDQU64Z128rm:
196 case X86::VMOVDQA64Z128rm:
197 return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
198 case X86::VMOVDQU32Z128rm:
199 case X86::VMOVDQA32Z128rm:
200 return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
201 case X86::VMOVDQU64Z256rm:
202 case X86::VMOVDQA64Z256rm:
203 return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
204 case X86::VMOVDQU32Z256rm:
205 case X86::VMOVDQA32Z256rm:
206 return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
207 default:
208 return false;
209 }
210}
211
212static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
213 bool PBlock = false;
214 PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
215 Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
216 Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
217 Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
218 if (isYMMLoadOpcode(LoadOpcode))
219 PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
220 Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
221 Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
222 Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
223 Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
224 Opcode == X86::VMOVDQU64Z128mr ||
225 Opcode == X86::VMOVDQA64Z128mr ||
226 Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
227 return PBlock;
228}
229
230static const int MOV128SZ = 16;
231static const int MOV64SZ = 8;
232static const int MOV32SZ = 4;
233static const int MOV16SZ = 2;
234static const int MOV8SZ = 1;
235
236static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
237 switch (LoadOpcode) {
238 case X86::VMOVUPSYrm:
239 case X86::VMOVAPSYrm:
240 return X86::VMOVUPSrm;
241 case X86::VMOVUPDYrm:
242 case X86::VMOVAPDYrm:
243 return X86::VMOVUPDrm;
244 case X86::VMOVDQUYrm:
245 case X86::VMOVDQAYrm:
246 return X86::VMOVDQUrm;
247 case X86::VMOVUPSZ256rm:
248 case X86::VMOVAPSZ256rm:
249 return X86::VMOVUPSZ128rm;
250 case X86::VMOVUPDZ256rm:
251 case X86::VMOVAPDZ256rm:
252 return X86::VMOVUPDZ128rm;
253 case X86::VMOVDQU64Z256rm:
254 case X86::VMOVDQA64Z256rm:
255 return X86::VMOVDQU64Z128rm;
256 case X86::VMOVDQU32Z256rm:
257 case X86::VMOVDQA32Z256rm:
258 return X86::VMOVDQU32Z128rm;
259 default:
260 llvm_unreachable("Unexpected Load Instruction Opcode");
261 }
262 return 0;
263}
264
265static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
266 switch (StoreOpcode) {
267 case X86::VMOVUPSYmr:
268 case X86::VMOVAPSYmr:
269 return X86::VMOVUPSmr;
270 case X86::VMOVUPDYmr:
271 case X86::VMOVAPDYmr:
272 return X86::VMOVUPDmr;
273 case X86::VMOVDQUYmr:
274 case X86::VMOVDQAYmr:
275 return X86::VMOVDQUmr;
276 case X86::VMOVUPSZ256mr:
277 case X86::VMOVAPSZ256mr:
278 return X86::VMOVUPSZ128mr;
279 case X86::VMOVUPDZ256mr:
280 case X86::VMOVAPDZ256mr:
281 return X86::VMOVUPDZ128mr;
282 case X86::VMOVDQU64Z256mr:
283 case X86::VMOVDQA64Z256mr:
284 return X86::VMOVDQU64Z128mr;
285 case X86::VMOVDQU32Z256mr:
286 case X86::VMOVDQA32Z256mr:
287 return X86::VMOVDQU32Z128mr;
288 default:
289 llvm_unreachable("Unexpected Load Instruction Opcode");
290 }
291 return 0;
292}
293
294static int getAddrOffset(const MachineInstr *MI) {
295 const MCInstrDesc &Descl = MI->getDesc();
296 int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
297 assert(AddrOffset != -1 && "Expected Memory Operand");
298 AddrOffset += X86II::getOperandBias(Descl);
299 return AddrOffset;
300}
301
303 int AddrOffset = getAddrOffset(MI);
304 return MI->getOperand(AddrOffset + X86::AddrBaseReg);
305}
306
308 int AddrOffset = getAddrOffset(MI);
309 return MI->getOperand(AddrOffset + X86::AddrDisp);
310}
311
312// Relevant addressing modes contain only base register and immediate
313// displacement or frameindex and immediate displacement.
314// TODO: Consider expanding to other addressing modes in the future
316 int AddrOffset = getAddrOffset(MI);
318 const MachineOperand &Disp = getDispOperand(MI);
319 const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
320 const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
321 const MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
322
323 if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
324 return false;
325 if (!Disp.isImm())
326 return false;
327 if (Scale.getImm() != 1)
328 return false;
329 if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
330 return false;
331 if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
332 return false;
333 return true;
334}
335
336// Collect potentially blocking stores.
337// Limit the number of instructions backwards we want to inspect
338// since the effect of store block won't be visible if the store
339// and load instructions have enough instructions in between to
340// keep the core busy.
343 SmallVector<MachineInstr *, 2> PotentialBlockers;
344 unsigned BlockCount = 0;
345 const unsigned InspectionLimit = X86AvoidSFBInspectionLimit;
346 for (auto PBInst = std::next(MachineBasicBlock::reverse_iterator(LoadInst)),
347 E = LoadInst->getParent()->rend();
348 PBInst != E; ++PBInst) {
349 if (PBInst->isMetaInstruction())
350 continue;
351 BlockCount++;
352 if (BlockCount >= InspectionLimit)
353 break;
354 MachineInstr &MI = *PBInst;
355 if (MI.getDesc().isCall())
356 return PotentialBlockers;
357 PotentialBlockers.push_back(&MI);
358 }
359 // If we didn't get to the instructions limit try predecessing blocks.
360 // Ideally we should traverse the predecessor blocks in depth with some
361 // coloring algorithm, but for now let's just look at the first order
362 // predecessors.
363 if (BlockCount < InspectionLimit) {
365 int LimitLeft = InspectionLimit - BlockCount;
366 for (MachineBasicBlock *PMBB : MBB->predecessors()) {
367 int PredCount = 0;
368 for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
369 if (PBInst.isMetaInstruction())
370 continue;
371 PredCount++;
372 if (PredCount >= LimitLeft)
373 break;
374 if (PBInst.getDesc().isCall())
375 break;
376 PotentialBlockers.push_back(&PBInst);
377 }
378 }
379 }
380 return PotentialBlockers;
381}
382
383void X86AvoidSFBImpl::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
384 int64_t LoadDisp, MachineInstr *StoreInst,
385 unsigned NStoreOpcode, int64_t StoreDisp,
386 unsigned Size, int64_t Offset) {
387 MachineOperand &LoadBase = getBaseOperand(LoadInst);
388 MachineOperand &StoreBase = getBaseOperand(StoreInst);
389 MachineBasicBlock *MBB = LoadInst->getParent();
390 MachineMemOperand *LMMO = *LoadInst->memoperands_begin();
391 MachineMemOperand *SMMO = *StoreInst->memoperands_begin();
392
393 Register Reg1 =
394 MRI->createVirtualRegister(TII->getRegClass(TII->get(NLoadOpcode), 0));
395 MachineInstr *NewLoad =
396 BuildMI(*MBB, LoadInst, LoadInst->getDebugLoc(), TII->get(NLoadOpcode),
397 Reg1)
398 .add(LoadBase)
399 .addImm(1)
400 .addReg(X86::NoRegister)
401 .addImm(LoadDisp)
402 .addReg(X86::NoRegister)
405 if (LoadBase.isReg())
406 getBaseOperand(NewLoad).setIsKill(false);
407 LLVM_DEBUG(NewLoad->dump());
408 // If the load and store are consecutive, use the loadInst location to
409 // reduce register pressure.
410 MachineInstr *StInst = StoreInst;
411 auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
412 MBB->instr_begin());
413 if (PrevInstrIt.getNodePtr() == LoadInst)
414 StInst = LoadInst;
415 MachineInstr *NewStore =
416 BuildMI(*MBB, StInst, StInst->getDebugLoc(), TII->get(NStoreOpcode))
417 .add(StoreBase)
418 .addImm(1)
419 .addReg(X86::NoRegister)
420 .addImm(StoreDisp)
421 .addReg(X86::NoRegister)
422 .addReg(Reg1)
425 if (StoreBase.isReg())
426 getBaseOperand(NewStore).setIsKill(false);
427 MachineOperand &StoreSrcVReg = StoreInst->getOperand(X86::AddrNumOperands);
428 assert(StoreSrcVReg.isReg() && "Expected virtual register");
429 NewStore->getOperand(X86::AddrNumOperands).setIsKill(StoreSrcVReg.isKill());
430 LLVM_DEBUG(NewStore->dump());
431}
432
433void X86AvoidSFBImpl::buildCopies(int Size, MachineInstr *LoadInst,
434 int64_t LdDispImm, MachineInstr *StoreInst,
435 int64_t StDispImm, int64_t Offset) {
436 int LdDisp = LdDispImm;
437 int StDisp = StDispImm;
438 while (Size > 0) {
439 if ((Size - MOV128SZ >= 0) && isYMMLoadOpcode(LoadInst->getOpcode())) {
440 Size = Size - MOV128SZ;
441 buildCopy(LoadInst, getYMMtoXMMLoadOpcode(LoadInst->getOpcode()), LdDisp,
442 StoreInst, getYMMtoXMMStoreOpcode(StoreInst->getOpcode()),
443 StDisp, MOV128SZ, Offset);
444 LdDisp += MOV128SZ;
445 StDisp += MOV128SZ;
446 Offset += MOV128SZ;
447 continue;
448 }
449 if (Size - MOV64SZ >= 0) {
450 Size = Size - MOV64SZ;
451 buildCopy(LoadInst, X86::MOV64rm, LdDisp, StoreInst, X86::MOV64mr, StDisp,
452 MOV64SZ, Offset);
453 LdDisp += MOV64SZ;
454 StDisp += MOV64SZ;
455 Offset += MOV64SZ;
456 continue;
457 }
458 if (Size - MOV32SZ >= 0) {
459 Size = Size - MOV32SZ;
460 buildCopy(LoadInst, X86::MOV32rm, LdDisp, StoreInst, X86::MOV32mr, StDisp,
461 MOV32SZ, Offset);
462 LdDisp += MOV32SZ;
463 StDisp += MOV32SZ;
464 Offset += MOV32SZ;
465 continue;
466 }
467 if (Size - MOV16SZ >= 0) {
468 Size = Size - MOV16SZ;
469 buildCopy(LoadInst, X86::MOV16rm, LdDisp, StoreInst, X86::MOV16mr, StDisp,
470 MOV16SZ, Offset);
471 LdDisp += MOV16SZ;
472 StDisp += MOV16SZ;
473 Offset += MOV16SZ;
474 continue;
475 }
476 if (Size - MOV8SZ >= 0) {
477 Size = Size - MOV8SZ;
478 buildCopy(LoadInst, X86::MOV8rm, LdDisp, StoreInst, X86::MOV8mr, StDisp,
479 MOV8SZ, Offset);
480 LdDisp += MOV8SZ;
481 StDisp += MOV8SZ;
482 Offset += MOV8SZ;
483 continue;
484 }
485 }
486 assert(Size == 0 && "Wrong size division");
487}
488
492 auto *StorePrevNonDbgInstr =
494 LoadInst->getParent()->instr_begin())
495 .getNodePtr();
496 if (LoadBase.isReg()) {
497 MachineInstr *LastLoad = LoadInst->getPrevNode();
498 // If the original load and store to xmm/ymm were consecutive
499 // then the partial copies were also created in
500 // a consecutive order to reduce register pressure,
501 // and the location of the last load is before the last store.
502 if (StorePrevNonDbgInstr == LoadInst)
503 LastLoad = LoadInst->getPrevNode()->getPrevNode();
504 getBaseOperand(LastLoad).setIsKill(LoadBase.isKill());
505 }
506 if (StoreBase.isReg()) {
507 MachineInstr *StInst = StoreInst;
508 if (StorePrevNonDbgInstr == LoadInst)
509 StInst = LoadInst;
510 getBaseOperand(StInst->getPrevNode()).setIsKill(StoreBase.isKill());
511 }
512}
513
514bool X86AvoidSFBImpl::alias(const MachineMemOperand &Op1,
515 const MachineMemOperand &Op2) const {
516 if (!Op1.getValue() || !Op2.getValue())
517 return true;
518
519 int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset());
520 int64_t Overlapa = Op1.getSize().getValue() + Op1.getOffset() - MinOffset;
521 int64_t Overlapb = Op2.getSize().getValue() + Op2.getOffset() - MinOffset;
522
523 return !AA->isNoAlias(
524 MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()),
525 MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo()));
526}
527
528void X86AvoidSFBImpl::findPotentiallylBlockedCopies(MachineFunction &MF) {
529 for (auto &MBB : MF)
530 for (auto &MI : MBB) {
531 if (!isPotentialBlockedMemCpyLd(MI.getOpcode()))
532 continue;
533 Register DefVR = MI.getOperand(0).getReg();
534 if (!MRI->hasOneNonDBGUse(DefVR))
535 continue;
536 for (MachineOperand &StoreMO :
538 MachineInstr &StoreMI = *StoreMO.getParent();
539 // Skip cases where the memcpy may overlap.
540 if (StoreMI.getParent() == MI.getParent() &&
541 isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
543 isRelevantAddressingMode(&StoreMI) &&
544 MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
545 // Don't split volatile or atomic accesses.
546 const MachineMemOperand *LMMO = *MI.memoperands_begin();
547 const MachineMemOperand *SMMO = *StoreMI.memoperands_begin();
548 if (LMMO->isVolatile() || LMMO->isAtomic() || SMMO->isVolatile() ||
549 SMMO->isAtomic())
550 continue;
551 if (!alias(*LMMO, *SMMO))
552 BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
553 }
554 }
555 }
556}
557
558unsigned X86AvoidSFBImpl::getRegSizeInBytes(MachineInstr *LoadInst) {
559 const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0);
560 return TRI->getRegSizeInBits(*TRC) / 8;
561}
562
563void X86AvoidSFBImpl::breakBlockedCopies(
564 MachineInstr *LoadInst, MachineInstr *StoreInst,
565 const DisplacementSizeMap &BlockingStoresDispSizeMap) {
566 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
567 int64_t StDispImm = getDispOperand(StoreInst).getImm();
568 int64_t Offset = 0;
569
570 int64_t LdDisp1 = LdDispImm;
571 int64_t LdDisp2 = 0;
572 int64_t StDisp1 = StDispImm;
573 int64_t StDisp2 = 0;
574 unsigned Size1 = 0;
575 unsigned Size2 = 0;
576 int64_t LdStDelta = StDispImm - LdDispImm;
577
578 for (auto DispSizePair : BlockingStoresDispSizeMap) {
579 LdDisp2 = DispSizePair.first;
580 StDisp2 = DispSizePair.first + LdStDelta;
581 Size2 = DispSizePair.second;
582 // Avoid copying overlapping areas.
583 if (LdDisp2 < LdDisp1) {
584 int OverlapDelta = LdDisp1 - LdDisp2;
585 LdDisp2 += OverlapDelta;
586 StDisp2 += OverlapDelta;
587 Size2 -= OverlapDelta;
588 }
589 Size1 = LdDisp2 - LdDisp1;
590
591 // Build a copy for the point until the current blocking store's
592 // displacement.
593 buildCopies(Size1, LoadInst, LdDisp1, StoreInst, StDisp1, Offset);
594 // Build a copy for the current blocking store.
595 buildCopies(Size2, LoadInst, LdDisp2, StoreInst, StDisp2, Offset + Size1);
596 LdDisp1 = LdDisp2 + Size2;
597 StDisp1 = StDisp2 + Size2;
598 Offset += Size1 + Size2;
599 }
600 unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
601 buildCopies(Size3, LoadInst, LdDisp1, StoreInst, StDisp1, Offset);
602}
603
606 const MachineOperand &LoadBase = getBaseOperand(LoadInst);
607 const MachineOperand &StoreBase = getBaseOperand(StoreInst);
608 if (LoadBase.isReg() != StoreBase.isReg())
609 return false;
610 if (LoadBase.isReg())
611 return LoadBase.getReg() == StoreBase.getReg();
612 return LoadBase.getIndex() == StoreBase.getIndex();
613}
614
615static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
616 int64_t StoreDispImm, unsigned StoreSize) {
617 return ((StoreDispImm >= LoadDispImm) &&
618 (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
619}
620
621// Keep track of all stores blocking a load
622static void
623updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
624 int64_t DispImm, unsigned Size) {
625 auto [It, Inserted] = BlockingStoresDispSizeMap.try_emplace(DispImm, Size);
626 // Choose the smallest blocking store starting at this displacement.
627 if (!Inserted && It->second > Size)
628 It->second = Size;
629}
630
631// Remove blocking stores contained in each other.
632static void
633removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
634 if (BlockingStoresDispSizeMap.size() <= 1)
635 return;
636
638 for (auto DispSizePair : BlockingStoresDispSizeMap) {
639 int64_t CurrDisp = DispSizePair.first;
640 unsigned CurrSize = DispSizePair.second;
641 while (DispSizeStack.size()) {
642 int64_t PrevDisp = DispSizeStack.back().first;
643 unsigned PrevSize = DispSizeStack.back().second;
644 if (CurrDisp + CurrSize > PrevDisp + PrevSize)
645 break;
646 DispSizeStack.pop_back();
647 }
648 DispSizeStack.push_back(DispSizePair);
649 }
650 BlockingStoresDispSizeMap.clear();
651 for (auto Disp : DispSizeStack)
652 BlockingStoresDispSizeMap.insert(Disp);
653}
654
655bool X86AvoidSFBImpl::runOnMachineFunction(MachineFunction &MF) {
656 bool Changed = false;
657
659 !MF.getSubtarget<X86Subtarget>().is64Bit())
660 return false;
661
662 MRI = &MF.getRegInfo();
663 assert(MRI->isSSA() && "Expected MIR to be in SSA form");
664 TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
665 TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
666 LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
667 // Look for a load then a store to XMM/YMM which look like a memcpy
668 findPotentiallylBlockedCopies(MF);
669
670 for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
671 MachineInstr *LoadInst = LoadStoreInstPair.first;
672 int64_t LdDispImm = getDispOperand(LoadInst).getImm();
673 DisplacementSizeMap BlockingStoresDispSizeMap;
674
675 SmallVector<MachineInstr *, 2> PotentialBlockers =
676 findPotentialBlockers(LoadInst);
677 for (auto *PBInst : PotentialBlockers) {
678 if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
679 LoadInst->getOpcode()) ||
680 !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
681 continue;
682 int64_t PBstDispImm = getDispOperand(PBInst).getImm();
683 unsigned PBstSize = (*PBInst->memoperands_begin())->getSize().getValue();
684 // This check doesn't cover all cases, but it will suffice for now.
685 // TODO: take branch probability into consideration, if the blocking
686 // store is in an unreached block, breaking the memcopy could lose
687 // performance.
688 if (hasSameBaseOpValue(LoadInst, PBInst) &&
689 isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
690 PBstSize))
691 updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
692 PBstSize);
693 }
694
695 if (BlockingStoresDispSizeMap.empty())
696 continue;
697
698 // We found a store forward block, break the memcpy's load and store
699 // into smaller copies such that each smaller store that was causing
700 // a store block would now be copied separately.
701 MachineInstr *StoreInst = LoadStoreInstPair.second;
702 LLVM_DEBUG(dbgs() << "Blocked load and store instructions: \n");
703 LLVM_DEBUG(LoadInst->dump());
704 LLVM_DEBUG(StoreInst->dump());
705 LLVM_DEBUG(dbgs() << "Replaced with:\n");
706 removeRedundantBlockingStores(BlockingStoresDispSizeMap);
707 breakBlockedCopies(LoadInst, StoreInst, BlockingStoresDispSizeMap);
708 updateKillStatus(LoadInst, StoreInst);
709 ForRemoval.push_back(LoadInst);
710 ForRemoval.push_back(StoreInst);
711 }
712 for (auto *RemovedInst : ForRemoval) {
713 RemovedInst->eraseFromParent();
714 }
715 ForRemoval.clear();
716 BlockedLoadsStoresPairs.clear();
717 LLVM_DEBUG(dbgs() << "End X86AvoidStoreForwardBlocks\n";);
718
719 return Changed;
720}
721
722bool X86AvoidSFBLegacy::runOnMachineFunction(MachineFunction &MF) {
723 if (skipFunction(MF.getFunction()))
724 return false;
725 AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
726 X86AvoidSFBImpl Impl(AA);
727 return Impl.runOnMachineFunction(MF);
728}
729
730PreservedAnalyses
735 .getManager()
736 .getResult<AAManager>(MF.getFunction());
737 X86AvoidSFBImpl Impl(AA);
738 bool Changed = Impl.runOnMachineFunction(MF);
741}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
#define LLVM_DEBUG(...)
Definition Debug.h:119
static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode)
static bool isPotentialBlockedMemCpyLd(unsigned Opcode)
static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode)
static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode)
static const int MOV64SZ
static const int MOV8SZ
static bool isXMMLoadOpcode(unsigned Opcode)
static int getAddrOffset(const MachineInstr *MI)
static cl::opt< unsigned > X86AvoidSFBInspectionLimit("x86-sfb-inspection-limit", cl::desc("X86: Number of instructions backward to " "inspect for store forwarding blocks."), cl::init(20), cl::Hidden)
static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize, int64_t StoreDispImm, unsigned StoreSize)
static bool isRelevantAddressingMode(MachineInstr *MI)
static cl::opt< bool > DisableX86AvoidStoreForwardBlocks("x86-disable-avoid-SFB", cl::Hidden, cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false))
static void removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap)
static const int MOV16SZ
static bool hasSameBaseOpValue(MachineInstr *LoadInst, MachineInstr *StoreInst)
static void updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap, int64_t DispImm, unsigned Size)
static MachineOperand & getBaseOperand(MachineInstr *MI)
static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode)
static SmallVector< MachineInstr *, 2 > findPotentialBlockers(MachineInstr *LoadInst)
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst)
static const int MOV32SZ
static MachineOperand & getDispOperand(MachineInstr *MI)
static bool isYMMLoadOpcode(unsigned Opcode)
static const int MOV128SZ
A manager for alias analyses.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
An instruction for reading from memory.
TypeSize getValue() const
Describe properties that are true of each instruction in the target description file.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void dump() const
const MachineOperand & getOperand(unsigned i) const
LocationSize getSize() const
Return the size in bytes of the memory reference.
bool isAtomic() const
Returns true if this operation has an atomic ordering requirement of unordered or higher,...
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
const ParentTy * getParent() const
Definition ilist_node.h:34
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
int getMemoryOperandNo(uint64_t TSFlags)
unsigned getOperandBias(const MCInstrDesc &Desc)
Compute whether all of the def operands are repeated in the uses and therefore should be skipped.
@ AddrNumOperands
Definition X86BaseInfo.h:36
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
FunctionPass * createX86AvoidStoreForwardingBlocksLegacyPass()
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
IterT prev_nodbg(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It, then continue decrementing it while it points to a debug instruction.