#define DEBUG_TYPE "x86-avoid-SFB"
static cl::opt<unsigned> X86AvoidSFBInspectionLimit(
    "x86-sfb-inspection-limit",
    cl::desc("X86: Number of instructions backward to "
             "inspect for store forwarding blocks."),
    cl::init(20), cl::Hidden);
using DisplacementSizeMap = std::map<int64_t, unsigned>;
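// A DisplacementSizeMap records each blocking store by its address
// displacement within the copied region (key) and the number of bytes it
// stores (value). Using std::map keeps the entries sorted by displacement,
// which is the order in which the blocked copy is later rebuilt.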
  StringRef getPassName() const override {
    return "X86 Avoid Store Forwarding Blocks";
  }
  SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
      BlockedLoadsStoresPairs;
  void breakBlockedCopies(MachineInstr *LoadInst, MachineInstr *StoreInst,
                          const DisplacementSizeMap &BlockingStoresDispSizeMap);
  void buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm,
                   MachineInstr *StoreInst, int64_t StDispImm,
                   int64_t LMMOffset, int64_t SMMOffset);
  void buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp,
                 MachineInstr *StoreInst, unsigned NStoreOpcode,
                 int64_t StoreDisp, unsigned Size, int64_t LMMOffset,
                 int64_t SMMOffset);
char X86AvoidSFBPass::ID = 0;
FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
  return new X86AvoidSFBPass();
}
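// isXMMLoadOpcode / isYMMLoadOpcode identify the vector loads this pass is
// willing to rewrite: the 128-bit (SSE/AVX/AVX-512VL) moves below and their
// 256-bit counterparts, in both aligned and unaligned forms.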
static bool isXMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm ||
         Opcode == X86::VMOVUPSrm || Opcode == X86::VMOVAPSrm ||
         Opcode == X86::VMOVUPDrm || Opcode == X86::VMOVAPDrm ||
         Opcode == X86::VMOVDQUrm || Opcode == X86::VMOVDQArm ||
         Opcode == X86::VMOVUPSZ128rm || Opcode == X86::VMOVAPSZ128rm ||
         Opcode == X86::VMOVUPDZ128rm || Opcode == X86::VMOVAPDZ128rm ||
         Opcode == X86::VMOVDQU64Z128rm || Opcode == X86::VMOVDQA64Z128rm ||
         Opcode == X86::VMOVDQU32Z128rm || Opcode == X86::VMOVDQA32Z128rm;
}
static bool isYMMLoadOpcode(unsigned Opcode) {
  return Opcode == X86::VMOVUPSYrm || Opcode == X86::VMOVAPSYrm ||
         Opcode == X86::VMOVUPDYrm || Opcode == X86::VMOVAPDYrm ||
         Opcode == X86::VMOVDQUYrm || Opcode == X86::VMOVDQAYrm ||
         Opcode == X86::VMOVUPSZ256rm || Opcode == X86::VMOVAPSZ256rm ||
         Opcode == X86::VMOVUPDZ256rm || Opcode == X86::VMOVAPDZ256rm ||
         Opcode == X86::VMOVDQU64Z256rm || Opcode == X86::VMOVDQA64Z256rm ||
         Opcode == X86::VMOVDQU32Z256rm || Opcode == X86::VMOVDQA32Z256rm;
}
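// isPotentialBlockedMemCpyPair returns true when the store opcode writes back
// exactly the value the load opcode read, i.e. the two instructions together
// form a register-sized memory-to-memory copy that the pass may need to split.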
static bool isPotentialBlockedMemCpyPair(unsigned LdOpcode, unsigned StOpcode) {
  switch (LdOpcode) {
  case X86::MOVUPSrm:
  case X86::MOVAPSrm:
    return StOpcode == X86::MOVUPSmr || StOpcode == X86::MOVAPSmr;
  case X86::VMOVUPSrm:
  case X86::VMOVAPSrm:
    return StOpcode == X86::VMOVUPSmr || StOpcode == X86::VMOVAPSmr;
  case X86::VMOVUPDrm:
  case X86::VMOVAPDrm:
    return StOpcode == X86::VMOVUPDmr || StOpcode == X86::VMOVAPDmr;
  case X86::VMOVDQUrm:
  case X86::VMOVDQArm:
    return StOpcode == X86::VMOVDQUmr || StOpcode == X86::VMOVDQAmr;
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm:
    return StOpcode == X86::VMOVUPSZ128mr || StOpcode == X86::VMOVAPSZ128mr;
  case X86::VMOVUPDZ128rm:
  case X86::VMOVAPDZ128rm:
    return StOpcode == X86::VMOVUPDZ128mr || StOpcode == X86::VMOVAPDZ128mr;
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return StOpcode == X86::VMOVUPSYmr || StOpcode == X86::VMOVAPSYmr;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return StOpcode == X86::VMOVUPDYmr || StOpcode == X86::VMOVAPDYmr;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return StOpcode == X86::VMOVDQUYmr || StOpcode == X86::VMOVDQAYmr;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return StOpcode == X86::VMOVUPSZ256mr || StOpcode == X86::VMOVAPSZ256mr;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return StOpcode == X86::VMOVUPDZ256mr || StOpcode == X86::VMOVAPDZ256mr;
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQA64Z128rm:
    return StOpcode == X86::VMOVDQU64Z128mr || StOpcode == X86::VMOVDQA64Z128mr;
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA32Z128rm:
    return StOpcode == X86::VMOVDQU32Z128mr || StOpcode == X86::VMOVDQA32Z128mr;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return StOpcode == X86::VMOVDQU64Z256mr || StOpcode == X86::VMOVDQA64Z256mr;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return StOpcode == X86::VMOVDQU32Z256mr || StOpcode == X86::VMOVDQA32Z256mr;
  default:
    return false;
  }
}
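// A store is a potential blocker only if it is narrower than the load it
// precedes: the plain GPR stores below can block any of the handled vector
// loads, and the 128-bit vector stores can additionally block a 256-bit load.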
static bool isPotentialBlockingStoreInst(unsigned Opcode, unsigned LoadOpcode) {
  bool PBlock = false;
  PBlock |= Opcode == X86::MOV64mr || Opcode == X86::MOV64mi32 ||
            Opcode == X86::MOV32mr || Opcode == X86::MOV32mi ||
            Opcode == X86::MOV16mr || Opcode == X86::MOV16mi ||
            Opcode == X86::MOV8mr || Opcode == X86::MOV8mi;
  if (isYMMLoadOpcode(LoadOpcode))
    PBlock |= Opcode == X86::VMOVUPSmr || Opcode == X86::VMOVAPSmr ||
              Opcode == X86::VMOVUPDmr || Opcode == X86::VMOVAPDmr ||
              Opcode == X86::VMOVDQUmr || Opcode == X86::VMOVDQAmr ||
              Opcode == X86::VMOVUPSZ128mr || Opcode == X86::VMOVAPSZ128mr ||
              Opcode == X86::VMOVUPDZ128mr || Opcode == X86::VMOVAPDZ128mr ||
              Opcode == X86::VMOVDQU64Z128mr || Opcode == X86::VMOVDQA64Z128mr ||
              Opcode == X86::VMOVDQU32Z128mr || Opcode == X86::VMOVDQA32Z128mr;
  return PBlock;
}
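// When a blocked 256-bit copy is split, the two helpers below translate the
// original YMM load/store opcode into a 128-bit opcode of the same data type;
// aligned forms map to the unaligned 128-bit variants as well.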
static unsigned getYMMtoXMMLoadOpcode(unsigned LoadOpcode) {
  switch (LoadOpcode) {
  case X86::VMOVUPSYrm:
  case X86::VMOVAPSYrm:
    return X86::VMOVUPSrm;
  case X86::VMOVUPDYrm:
  case X86::VMOVAPDYrm:
    return X86::VMOVUPDrm;
  case X86::VMOVDQUYrm:
  case X86::VMOVDQAYrm:
    return X86::VMOVDQUrm;
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm:
    return X86::VMOVUPSZ128rm;
  case X86::VMOVUPDZ256rm:
  case X86::VMOVAPDZ256rm:
    return X86::VMOVUPDZ128rm;
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQA64Z256rm:
    return X86::VMOVDQU64Z128rm;
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA32Z256rm:
    return X86::VMOVDQU32Z128rm;
  default:
    llvm_unreachable("Unexpected Load Opcode");
  }
}
static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
  switch (StoreOpcode) {
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
    return X86::VMOVUPSmr;
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
    return X86::VMOVUPDmr;
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
    return X86::VMOVDQUmr;
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
    return X86::VMOVUPSZ128mr;
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
    return X86::VMOVUPDZ128mr;
  case X86::VMOVDQU64Z256mr:
  case X86::VMOVDQA64Z256mr:
    return X86::VMOVDQU64Z128mr;
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA32Z256mr:
    return X86::VMOVDQU32Z128mr;
  default:
    llvm_unreachable("Unexpected Store Opcode");
  }
}
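// getAddrOffset returns the index of the first memory-reference operand of the
// instruction (hence the assertion below), and isRelevantAddressingMode then
// restricts the optimization to the simplest addressing form: a real base
// register or frame index, no index register and no segment override.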
  assert(AddrOffset != -1 && "Expected Memory Operand");
  if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
    return false;
  if (!(Index.isReg() && Index.getReg() == X86::NoRegister))
    return false;
  if (!(Segment.isReg() && Segment.getReg() == X86::NoRegister))
    return false;
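// findPotentialBlockers walks backwards from the load, skipping meta
// instructions, and collects up to X86AvoidSFBInspectionLimit preceding
// instructions as candidate blockers; a call terminates the search. If the
// limit is not reached inside the load's block, the tail ends of the
// first-order predecessor blocks are inspected with the remaining budget.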
  unsigned BlockCount = 0;
       PBInst != E; ++PBInst) {
    if (PBInst->isMetaInstruction())
      continue;
    BlockCount++;
    if (BlockCount >= InspectionLimit)
      break;
    if (MI.getDesc().isCall())
      return PotentialBlockers;
  if (BlockCount < InspectionLimit) {
    int LimitLeft = InspectionLimit - BlockCount;
        if (PBInst.isMetaInstruction())
          continue;
        PredCount++;
        if (PredCount >= LimitLeft)
          break;
        if (PBInst.getDesc().isCall())
          break;
  return PotentialBlockers;
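// buildCopy emits one narrower load/store pair: a load of Size bytes from
// LoadDisp and a store of that value to StoreDisp, reusing the original base
// operands and attaching new machine memory operands at LMMOffset/SMMOffset.
// buildCopies (further below) covers an arbitrary Size-byte region by calling
// buildCopy with the widest chunk that still fits, from MOV128SZ down to
// MOV8SZ, advancing the displacements and offsets after each piece.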
                                unsigned NStoreOpcode, int64_t StoreDisp,
                                unsigned Size, int64_t LMMOffset,
  if (LoadBase.isReg())
  if (PrevInstrIt.getNodePtr() == LoadInst)
  if (StoreBase.isReg())
  assert(StoreSrcVReg.isReg() && "Expected virtual register");
                                  int64_t StDispImm, int64_t LMMOffset,
  int LdDisp = LdDispImm;
  int StDisp = StDispImm;
              StDisp, MOV128SZ, LMMOffset, SMMOffset);
              MOV64SZ, LMMOffset, SMMOffset);
              MOV32SZ, LMMOffset, SMMOffset);
              MOV16SZ, LMMOffset, SMMOffset);
              MOV8SZ, LMMOffset, SMMOffset);
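// updateKillStatus: after the wide copy has been replaced by several smaller
// ones, the kill flags on the load and store base registers have to be moved
// from the original instructions to the last of the newly created instructions
// that still uses each base register.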
  auto *StorePrevNonDbgInstr =
  if (LoadBase.isReg()) {
    if (StorePrevNonDbgInstr == LoadInst)
  if (StoreBase.isReg()) {
    if (StorePrevNonDbgInstr == LoadInst)
  return !AA->isNoAlias(
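// findPotentiallylBlockedCopies scans every instruction in the function for a
// relevant vector load whose defined register has exactly one non-debug use,
// and that use is a matching vector store (isPotentialBlockedMemCpyPair).
// Each such load/store pair is recorded in BlockedLoadsStoresPairs.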
void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
    for (auto &MI : MBB) {
      int DefVR = MI.getOperand(0).getReg();
      if (!MRI->hasOneNonDBGUse(DefVR))
      BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
  return TRI->getRegSizeInBits(*TRC) / 8;
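// breakBlockedCopies rebuilds the blocked copy around the blocking stores.
// Walking the blocking stores in increasing displacement order, it first
// copies the bytes in front of the current blocking store (Size1), then the
// bytes the blocking store itself covers (Size2), trimming any overlap with
// the region already copied; the tail after the last blocking store (Size3)
// is copied at the end. Splitting on these boundaries lets each narrow load
// be forwarded from the store that produced its bytes.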
void X86AvoidSFBPass::breakBlockedCopies(
    MachineInstr *LoadInst, MachineInstr *StoreInst,
    const DisplacementSizeMap &BlockingStoresDispSizeMap) {
  int64_t LMMOffset = 0;
  int64_t SMMOffset = 0;
  int64_t LdDisp1 = LdDispImm;
  int64_t StDisp1 = StDispImm;
  int64_t LdStDelta = StDispImm - LdDispImm;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    LdDisp2 = DispSizePair.first;
    StDisp2 = DispSizePair.first + LdStDelta;
    Size2 = DispSizePair.second;
    // Avoid copying a region that was already covered.
    if (LdDisp2 < LdDisp1) {
      int OverlapDelta = LdDisp1 - LdDisp2;
      LdDisp2 += OverlapDelta;
      StDisp2 += OverlapDelta;
      Size2 -= OverlapDelta;
    }
    Size1 = LdDisp2 - LdDisp1;
    LdDisp1 = LdDisp2 + Size2;
    StDisp1 = StDisp2 + Size2;
    LMMOffset += Size1 + Size2;
    SMMOffset += Size1 + Size2;
  }
  unsigned Size3 = (LdDispImm + getRegSizeInBytes(LoadInst)) - LdDisp1;
  if (LoadBase.isReg())
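// isBlockingStore: a store blocks the load when its range
// [StoreDispImm, StoreDispImm + StoreSize) is entirely contained in the loaded
// range [LoadDispImm, LoadDispImm + LoadSize); such a smaller store is the
// store-forwarding-block scenario this pass breaks up.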
static bool isBlockingStore(int64_t LoadDispImm, unsigned LoadSize,
                            int64_t StoreDispImm, unsigned StoreSize) {
  return ((StoreDispImm >= LoadDispImm) &&
          (StoreDispImm <= LoadDispImm + (LoadSize - StoreSize)));
}
static void
updateBlockingStoresDispSizeMap(DisplacementSizeMap &BlockingStoresDispSizeMap,
                                int64_t DispImm, unsigned Size) {
  if (BlockingStoresDispSizeMap.count(DispImm)) {
    // Keep the smallest store seen at this displacement.
    if (BlockingStoresDispSizeMap[DispImm] > Size)
      BlockingStoresDispSizeMap[DispImm] = Size;
  } else
    BlockingStoresDispSizeMap[DispImm] = Size;
}
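// removeRedundantBlockingStores drops map entries whose byte range is already
// implied by a neighbouring entry: when a store's range is contained inside
// the previous one, only the inner entry survives, so the copy is not split
// twice over the same bytes.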
static void
removeRedundantBlockingStores(DisplacementSizeMap &BlockingStoresDispSizeMap) {
  if (BlockingStoresDispSizeMap.size() <= 1)
    return;
  for (auto DispSizePair : BlockingStoresDispSizeMap) {
    int64_t CurrDisp = DispSizePair.first;
    unsigned CurrSize = DispSizePair.second;
    while (DispSizeStack.size()) {
      int64_t PrevDisp = DispSizeStack.back().first;
      unsigned PrevSize = DispSizeStack.back().second;
      if (CurrDisp + CurrSize > PrevDisp + PrevSize)
        break;
  BlockingStoresDispSizeMap.clear();
  for (auto Disp : DispSizeStack)
    BlockingStoresDispSizeMap.insert(Disp);
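// runOnMachineFunction drives the pass: after checking that the function is in
// SSA form and fetching alias analysis, it collects the candidate load/store
// pairs, then for each pair walks the potential blockers found before the
// load. Blocking stores that hit inside the loaded range are recorded by
// displacement and size, redundant entries are removed, and if any remain the
// wide copy is broken up; the original load and store are erased afterwards
// and the candidate list is cleared.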
  bool Changed = false;
  assert(MRI->isSSA() && "Expected MIR to be in SSA form");
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  findPotentiallylBlockedCopies(MF);
  for (auto LoadStoreInstPair : BlockedLoadsStoresPairs) {
    DisplacementSizeMap BlockingStoresDispSizeMap;
    for (auto *PBInst : PotentialBlockers) {
      unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
    if (BlockingStoresDispSizeMap.empty())
  for (auto *RemovedInst : ForRemoval) {
    RemovedInst->eraseFromParent();
  BlockedLoadsStoresPairs.clear();