LLVM  10.0.0svn
X86VZeroUpper.cpp
Go to the documentation of this file.
1 //===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the pass which inserts x86 AVX vzeroupper instructions
10 // before calls to SSE encoded functions. This avoids transition latency
11 // penalty when transferring control between AVX encoded instructions and old
12 // SSE encoding mode.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "X86.h"
17 #include "X86InstrInfo.h"
18 #include "X86Subtarget.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
30 #include "llvm/IR/CallingConv.h"
31 #include "llvm/IR/DebugLoc.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/Support/Debug.h"
36 #include <cassert>
37 
38 using namespace llvm;
39 
40 #define DEBUG_TYPE "x86-vzeroupper"
41 
42 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
43 
44 namespace {
45 
46  class VZeroUpperInserter : public MachineFunctionPass {
47  public:
48  VZeroUpperInserter() : MachineFunctionPass(ID) {}
49 
50  bool runOnMachineFunction(MachineFunction &MF) override;
51 
52  MachineFunctionProperties getRequiredProperties() const override {
55  }
56 
57  StringRef getPassName() const override { return "X86 vzeroupper inserter"; }
58 
59  private:
60  void processBasicBlock(MachineBasicBlock &MBB);
61  void insertVZeroUpper(MachineBasicBlock::iterator I,
62  MachineBasicBlock &MBB);
63  void addDirtySuccessor(MachineBasicBlock &MBB);
64 
65  using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
66 
67  static const char* getBlockExitStateName(BlockExitState ST);
68 
69  // Core algorithm state:
70  // BlockState - Each block is either:
71  // - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor
72  // vzeroupper instructions in this block.
73  // - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
74  // block that will ensure that YMM/ZMM is clean on exit.
75  // - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no
76  // subsequent vzeroupper in the block clears it.
77  //
78  // AddedToDirtySuccessors - This flag is raised when a block is added to the
79  // DirtySuccessors list to ensure that it's not
80  // added multiple times.
81  //
82  // FirstUnguardedCall - Records the location of the first unguarded call in
83  // each basic block that may need to be guarded by a
84  // vzeroupper. We won't know whether it actually needs
85  // to be guarded until we discover a predecessor that
86  // is DIRTY_OUT.
87  struct BlockState {
88  BlockExitState ExitState = PASS_THROUGH;
89  bool AddedToDirtySuccessors = false;
90  MachineBasicBlock::iterator FirstUnguardedCall;
91 
92  BlockState() = default;
93  };
94 
95  using BlockStateMap = SmallVector<BlockState, 8>;
96  using DirtySuccessorsWorkList = SmallVector<MachineBasicBlock *, 8>;
97 
98  BlockStateMap BlockStates;
99  DirtySuccessorsWorkList DirtySuccessors;
100  bool EverMadeChange;
101  bool IsX86INTR;
102  const TargetInstrInfo *TII;
103 
104  static char ID;
105  };
106 
107 } // end anonymous namespace
108 
109 char VZeroUpperInserter::ID = 0;
110 
112  return new VZeroUpperInserter();
113 }
114 
115 #ifndef NDEBUG
116 const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
117  switch (ST) {
118  case PASS_THROUGH: return "Pass-through";
119  case EXITS_DIRTY: return "Exits-dirty";
120  case EXITS_CLEAN: return "Exits-clean";
121  }
122  llvm_unreachable("Invalid block exit state.");
123 }
124 #endif
125 
126 /// VZEROUPPER cleans state that is related to Y/ZMM0-15 only.
127 /// Thus, there is no need to check for Y/ZMM16 and above.
128 static bool isYmmOrZmmReg(unsigned Reg) {
129  return (Reg >= X86::YMM0 && Reg <= X86::YMM15) ||
130  (Reg >= X86::ZMM0 && Reg <= X86::ZMM15);
131 }
132 
134  for (std::pair<unsigned, unsigned> LI : MRI.liveins())
135  if (isYmmOrZmmReg(LI.first))
136  return true;
137 
138  return false;
139 }
140 
141 static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) {
142  for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
143  if (!MO.clobbersPhysReg(reg))
144  return false;
145  }
146  for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) {
147  if (!MO.clobbersPhysReg(reg))
148  return false;
149  }
150  return true;
151 }
152 
154  for (const MachineOperand &MO : MI.operands()) {
155  if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO))
156  return true;
157  if (!MO.isReg())
158  continue;
159  if (MO.isDebug())
160  continue;
161  if (isYmmOrZmmReg(MO.getReg()))
162  return true;
163  }
164  return false;
165 }
166 
167 /// Check if given call instruction has a RegMask operand.
169  assert(MI.isCall() && "Can only be called on call instructions.");
170  for (const MachineOperand &MO : MI.operands()) {
171  if (MO.isRegMask())
172  return true;
173  }
174  return false;
175 }
176 
177 /// Insert a vzeroupper instruction before I.
178 void VZeroUpperInserter::insertVZeroUpper(MachineBasicBlock::iterator I,
179  MachineBasicBlock &MBB) {
180  DebugLoc dl = I->getDebugLoc();
181  BuildMI(MBB, I, dl, TII->get(X86::VZEROUPPER));
182  ++NumVZU;
183  EverMadeChange = true;
184 }
185 
186 /// Add MBB to the DirtySuccessors list if it hasn't already been added.
187 void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
188  if (!BlockStates[MBB.getNumber()].AddedToDirtySuccessors) {
189  DirtySuccessors.push_back(&MBB);
190  BlockStates[MBB.getNumber()].AddedToDirtySuccessors = true;
191  }
192 }
193 
194 /// Loop over all of the instructions in the basic block, inserting vzeroupper
195 /// instructions before function calls.
196 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
197  // Start by assuming that the block is PASS_THROUGH which implies no unguarded
198  // calls.
199  BlockExitState CurState = PASS_THROUGH;
200  BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
201 
202  for (MachineInstr &MI : MBB) {
203  bool IsCall = MI.isCall();
204  bool IsReturn = MI.isReturn();
205  bool IsControlFlow = IsCall || IsReturn;
206 
207  // No need for vzeroupper before iret in interrupt handler function,
208  // epilogue will restore YMM/ZMM registers if needed.
209  if (IsX86INTR && IsReturn)
210  continue;
211 
212  // An existing VZERO* instruction resets the state.
213  if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
214  CurState = EXITS_CLEAN;
215  continue;
216  }
217 
218  // Shortcut: don't need to check regular instructions in dirty state.
219  if (!IsControlFlow && CurState == EXITS_DIRTY)
220  continue;
221 
222  if (hasYmmOrZmmReg(MI)) {
223  // We found a ymm/zmm-using instruction; this could be an AVX/AVX512
224  // instruction, or it could be control flow.
225  CurState = EXITS_DIRTY;
226  continue;
227  }
228 
229  // Check for control-flow out of the current function (which might
230  // indirectly execute SSE instructions).
231  if (!IsControlFlow)
232  continue;
233 
234  // If the call has no RegMask, skip it as well. It usually happens on
235  // helper function calls (such as '_chkstk', '_ftol2') where standard
236  // calling convention is not used (RegMask is not used to mark register
237  // clobbered and register usage (def/implicit-def/use) is well-defined and
238  // explicitly specified.
239  if (IsCall && !callHasRegMask(MI))
240  continue;
241 
242  // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15
243  // registers. In addition, the processor changes back to Clean state, after
244  // which execution of SSE instructions or AVX instructions has no transition
245  // penalty. Add the VZEROUPPER instruction before any function call/return
246  // that might execute SSE code.
247  // FIXME: In some cases, we may want to move the VZEROUPPER into a
248  // predecessor block.
249  if (CurState == EXITS_DIRTY) {
250  // After the inserted VZEROUPPER the state becomes clean again, but
251  // other YMM/ZMM may appear before other subsequent calls or even before
252  // the end of the BB.
253  insertVZeroUpper(MI, MBB);
254  CurState = EXITS_CLEAN;
255  } else if (CurState == PASS_THROUGH) {
256  // If this block is currently in pass-through state and we encounter a
257  // call then whether we need a vzeroupper or not depends on whether this
258  // block has successors that exit dirty. Record the location of the call,
259  // and set the state to EXITS_CLEAN, but do not insert the vzeroupper yet.
260  // It will be inserted later if necessary.
261  BlockStates[MBB.getNumber()].FirstUnguardedCall = MI;
262  CurState = EXITS_CLEAN;
263  }
264  }
265 
266  LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber() << " exit state: "
267  << getBlockExitStateName(CurState) << '\n');
268 
269  if (CurState == EXITS_DIRTY)
270  for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
271  SE = MBB.succ_end();
272  SI != SE; ++SI)
273  addDirtySuccessor(**SI);
274 
275  BlockStates[MBB.getNumber()].ExitState = CurState;
276 }
277 
278 /// Loop over all of the basic blocks, inserting vzeroupper instructions before
279 /// function calls.
280 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
281  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
282  if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
283  return false;
284  TII = ST.getInstrInfo();
286  EverMadeChange = false;
287  IsX86INTR = MF.getFunction().getCallingConv() == CallingConv::X86_INTR;
288 
289  bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI);
290 
291  // Fast check: if the function doesn't use any ymm/zmm registers, we don't
292  // need to insert any VZEROUPPER instructions. This is constant-time, so it
293  // is cheap in the common case of no ymm/zmm use.
294  bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
295  const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass};
296  for (auto *RC : RCs) {
297  if (!YmmOrZmmUsed) {
298  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
299  i++) {
300  if (!MRI.reg_nodbg_empty(*i)) {
301  YmmOrZmmUsed = true;
302  break;
303  }
304  }
305  }
306  }
307  if (!YmmOrZmmUsed) {
308  return false;
309  }
310 
311  assert(BlockStates.empty() && DirtySuccessors.empty() &&
312  "X86VZeroUpper state should be clear");
313  BlockStates.resize(MF.getNumBlockIDs());
314 
315  // Process all blocks. This will compute block exit states, record the first
316  // unguarded call in each block, and add successors of dirty blocks to the
317  // DirtySuccessors list.
318  for (MachineBasicBlock &MBB : MF)
319  processBasicBlock(MBB);
320 
321  // If any YMM/ZMM regs are live-in to this function, add the entry block to
322  // the DirtySuccessors list
323  if (FnHasLiveInYmmOrZmm)
324  addDirtySuccessor(MF.front());
325 
326  // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
327  // vzeroupper instructions to unguarded calls, and propagate EXITS_DIRTY
328  // through PASS_THROUGH blocks.
329  while (!DirtySuccessors.empty()) {
330  MachineBasicBlock &MBB = *DirtySuccessors.back();
331  DirtySuccessors.pop_back();
332  BlockState &BBState = BlockStates[MBB.getNumber()];
333 
334  // MBB is a successor of a dirty block, so its first call needs to be
335  // guarded.
336  if (BBState.FirstUnguardedCall != MBB.end())
337  insertVZeroUpper(BBState.FirstUnguardedCall, MBB);
338 
339  // If this successor was a pass-through block, then it is now dirty. Its
340  // successors need to be added to the worklist (if they haven't been
341  // already).
342  if (BBState.ExitState == PASS_THROUGH) {
343  LLVM_DEBUG(dbgs() << "MBB #" << MBB.getNumber()
344  << " was Pass-through, is now Dirty-out.\n");
345  for (MachineBasicBlock *Succ : MBB.successors())
346  addDirtySuccessor(*Succ);
347  }
348  }
349 
350  BlockStates.clear();
351  return EverMadeChange;
352 }
bool reg_nodbg_empty(unsigned RegNo) const
reg_nodbg_empty - Return true if the only instructions using or defining Reg are Debug instructions...
bool hasAVX() const
Definition: X86Subtarget.h:581
bool isCall(QueryType Type=AnyInBundle) const
Definition: MachineInstr.h:635
This class represents lattice values for constants.
Definition: AllocatorList.h:23
static bool hasYmmOrZmmReg(MachineInstr &MI)
unsigned getNumBlockIDs() const
getNumBlockIDs - Return the number of MBB ID's allocated.
static bool isYmmOrZmmReg(unsigned Reg)
VZEROUPPER cleans state that is related to Y/ZMM0-15 only.
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:501
unsigned Reg
STATISTIC(NumFunctions, "Total number of functions")
A debug info location.
Definition: DebugLoc.h:33
FunctionPass * createX86IssueVZeroUpperPass()
This pass inserts AVX vzeroupper instructions before each call to avoid transition penalty between fu...
iterator_range< mop_iterator > operands()
Definition: MachineInstr.h:461
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
TargetInstrInfo - Interface to description of machine instruction set.
X86_INTR - x86 hardware interrupt context.
Definition: CallingConv.h:173
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
unsigned const MachineRegisterInfo * MRI
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
static bool callHasRegMask(MachineInstr &MI)
Check if given call instruction has a RegMask operand.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:284
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool hasFastPartialYMMorZMMWrite() const
Definition: X86Subtarget.h:652
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:212
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
static bool clobbersPhysReg(const uint32_t *RegMask, unsigned PhysReg)
clobbersPhysReg - Returns true if this RegMask clobbers PhysReg.
ArrayRef< std::pair< unsigned, unsigned > > liveins() const
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
MachineFunctionProperties & set(Property P)
Representation of each machine instruction.
Definition: MachineInstr.h:64
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
#define I(x, y, z)
Definition: MD5.cpp:58
static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO)
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
#define LLVM_DEBUG(X)
Definition: Debug.h:122
std::vector< MachineBasicBlock * >::iterator succ_iterator
Properties which a MachineFunction may have at a given point in time.