File: | lib/Target/AMDGPU/SIMemoryLegalizer.cpp |
Warning: | line 148, column 7 Value stored to 'IsCrossAddressSpaceOrdering' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- SIMemoryLegalizer.cpp ----------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Memory legalizer - implements memory model. More information can be |
11 | /// found here: |
12 | /// http://llvm.org/docs/AMDGPUUsage.html#memory-model |
13 | // |
14 | //===----------------------------------------------------------------------===// |
15 | |
16 | #include "AMDGPU.h" |
17 | #include "AMDGPUMachineModuleInfo.h" |
18 | #include "AMDGPUSubtarget.h" |
19 | #include "SIDefines.h" |
20 | #include "SIInstrInfo.h" |
21 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
22 | #include "Utils/AMDGPUBaseInfo.h" |
23 | #include "llvm/ADT/BitmaskEnum.h" |
24 | #include "llvm/ADT/None.h" |
25 | #include "llvm/ADT/Optional.h" |
26 | #include "llvm/CodeGen/MachineBasicBlock.h" |
27 | #include "llvm/CodeGen/MachineFunction.h" |
28 | #include "llvm/CodeGen/MachineFunctionPass.h" |
29 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
30 | #include "llvm/CodeGen/MachineMemOperand.h" |
31 | #include "llvm/CodeGen/MachineModuleInfo.h" |
32 | #include "llvm/CodeGen/MachineOperand.h" |
33 | #include "llvm/IR/DebugLoc.h" |
34 | #include "llvm/IR/DiagnosticInfo.h" |
35 | #include "llvm/IR/Function.h" |
36 | #include "llvm/IR/LLVMContext.h" |
37 | #include "llvm/MC/MCInstrDesc.h" |
38 | #include "llvm/Pass.h" |
39 | #include "llvm/Support/AtomicOrdering.h" |
40 | #include "llvm/Support/MathExtras.h" |
41 | #include <cassert> |
42 | #include <list> |
43 | |
44 | using namespace llvm; |
45 | using namespace llvm::AMDGPU; |
46 | |
// Pass debug type and human-readable name (the analyzer export had fused the
// macro expansions into the definitions; restore the real source form).
#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"
49 | |
50 | namespace { |
51 | |
52 | LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()using ::llvm::BitmaskEnumDetail::operator~; using ::llvm::BitmaskEnumDetail ::operator|; using ::llvm::BitmaskEnumDetail::operator&; using ::llvm::BitmaskEnumDetail::operator^; using ::llvm::BitmaskEnumDetail ::operator|=; using ::llvm::BitmaskEnumDetail::operator&= ; using ::llvm::BitmaskEnumDetail::operator^=; |
53 | |
54 | /// Memory operation flags. Can be ORed together. |
55 | enum class SIMemOp { |
56 | NONE = 0u, |
57 | LOAD = 1u << 0, |
58 | STORE = 1u << 1, |
59 | LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)LLVM_BITMASK_LARGEST_ENUMERATOR = STORE |
60 | }; |
61 | |
/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
/// Ordered from narrowest to widest scope.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};
78 | |
79 | /// The distinct address spaces supported by the AMDGPU target for |
80 | /// atomic memory operation. Can be ORed toether. |
81 | enum class SIAtomicAddrSpace { |
82 | NONE = 0u, |
83 | GLOBAL = 1u << 0, |
84 | LDS = 1u << 1, |
85 | SCRATCH = 1u << 2, |
86 | GDS = 1u << 3, |
87 | OTHER = 1u << 4, |
88 | |
89 | /// The address spaces that can be accessed by a FLAT instruction. |
90 | FLAT = GLOBAL | LDS | SCRATCH, |
91 | |
92 | /// The address spaces that support atomic instructions. |
93 | ATOMIC = GLOBAL | LDS | SCRATCH | GDS, |
94 | |
95 | /// All address spaces. |
96 | ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, |
97 | |
98 | LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)LLVM_BITMASK_LARGEST_ENUMERATOR = ALL |
99 | }; |
100 | |
101 | /// Sets named bit \p BitName to "true" if present in instruction \p MI. |
102 | /// \returns Returns true if \p MI is modified, false otherwise. |
103 | template <uint16_t BitName> |
104 | bool enableNamedBit(const MachineBasicBlock::iterator &MI) { |
105 | int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName); |
106 | if (BitIdx == -1) |
107 | return false; |
108 | |
109 | MachineOperand &Bit = MI->getOperand(BitIdx); |
110 | if (Bit.getImm() != 0) |
111 | return false; |
112 | |
113 | Bit.setImm(1); |
114 | return true; |
115 | } |
116 | |
117 | class SIMemOpInfo final { |
118 | private: |
119 | |
120 | friend class SIMemOpAccess; |
121 | |
122 | AtomicOrdering Ordering = AtomicOrdering::NotAtomic; |
123 | AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; |
124 | SIAtomicScope Scope = SIAtomicScope::SYSTEM; |
125 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; |
126 | SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; |
127 | bool IsCrossAddressSpaceOrdering = false; |
128 | bool IsNonTemporal = false; |
129 | |
130 | SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, |
131 | SIAtomicScope Scope = SIAtomicScope::SYSTEM, |
132 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, |
133 | SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, |
134 | bool IsCrossAddressSpaceOrdering = true, |
135 | AtomicOrdering FailureOrdering = |
136 | AtomicOrdering::SequentiallyConsistent, |
137 | bool IsNonTemporal = false) |
138 | : Ordering(Ordering), FailureOrdering(FailureOrdering), |
139 | Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), |
140 | InstrAddrSpace(InstrAddrSpace), |
141 | IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), |
142 | IsNonTemporal(IsNonTemporal) { |
143 | // There is also no cross address space ordering if the ordering |
144 | // address space is the same as the instruction address space and |
145 | // only contains a single address space. |
146 | if ((OrderingAddrSpace == InstrAddrSpace) && |
147 | isPowerOf2_32(uint32_t(InstrAddrSpace))) |
148 | IsCrossAddressSpaceOrdering = false; |
Value stored to 'IsCrossAddressSpaceOrdering' is never read | |
149 | } |
150 | |
151 | public: |
152 | /// \returns Atomic synchronization scope of the machine instruction used to |
153 | /// create this SIMemOpInfo. |
154 | SIAtomicScope getScope() const { |
155 | return Scope; |
156 | } |
157 | |
158 | /// \returns Ordering constraint of the machine instruction used to |
159 | /// create this SIMemOpInfo. |
160 | AtomicOrdering getOrdering() const { |
161 | return Ordering; |
162 | } |
163 | |
164 | /// \returns Failure ordering constraint of the machine instruction used to |
165 | /// create this SIMemOpInfo. |
166 | AtomicOrdering getFailureOrdering() const { |
167 | return FailureOrdering; |
168 | } |
169 | |
170 | /// \returns The address spaces be accessed by the machine |
171 | /// instruction used to create this SiMemOpInfo. |
172 | SIAtomicAddrSpace getInstrAddrSpace() const { |
173 | return InstrAddrSpace; |
174 | } |
175 | |
176 | /// \returns The address spaces that must be ordered by the machine |
177 | /// instruction used to create this SiMemOpInfo. |
178 | SIAtomicAddrSpace getOrderingAddrSpace() const { |
179 | return OrderingAddrSpace; |
180 | } |
181 | |
182 | /// \returns Return true iff memory ordering of operations on |
183 | /// different address spaces is required. |
184 | bool getIsCrossAddressSpaceOrdering() const { |
185 | return IsCrossAddressSpaceOrdering; |
186 | } |
187 | |
188 | /// \returns True if memory access of the machine instruction used to |
189 | /// create this SIMemOpInfo is non-temporal, false otherwise. |
190 | bool isNonTemporal() const { |
191 | return IsNonTemporal; |
192 | } |
193 | |
194 | /// \returns True if ordering constraint of the machine instruction used to |
195 | /// create this SIMemOpInfo is unordered or higher, false otherwise. |
196 | bool isAtomic() const { |
197 | return Ordering != AtomicOrdering::NotAtomic; |
198 | } |
199 | |
200 | }; |
201 | |
202 | class SIMemOpAccess final { |
203 | private: |
204 | AMDGPUMachineModuleInfo *MMI = nullptr; |
205 | |
206 | /// Reports unsupported message \p Msg for \p MI to LLVM context. |
207 | void reportUnsupported(const MachineBasicBlock::iterator &MI, |
208 | const char *Msg) const; |
209 | |
210 | /// Inspects the target synchonization scope \p SSID and determines |
211 | /// the SI atomic scope it corresponds to, the address spaces it |
212 | /// covers, and whether the memory ordering applies between address |
213 | /// spaces. |
214 | Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> |
215 | toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const; |
216 | |
217 | /// \return Return a bit set of the address spaces accessed by \p AS. |
218 | SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; |
219 | |
220 | /// \returns Info constructed from \p MI, which has at least machine memory |
221 | /// operand. |
222 | Optional<SIMemOpInfo> constructFromMIWithMMO( |
223 | const MachineBasicBlock::iterator &MI) const; |
224 | |
225 | public: |
226 | /// Construct class to support accessing the machine memory operands |
227 | /// of instructions in the machine function \p MF. |
228 | SIMemOpAccess(MachineFunction &MF); |
229 | |
230 | /// \returns Load info if \p MI is a load operation, "None" otherwise. |
231 | Optional<SIMemOpInfo> getLoadInfo( |
232 | const MachineBasicBlock::iterator &MI) const; |
233 | |
234 | /// \returns Store info if \p MI is a store operation, "None" otherwise. |
235 | Optional<SIMemOpInfo> getStoreInfo( |
236 | const MachineBasicBlock::iterator &MI) const; |
237 | |
238 | /// \returns Atomic fence info if \p MI is an atomic fence operation, |
239 | /// "None" otherwise. |
240 | Optional<SIMemOpInfo> getAtomicFenceInfo( |
241 | const MachineBasicBlock::iterator &MI) const; |
242 | |
243 | /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or |
244 | /// rmw operation, "None" otherwise. |
245 | Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo( |
246 | const MachineBasicBlock::iterator &MI) const; |
247 | }; |
248 | |
249 | class SICacheControl { |
250 | protected: |
251 | |
252 | /// Instruction info. |
253 | const SIInstrInfo *TII = nullptr; |
254 | |
255 | IsaVersion IV; |
256 | |
257 | SICacheControl(const GCNSubtarget &ST); |
258 | |
259 | public: |
260 | |
261 | /// Create a cache control for the subtarget \p ST. |
262 | static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); |
263 | |
264 | /// Update \p MI memory load instruction to bypass any caches up to |
265 | /// the \p Scope memory scope for address spaces \p |
266 | /// AddrSpace. Return true iff the instruction was modified. |
267 | virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
268 | SIAtomicScope Scope, |
269 | SIAtomicAddrSpace AddrSpace) const = 0; |
270 | |
271 | /// Update \p MI memory instruction to indicate it is |
272 | /// nontemporal. Return true iff the instruction was modified. |
273 | virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI) |
274 | const = 0; |
275 | |
276 | /// Inserts any necessary instructions at position \p Pos relative |
277 | /// to instruction \p MI to ensure any caches associated with |
278 | /// address spaces \p AddrSpace for memory scopes up to memory scope |
279 | /// \p Scope are invalidated. Returns true iff any instructions |
280 | /// inserted. |
281 | virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, |
282 | SIAtomicScope Scope, |
283 | SIAtomicAddrSpace AddrSpace, |
284 | Position Pos) const = 0; |
285 | |
286 | /// Inserts any necessary instructions at position \p Pos relative |
287 | /// to instruction \p MI to ensure memory instructions of kind \p Op |
288 | /// associated with address spaces \p AddrSpace have completed as |
289 | /// observed by other memory instructions executing in memory scope |
290 | /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory |
291 | /// ordering is between address spaces. Returns true iff any |
292 | /// instructions inserted. |
293 | virtual bool insertWait(MachineBasicBlock::iterator &MI, |
294 | SIAtomicScope Scope, |
295 | SIAtomicAddrSpace AddrSpace, |
296 | SIMemOp Op, |
297 | bool IsCrossAddrSpaceOrdering, |
298 | Position Pos) const = 0; |
299 | |
300 | /// Virtual destructor to allow derivations to be deleted. |
301 | virtual ~SICacheControl() = default; |
302 | |
303 | }; |
304 | |
305 | class SIGfx6CacheControl : public SICacheControl { |
306 | protected: |
307 | |
308 | /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI |
309 | /// is modified, false otherwise. |
310 | bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { |
311 | return enableNamedBit<AMDGPU::OpName::glc>(MI); |
312 | } |
313 | |
314 | /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI |
315 | /// is modified, false otherwise. |
316 | bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { |
317 | return enableNamedBit<AMDGPU::OpName::slc>(MI); |
318 | } |
319 | |
320 | public: |
321 | |
322 | SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}; |
323 | |
324 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
325 | SIAtomicScope Scope, |
326 | SIAtomicAddrSpace AddrSpace) const override; |
327 | |
328 | bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; |
329 | |
330 | bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, |
331 | SIAtomicScope Scope, |
332 | SIAtomicAddrSpace AddrSpace, |
333 | Position Pos) const override; |
334 | |
335 | bool insertWait(MachineBasicBlock::iterator &MI, |
336 | SIAtomicScope Scope, |
337 | SIAtomicAddrSpace AddrSpace, |
338 | SIMemOp Op, |
339 | bool IsCrossAddrSpaceOrdering, |
340 | Position Pos) const override; |
341 | }; |
342 | |
343 | class SIGfx7CacheControl : public SIGfx6CacheControl { |
344 | public: |
345 | |
346 | SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}; |
347 | |
348 | bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, |
349 | SIAtomicScope Scope, |
350 | SIAtomicAddrSpace AddrSpace, |
351 | Position Pos) const override; |
352 | |
353 | }; |
354 | |
355 | class SIGfx10CacheControl : public SIGfx7CacheControl { |
356 | protected: |
357 | bool CuMode = false; |
358 | |
359 | /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI |
360 | /// is modified, false otherwise. |
361 | bool enableDLCBit(const MachineBasicBlock::iterator &MI) const { |
362 | return enableNamedBit<AMDGPU::OpName::dlc>(MI); |
363 | } |
364 | |
365 | public: |
366 | |
367 | SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) : |
368 | SIGfx7CacheControl(ST), CuMode(CuMode) {}; |
369 | |
370 | bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, |
371 | SIAtomicScope Scope, |
372 | SIAtomicAddrSpace AddrSpace) const override; |
373 | |
374 | bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; |
375 | |
376 | bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, |
377 | SIAtomicScope Scope, |
378 | SIAtomicAddrSpace AddrSpace, |
379 | Position Pos) const override; |
380 | |
381 | bool insertWait(MachineBasicBlock::iterator &MI, |
382 | SIAtomicScope Scope, |
383 | SIAtomicAddrSpace AddrSpace, |
384 | SIMemOp Op, |
385 | bool IsCrossAddrSpaceOrdering, |
386 | Position Pos) const override; |
387 | }; |
388 | |
389 | class SIMemoryLegalizer final : public MachineFunctionPass { |
390 | private: |
391 | |
392 | /// Cache Control. |
393 | std::unique_ptr<SICacheControl> CC = nullptr; |
394 | |
395 | /// List of atomic pseudo instructions. |
396 | std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; |
397 | |
398 | /// Return true iff instruction \p MI is a atomic instruction that |
399 | /// returns a result. |
400 | bool isAtomicRet(const MachineInstr &MI) const { |
401 | return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1; |
402 | } |
403 | |
404 | /// Removes all processed atomic pseudo instructions from the current |
405 | /// function. Returns true if current function is modified, false otherwise. |
406 | bool removeAtomicPseudoMIs(); |
407 | |
408 | /// Expands load operation \p MI. Returns true if instructions are |
409 | /// added/deleted or \p MI is modified, false otherwise. |
410 | bool expandLoad(const SIMemOpInfo &MOI, |
411 | MachineBasicBlock::iterator &MI); |
412 | /// Expands store operation \p MI. Returns true if instructions are |
413 | /// added/deleted or \p MI is modified, false otherwise. |
414 | bool expandStore(const SIMemOpInfo &MOI, |
415 | MachineBasicBlock::iterator &MI); |
416 | /// Expands atomic fence operation \p MI. Returns true if |
417 | /// instructions are added/deleted or \p MI is modified, false otherwise. |
418 | bool expandAtomicFence(const SIMemOpInfo &MOI, |
419 | MachineBasicBlock::iterator &MI); |
420 | /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if |
421 | /// instructions are added/deleted or \p MI is modified, false otherwise. |
422 | bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, |
423 | MachineBasicBlock::iterator &MI); |
424 | |
425 | public: |
426 | static char ID; |
427 | |
428 | SIMemoryLegalizer() : MachineFunctionPass(ID) {} |
429 | |
430 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
431 | AU.setPreservesCFG(); |
432 | MachineFunctionPass::getAnalysisUsage(AU); |
433 | } |
434 | |
435 | StringRef getPassName() const override { |
436 | return PASS_NAME"SI Memory Legalizer"; |
437 | } |
438 | |
439 | bool runOnMachineFunction(MachineFunction &MF) override; |
440 | }; |
441 | |
442 | } // end namespace anonymous |
443 | |
444 | void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, |
445 | const char *Msg) const { |
446 | const Function &Func = MI->getParent()->getParent()->getFunction(); |
447 | DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); |
448 | Func.getContext().diagnose(Diag); |
449 | } |
450 | |
451 | Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> |
452 | SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, |
453 | SIAtomicAddrSpace InstrScope) const { |
454 | if (SSID == SyncScope::System) |
455 | return std::make_tuple(SIAtomicScope::SYSTEM, |
456 | SIAtomicAddrSpace::ATOMIC, |
457 | true); |
458 | if (SSID == MMI->getAgentSSID()) |
459 | return std::make_tuple(SIAtomicScope::AGENT, |
460 | SIAtomicAddrSpace::ATOMIC, |
461 | true); |
462 | if (SSID == MMI->getWorkgroupSSID()) |
463 | return std::make_tuple(SIAtomicScope::WORKGROUP, |
464 | SIAtomicAddrSpace::ATOMIC, |
465 | true); |
466 | if (SSID == MMI->getWavefrontSSID()) |
467 | return std::make_tuple(SIAtomicScope::WAVEFRONT, |
468 | SIAtomicAddrSpace::ATOMIC, |
469 | true); |
470 | if (SSID == SyncScope::SingleThread) |
471 | return std::make_tuple(SIAtomicScope::SINGLETHREAD, |
472 | SIAtomicAddrSpace::ATOMIC, |
473 | true); |
474 | if (SSID == MMI->getSystemOneAddressSpaceSSID()) |
475 | return std::make_tuple(SIAtomicScope::SYSTEM, |
476 | SIAtomicAddrSpace::ATOMIC & InstrScope, |
477 | false); |
478 | if (SSID == MMI->getAgentOneAddressSpaceSSID()) |
479 | return std::make_tuple(SIAtomicScope::AGENT, |
480 | SIAtomicAddrSpace::ATOMIC & InstrScope, |
481 | false); |
482 | if (SSID == MMI->getWorkgroupOneAddressSpaceSSID()) |
483 | return std::make_tuple(SIAtomicScope::WORKGROUP, |
484 | SIAtomicAddrSpace::ATOMIC & InstrScope, |
485 | false); |
486 | if (SSID == MMI->getWavefrontOneAddressSpaceSSID()) |
487 | return std::make_tuple(SIAtomicScope::WAVEFRONT, |
488 | SIAtomicAddrSpace::ATOMIC & InstrScope, |
489 | false); |
490 | if (SSID == MMI->getSingleThreadOneAddressSpaceSSID()) |
491 | return std::make_tuple(SIAtomicScope::SINGLETHREAD, |
492 | SIAtomicAddrSpace::ATOMIC & InstrScope, |
493 | false); |
494 | return None; |
495 | } |
496 | |
497 | SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { |
498 | if (AS == AMDGPUAS::FLAT_ADDRESS) |
499 | return SIAtomicAddrSpace::FLAT; |
500 | if (AS == AMDGPUAS::GLOBAL_ADDRESS) |
501 | return SIAtomicAddrSpace::GLOBAL; |
502 | if (AS == AMDGPUAS::LOCAL_ADDRESS) |
503 | return SIAtomicAddrSpace::LDS; |
504 | if (AS == AMDGPUAS::PRIVATE_ADDRESS) |
505 | return SIAtomicAddrSpace::SCRATCH; |
506 | if (AS == AMDGPUAS::REGION_ADDRESS) |
507 | return SIAtomicAddrSpace::GDS; |
508 | |
509 | return SIAtomicAddrSpace::OTHER; |
510 | } |
511 | |
512 | SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { |
513 | MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); |
514 | } |
515 | |
516 | Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( |
517 | const MachineBasicBlock::iterator &MI) const { |
518 | assert(MI->getNumMemOperands() > 0)((MI->getNumMemOperands() > 0) ? static_cast<void> (0) : __assert_fail ("MI->getNumMemOperands() > 0", "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 518, __PRETTY_FUNCTION__)); |
519 | |
520 | SyncScope::ID SSID = SyncScope::SingleThread; |
521 | AtomicOrdering Ordering = AtomicOrdering::NotAtomic; |
522 | AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; |
523 | SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; |
524 | bool IsNonTemporal = true; |
525 | |
526 | // Validator should check whether or not MMOs cover the entire set of |
527 | // locations accessed by the memory instruction. |
528 | for (const auto &MMO : MI->memoperands()) { |
529 | IsNonTemporal &= MMO->isNonTemporal(); |
530 | InstrAddrSpace |= |
531 | toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); |
532 | AtomicOrdering OpOrdering = MMO->getOrdering(); |
533 | if (OpOrdering != AtomicOrdering::NotAtomic) { |
534 | const auto &IsSyncScopeInclusion = |
535 | MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); |
536 | if (!IsSyncScopeInclusion) { |
537 | reportUnsupported(MI, |
538 | "Unsupported non-inclusive atomic synchronization scope"); |
539 | return None; |
540 | } |
541 | |
542 | SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); |
543 | Ordering = |
544 | isStrongerThan(Ordering, OpOrdering) ? |
545 | Ordering : MMO->getOrdering(); |
546 | assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&((MMO->getFailureOrdering() != AtomicOrdering::Release && MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease ) ? static_cast<void> (0) : __assert_fail ("MMO->getFailureOrdering() != AtomicOrdering::Release && MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 547, __PRETTY_FUNCTION__)) |
547 | MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease)((MMO->getFailureOrdering() != AtomicOrdering::Release && MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease ) ? static_cast<void> (0) : __assert_fail ("MMO->getFailureOrdering() != AtomicOrdering::Release && MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 547, __PRETTY_FUNCTION__)); |
548 | FailureOrdering = |
549 | isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ? |
550 | FailureOrdering : MMO->getFailureOrdering(); |
551 | } |
552 | } |
553 | |
554 | SIAtomicScope Scope = SIAtomicScope::NONE; |
555 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; |
556 | bool IsCrossAddressSpaceOrdering = false; |
557 | if (Ordering != AtomicOrdering::NotAtomic) { |
558 | auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); |
559 | if (!ScopeOrNone) { |
560 | reportUnsupported(MI, "Unsupported atomic synchronization scope"); |
561 | return None; |
562 | } |
563 | std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = |
564 | ScopeOrNone.getValue(); |
565 | if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || |
566 | ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { |
567 | reportUnsupported(MI, "Unsupported atomic address space"); |
568 | return None; |
569 | } |
570 | } |
571 | return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, |
572 | IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal); |
573 | } |
574 | |
575 | Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo( |
576 | const MachineBasicBlock::iterator &MI) const { |
577 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)((MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) ? static_cast<void> (0) : __assert_fail ("MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 577, __PRETTY_FUNCTION__)); |
578 | |
579 | if (!(MI->mayLoad() && !MI->mayStore())) |
580 | return None; |
581 | |
582 | // Be conservative if there are no memory operands. |
583 | if (MI->getNumMemOperands() == 0) |
584 | return SIMemOpInfo(); |
585 | |
586 | return constructFromMIWithMMO(MI); |
587 | } |
588 | |
589 | Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo( |
590 | const MachineBasicBlock::iterator &MI) const { |
591 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)((MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) ? static_cast<void> (0) : __assert_fail ("MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 591, __PRETTY_FUNCTION__)); |
592 | |
593 | if (!(!MI->mayLoad() && MI->mayStore())) |
594 | return None; |
595 | |
596 | // Be conservative if there are no memory operands. |
597 | if (MI->getNumMemOperands() == 0) |
598 | return SIMemOpInfo(); |
599 | |
600 | return constructFromMIWithMMO(MI); |
601 | } |
602 | |
603 | Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo( |
604 | const MachineBasicBlock::iterator &MI) const { |
605 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)((MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) ? static_cast<void> (0) : __assert_fail ("MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 605, __PRETTY_FUNCTION__)); |
606 | |
607 | if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) |
608 | return None; |
609 | |
610 | AtomicOrdering Ordering = |
611 | static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); |
612 | |
613 | SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); |
614 | auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); |
615 | if (!ScopeOrNone) { |
616 | reportUnsupported(MI, "Unsupported atomic synchronization scope"); |
617 | return None; |
618 | } |
619 | |
620 | SIAtomicScope Scope = SIAtomicScope::NONE; |
621 | SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; |
622 | bool IsCrossAddressSpaceOrdering = false; |
623 | std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = |
624 | ScopeOrNone.getValue(); |
625 | |
626 | if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || |
627 | ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { |
628 | reportUnsupported(MI, "Unsupported atomic address space"); |
629 | return None; |
630 | } |
631 | |
632 | return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, |
633 | IsCrossAddressSpaceOrdering); |
634 | } |
635 | |
636 | Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( |
637 | const MachineBasicBlock::iterator &MI) const { |
638 | assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)((MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) ? static_cast<void> (0) : __assert_fail ("MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 638, __PRETTY_FUNCTION__)); |
639 | |
640 | if (!(MI->mayLoad() && MI->mayStore())) |
641 | return None; |
642 | |
643 | // Be conservative if there are no memory operands. |
644 | if (MI->getNumMemOperands() == 0) |
645 | return SIMemOpInfo(); |
646 | |
647 | return constructFromMIWithMMO(MI); |
648 | } |
649 | |
650 | SICacheControl::SICacheControl(const GCNSubtarget &ST) { |
651 | TII = ST.getInstrInfo(); |
652 | IV = getIsaVersion(ST.getCPU()); |
653 | } |
654 | |
655 | /* static */ |
656 | std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { |
657 | GCNSubtarget::Generation Generation = ST.getGeneration(); |
658 | if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) |
659 | return make_unique<SIGfx6CacheControl>(ST); |
660 | if (Generation < AMDGPUSubtarget::GFX10) |
661 | return make_unique<SIGfx7CacheControl>(ST); |
662 | return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled()); |
663 | } |
664 | |
665 | bool SIGfx6CacheControl::enableLoadCacheBypass( |
666 | const MachineBasicBlock::iterator &MI, |
667 | SIAtomicScope Scope, |
668 | SIAtomicAddrSpace AddrSpace) const { |
669 | assert(MI->mayLoad() && !MI->mayStore())((MI->mayLoad() && !MI->mayStore()) ? static_cast <void> (0) : __assert_fail ("MI->mayLoad() && !MI->mayStore()" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 669, __PRETTY_FUNCTION__)); |
670 | bool Changed = false; |
671 | |
672 | if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { |
673 | /// TODO: Do not set glc for rmw atomic operations as they |
674 | /// implicitly bypass the L1 cache. |
675 | |
676 | switch (Scope) { |
677 | case SIAtomicScope::SYSTEM: |
678 | case SIAtomicScope::AGENT: |
679 | Changed |= enableGLCBit(MI); |
680 | break; |
681 | case SIAtomicScope::WORKGROUP: |
682 | case SIAtomicScope::WAVEFRONT: |
683 | case SIAtomicScope::SINGLETHREAD: |
684 | // No cache to bypass. |
685 | break; |
686 | default: |
687 | llvm_unreachable("Unsupported synchronization scope")::llvm::llvm_unreachable_internal("Unsupported synchronization scope" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 687); |
688 | } |
689 | } |
690 | |
691 | /// The scratch address space does not need the global memory caches |
692 | /// to be bypassed as all memory operations by the same thread are |
693 | /// sequentially consistent, and no other thread can access scratch |
694 | /// memory. |
695 | |
696 | /// Other address spaces do not hava a cache. |
697 | |
698 | return Changed; |
699 | } |
700 | |
701 | bool SIGfx6CacheControl::enableNonTemporal( |
702 | const MachineBasicBlock::iterator &MI) const { |
703 | assert(MI->mayLoad() ^ MI->mayStore())((MI->mayLoad() ^ MI->mayStore()) ? static_cast<void > (0) : __assert_fail ("MI->mayLoad() ^ MI->mayStore()" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 703, __PRETTY_FUNCTION__)); |
704 | bool Changed = false; |
705 | |
706 | /// TODO: Do not enableGLCBit if rmw atomic. |
707 | Changed |= enableGLCBit(MI); |
708 | Changed |= enableSLCBit(MI); |
709 | |
710 | return Changed; |
711 | } |
712 | |
/// Insert a cache invalidate needed for an acquire operation on GFX6,
/// either immediately before \p MI (Position::BEFORE) or immediately
/// after it (Position::AFTER).
/// \returns true if an instruction was inserted.
bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so the invalidate is emitted after it; the
  // iterator is restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Invalidate so stale data written by other agents is not observed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
754 | |
/// Insert an S_WAITCNT to order memory operations, either immediately
/// before \p MI (Position::BEFORE) or immediately after it
/// (Position::AFTER).
/// \returns true if an instruction was inserted.
///
/// Note: \p Op is not consulted on this target; a vmcnt(0) wait is
/// requested for the GLOBAL address space for loads and stores alike.
bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so the wait is emitted after it; the iterator
  // is restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  // Hardware counters that must be drained to zero.
  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an LDS waitcnt is not
      // needed as LDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/GDS memory as LDS operations
      // could be reordered with respect to later global/GDS memory
      // operations of the same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS waitcnt is not
      // needed as GDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/LDS memory as GDS operations
      // could be reordered with respect to later global/LDS memory
      // operations of the same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    // Encode a wait of 0 (fully drained) for each requested counter and
    // the counter's no-wait maximum for the others so they are ignored.
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
850 | |
/// Insert a cache invalidate needed for an acquire operation on GFX7+,
/// either immediately before \p MI (Position::BEFORE) or immediately
/// after it (Position::AFTER).
/// \returns true if an instruction was inserted.
bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  // PAL and Mesa3D use the full BUFFER_WBINVL1; other OSes use the
  // volatile-only variant.
  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
                             ? AMDGPU::BUFFER_WBINVL1
                             : AMDGPU::BUFFER_WBINVL1_VOL;

  // Temporarily step past MI so the invalidate is emitted after it; the
  // iterator is restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(Flush));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
898 | |
/// Make an atomic load bypass the caches as required by \p Scope for the
/// given \p AddrSpace on GFX10.
/// \returns true if the instruction was modified.
bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  // Only plain loads are handled here; stores and rmw atomics are not.
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Set both glc and dlc to bypass the per-CU and shared caches.
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
      // CU mode and all waves of a work-group are on the same CU, and so the
      // L0 does not need to be bypassed.
      if (!CuMode) Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
941 | |
942 | bool SIGfx10CacheControl::enableNonTemporal( |
943 | const MachineBasicBlock::iterator &MI) const { |
944 | assert(MI->mayLoad() ^ MI->mayStore())((MI->mayLoad() ^ MI->mayStore()) ? static_cast<void > (0) : __assert_fail ("MI->mayLoad() ^ MI->mayStore()" , "/build/llvm-toolchain-snapshot-9~svn362543/lib/Target/AMDGPU/SIMemoryLegalizer.cpp" , 944, __PRETTY_FUNCTION__)); |
945 | bool Changed = false; |
946 | |
947 | Changed |= enableSLCBit(MI); |
948 | /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI) |
949 | |
950 | return Changed; |
951 | } |
952 | |
/// Insert the cache invalidates needed for an acquire operation on GFX10,
/// either immediately before \p MI (Position::BEFORE) or immediately
/// after it (Position::AFTER).
/// \returns true if any instruction was inserted.
bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                                SIAtomicScope Scope,
                                                SIAtomicAddrSpace AddrSpace,
                                                Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so the invalidates are emitted after it; the
  // iterator is restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // Invalidate both the L0 and L1 caches.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
      // in CU mode and all waves of a work-group are on the same CU, and so the
      // L0 does not need to be invalidated.
      if (!CuMode) {
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
        Changed = true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1004 | |
/// Insert the waits needed to order memory operations on GFX10, either
/// immediately before \p MI (Position::BEFORE) or immediately after it
/// (Position::AFTER). Unlike earlier targets, GFX10 splits global memory
/// waits into vmcnt (loads) and vscnt (stores), selected here by \p Op.
/// \returns true if any instruction was inserted.
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // Temporarily step past MI so the waits are emitted after it; the
  // iterator is restored before returning.
  if (Pos == Position::AFTER)
    ++MI;

  // Hardware counters that must be drained to zero.
  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU of
      // the WGP. Therefore need to wait for operations to complete to ensure
      // they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise in CU mode and all waves of a work-group are on the same CU
      // which shares the same L0.
      if (!CuMode) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an LDS waitcnt is not
      // needed as LDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/GDS memory as LDS operations
      // could be reordered with respect to later global/GDS memory
      // operations of the same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then an GDS waitcnt is not
      // needed as GDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/LDS memory as GDS operations
      // could be reordered with respect to later global/LDS memory
      // operations of the same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    // Encode a wait of 0 (fully drained) for each requested counter and
    // the counter's no-wait maximum for the others so they are ignored.
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (VSCnt) {
    // vscnt has its own instruction; the null SGPR operand is unused.
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
1123 | |
1124 | bool SIMemoryLegalizer::removeAtomicPseudoMIs() { |
1125 | if (AtomicPseudoMIs.empty()) |
1126 | return false; |
1127 | |
1128 | for (auto &MI : AtomicPseudoMIs) |
1129 | MI->eraseFromParent(); |
1130 | |
1131 | AtomicPseudoMIs.clear(); |
1132 | return true; |
1133 | } |
1134 | |
/// Expand an atomic or non-temporal load with the cache-bypass bits,
/// waits, and cache invalidates required by the memory model.
/// \returns true if any instruction was changed or inserted.
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    const AtomicOrdering Order = MOI.getOrdering();

    // Any ordered load must bypass caches that could hide newer values.
    if (Order == AtomicOrdering::Monotonic ||
        Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());

    // Seq-cst additionally drains all prior memory operations first.
    if (Order == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    // Acquire semantics: wait for the load itself, then invalidate the
    // caches so later accesses observe other agents' writes.
    if (Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }
  } else if (MOI.isNonTemporal()) {
    // Atomic instructions do not have the nontemporal attribute.
    Changed |= CC->enableNonTemporal(MI);
  }

  return Changed;
}
1179 | |
/// Expand an atomic or non-temporal store with the waits required by the
/// memory model.
/// \returns true if any instruction was changed or inserted.
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    const AtomicOrdering Order = MOI.getOrdering();

    // Release semantics: all prior memory operations must complete before
    // the store executes.
    if (Order == AtomicOrdering::Release ||
        Order == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);
  } else if (MOI.isNonTemporal()) {
    // Atomic instructions do not have the nontemporal attribute.
    Changed |= CC->enableNonTemporal(MI);
  }

  return Changed;
}
1206 | |
/// Expand an ATOMIC_FENCE pseudo into the waits and cache invalidates it
/// implies; the pseudo itself is queued for later removal.
/// \returns true if any instruction was inserted.
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  // The pseudo produces no code of its own; remember it for deletion.
  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    const AtomicOrdering Order = MOI.getOrdering();

    /// TODO: This relies on a barrier always generating a waitcnt
    /// for LDS to ensure it is not reordered with the completion of
    /// the proceeding LDS operations. If barrier had a memory
    /// ordering and memory scope, then library does not need to
    /// generate a fence. Could add support in this file for
    /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
    /// adding waitcnt before a S_BARRIER.
    if (Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::Release ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    // Acquire-flavored fences also invalidate the caches.
    if (Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::BEFORE);
  }

  return Changed;
}
1244 | |
/// Expand an atomic cmpxchg or read-modify-write with the waits and cache
/// invalidates required by its success and failure orderings.
/// \returns true if any instruction was inserted.
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    const AtomicOrdering Order = MOI.getOrdering();
    const AtomicOrdering Failure = MOI.getFailureOrdering();

    // Release half: drain all prior memory operations before the atomic.
    if (Order == AtomicOrdering::Release ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent ||
        Failure == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    // Acquire half: wait for the atomic itself (its result if it returns
    // one, its store otherwise), then invalidate the caches.
    if (Order == AtomicOrdering::Acquire ||
        Order == AtomicOrdering::AcquireRelease ||
        Order == AtomicOrdering::SequentiallyConsistent ||
        Failure == AtomicOrdering::Acquire ||
        Failure == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD
                                                 : SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }
  }

  return Changed;
}
1283 | |
1284 | bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { |
1285 | bool Changed = false; |
1286 | |
1287 | SIMemOpAccess MOA(MF); |
1288 | CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); |
1289 | |
1290 | for (auto &MBB : MF) { |
1291 | for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { |
1292 | if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) |
1293 | continue; |
1294 | |
1295 | if (const auto &MOI = MOA.getLoadInfo(MI)) |
1296 | Changed |= expandLoad(MOI.getValue(), MI); |
1297 | else if (const auto &MOI = MOA.getStoreInfo(MI)) |
1298 | Changed |= expandStore(MOI.getValue(), MI); |
1299 | else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) |
1300 | Changed |= expandAtomicFence(MOI.getValue(), MI); |
1301 | else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) |
1302 | Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI); |
1303 | } |
1304 | } |
1305 | |
1306 | Changed |= removeAtomicPseudoMIs(); |
1307 | return Changed; |
1308 | } |
1309 | |
// Register the pass with LLVM's pass registry under "si-memory-legalizer".
INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

// The address of ID uniquely identifies the pass; the exported reference
// lets other code schedule or query it without including this file.
char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1314 | |
/// Factory used by the AMDGPU target pass pipeline to create this pass.
FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}