1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUMachineModuleInfo.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "llvm/ADT/BitmaskEnum.h"
21 #include "llvm/CodeGen/MachineBasicBlock.h"
22 #include "llvm/IR/DiagnosticInfo.h"
23 #include "llvm/Support/AtomicOrdering.h"
24 #include "llvm/Support/TargetParser.h"
25 
26 using namespace llvm;
27 using namespace llvm::AMDGPU;
28 
29 #define DEBUG_TYPE "si-memory-legalizer"
30 #define PASS_NAME "SI Memory Legalizer"
31 
33  "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
34  cl::desc("Use this to skip inserting cache invalidating instructions."));
35 
36 namespace {
37 
38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
39 
40 /// Memory operation flags. Can be ORed together.
41 enum class SIMemOp {
42  NONE = 0u,
43  LOAD = 1u << 0,
44  STORE = 1u << 1,
45  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
46 };
47 
48 /// Position to insert a new instruction relative to an existing
49 /// instruction.
50 enum class Position {
51  BEFORE,
52  AFTER
53 };
54 
55 /// The atomic synchronization scopes supported by the AMDGPU target.
56 enum class SIAtomicScope {
57  NONE,
58  SINGLETHREAD,
59  WAVEFRONT,
60  WORKGROUP,
61  AGENT,
62  SYSTEM
63 };
64 
65 /// The distinct address spaces supported by the AMDGPU target for
66 /// atomic memory operations. Can be ORed together.
67 enum class SIAtomicAddrSpace {
68  NONE = 0u,
69  GLOBAL = 1u << 0,
70  LDS = 1u << 1,
71  SCRATCH = 1u << 2,
72  GDS = 1u << 3,
73  OTHER = 1u << 4,
74 
75  /// The address spaces that can be accessed by a FLAT instruction.
76  FLAT = GLOBAL | LDS | SCRATCH,
77 
78  /// The address spaces that support atomic instructions.
79  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
80 
81  /// All address spaces.
82  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
83 
84  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
85 };
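
// Illustrative note, not part of the original file: SIMemOp and
// SIAtomicAddrSpace are bitmask enums, so flags compose and are tested with
// the ordinary bitwise operators. A minimal sketch of the idiom used
// throughout this pass:
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   if ((AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
//     // ... the operation touches global memory ...
//   }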
86 
87 class SIMemOpInfo final {
88 private:
89 
90  friend class SIMemOpAccess;
91 
92  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
93  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
94  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
95  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
96  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
97  bool IsCrossAddressSpaceOrdering = false;
98  bool IsVolatile = false;
99  bool IsNonTemporal = false;
100 
101  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
102  SIAtomicScope Scope = SIAtomicScope::SYSTEM,
103  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
104  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
105  bool IsCrossAddressSpaceOrdering = true,
106  AtomicOrdering FailureOrdering =
107  AtomicOrdering::SequentiallyConsistent,
108  bool IsVolatile = false,
109  bool IsNonTemporal = false)
110  : Ordering(Ordering), FailureOrdering(FailureOrdering),
111  Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
112  InstrAddrSpace(InstrAddrSpace),
113  IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
114  IsVolatile(IsVolatile),
115  IsNonTemporal(IsNonTemporal) {
116 
117  if (Ordering == AtomicOrdering::NotAtomic) {
118  assert(Scope == SIAtomicScope::NONE &&
119  OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
120  !IsCrossAddressSpaceOrdering &&
121  FailureOrdering == AtomicOrdering::NotAtomic);
122  return;
123  }
124 
125  assert(Scope != SIAtomicScope::NONE &&
126  (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
127  SIAtomicAddrSpace::NONE &&
128  (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
129  SIAtomicAddrSpace::NONE &&
130  !isStrongerThan(FailureOrdering, Ordering));
131 
132  // There is also no cross address space ordering if the ordering
133  // address space is the same as the instruction address space and
134  // only contains a single address space.
135  if ((OrderingAddrSpace == InstrAddrSpace) &&
136  isPowerOf2_32(uint32_t(InstrAddrSpace)))
137  this->IsCrossAddressSpaceOrdering = false;
138 
139  // Limit the scope to the maximum supported by the instruction's address
140  // spaces.
141  if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
142  SIAtomicAddrSpace::NONE) {
143  this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144  } else if ((InstrAddrSpace &
145  ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
146  SIAtomicAddrSpace::NONE) {
147  this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148  } else if ((InstrAddrSpace &
149  ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151  this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152  }
153  }
154 
155 public:
156  /// \returns Atomic synchronization scope of the machine instruction used to
157  /// create this SIMemOpInfo.
158  SIAtomicScope getScope() const {
159  return Scope;
160  }
161 
162  /// \returns Ordering constraint of the machine instruction used to
163  /// create this SIMemOpInfo.
164  AtomicOrdering getOrdering() const {
165  return Ordering;
166  }
167 
168  /// \returns Failure ordering constraint of the machine instruction used to
169  /// create this SIMemOpInfo.
170  AtomicOrdering getFailureOrdering() const {
171  return FailureOrdering;
172  }
173 
174  /// \returns The address spaces accessed by the machine
175  /// instruction used to create this SIMemOpInfo.
176  SIAtomicAddrSpace getInstrAddrSpace() const {
177  return InstrAddrSpace;
178  }
179 
180  /// \returns The address spaces that must be ordered by the machine
181  /// instruction used to create this SIMemOpInfo.
182  SIAtomicAddrSpace getOrderingAddrSpace() const {
183  return OrderingAddrSpace;
184  }
185 
186  /// \returns True iff memory ordering of operations on
187  /// different address spaces is required.
188  bool getIsCrossAddressSpaceOrdering() const {
189  return IsCrossAddressSpaceOrdering;
190  }
191 
192  /// \returns True if memory access of the machine instruction used to
193  /// create this SIMemOpInfo is volatile, false otherwise.
194  bool isVolatile() const {
195  return IsVolatile;
196  }
197 
198  /// \returns True if memory access of the machine instruction used to
199  /// create this SIMemOpInfo is nontemporal, false otherwise.
200  bool isNonTemporal() const {
201  return IsNonTemporal;
202  }
203 
204  /// \returns True if ordering constraint of the machine instruction used to
205  /// create this SIMemOpInfo is unordered or higher, false otherwise.
206  bool isAtomic() const {
207  return Ordering != AtomicOrdering::NotAtomic;
208  }
209 
210 };
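
// Illustrative sketch, not original code: the constructor above clamps the
// requested scope to what the instruction's address spaces can observe. For
// example, an atomic that only touches LDS and scratch is at most
// workgroup-visible:
//
//   SIMemOpInfo Info(AtomicOrdering::SequentiallyConsistent,
//                    SIAtomicScope::SYSTEM,
//                    /*OrderingAddrSpace=*/SIAtomicAddrSpace::LDS,
//                    /*InstrAddrSpace=*/SIAtomicAddrSpace::LDS |
//                                       SIAtomicAddrSpace::SCRATCH);
//   // Info.getScope() == SIAtomicScope::WORKGROUP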
211 
212 class SIMemOpAccess final {
213 private:
214  AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216  /// Reports unsupported message \p Msg for \p MI to LLVM context.
217  void reportUnsupported(const MachineBasicBlock::iterator &MI,
218  const char *Msg) const;
219 
220  /// Inspects the target synchronization scope \p SSID and determines
221  /// the SI atomic scope it corresponds to, the address spaces it
222  /// covers, and whether the memory ordering applies between address
223  /// spaces.
224  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
225  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
227  /// \return Return a bit set of the address spaces accessed by \p AS.
228  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
230  /// \returns Info constructed from \p MI, which has at least machine memory
231  /// operand.
232  Optional<SIMemOpInfo> constructFromMIWithMMO(
233  const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236  /// Construct class to support accessing the machine memory operands
237  /// of instructions in the machine function \p MF.
238  SIMemOpAccess(MachineFunction &MF);
239 
240  /// \returns Load info if \p MI is a load operation, "None" otherwise.
241  Optional<SIMemOpInfo> getLoadInfo(
242  const MachineBasicBlock::iterator &MI) const;
243 
244  /// \returns Store info if \p MI is a store operation, "None" otherwise.
245  Optional<SIMemOpInfo> getStoreInfo(
246  const MachineBasicBlock::iterator &MI) const;
247 
248  /// \returns Atomic fence info if \p MI is an atomic fence operation,
249  /// "None" otherwise.
250  Optional<SIMemOpInfo> getAtomicFenceInfo(
251  const MachineBasicBlock::iterator &MI) const;
252 
253  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
254  /// rmw operation, "None" otherwise.
255  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
256  const MachineBasicBlock::iterator &MI) const;
257 };
258 
259 class SICacheControl {
260 protected:
261 
262  /// AMDGPU subtarget info.
263  const GCNSubtarget &ST;
264 
265  /// Instruction info.
266  const SIInstrInfo *TII = nullptr;
267 
268  IsaVersion IV;
269 
270  /// Whether to insert cache invalidating instructions.
271  bool InsertCacheInv;
272 
273  SICacheControl(const GCNSubtarget &ST);
274 
275  /// Sets named bit \p BitName to "true" if present in instruction \p MI.
276  /// \returns Returns true if \p MI is modified, false otherwise.
277  bool enableNamedBit(const MachineBasicBlock::iterator MI,
278  AMDGPU::CPol::CPol Bit) const;
279 
280 public:
281 
282  /// Create a cache control for the subtarget \p ST.
283  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
284 
285  /// Update \p MI memory load instruction to bypass any caches up to
286  /// the \p Scope memory scope for address spaces \p
287  /// AddrSpace. Return true iff the instruction was modified.
288  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
289  SIAtomicScope Scope,
290  SIAtomicAddrSpace AddrSpace) const = 0;
291 
292  /// Update \p MI memory store instruction to bypass any caches up to
293  /// the \p Scope memory scope for address spaces \p
294  /// AddrSpace. Return true iff the instruction was modified.
295  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
296  SIAtomicScope Scope,
297  SIAtomicAddrSpace AddrSpace) const = 0;
298 
299  /// Update \p MI memory read-modify-write instruction to bypass any caches up
300  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
301  /// iff the instruction was modified.
302  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
303  SIAtomicScope Scope,
304  SIAtomicAddrSpace AddrSpace) const = 0;
305 
306  /// Update \p MI memory instruction of kind \p Op associated with address
307  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
308  /// true iff the instruction was modified.
309  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
310  SIAtomicAddrSpace AddrSpace,
311  SIMemOp Op, bool IsVolatile,
312  bool IsNonTemporal) const = 0;
313 
314  /// Inserts any necessary instructions at position \p Pos relative
315  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
316  /// \p Op associated with address spaces \p AddrSpace have completed. Used
317  /// between memory instructions to enforce the order they become visible as
318  /// observed by other memory instructions executing in memory scope \p Scope.
319  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
320  /// address spaces. Returns true iff any instructions inserted.
321  virtual bool insertWait(MachineBasicBlock::iterator &MI,
322  SIAtomicScope Scope,
323  SIAtomicAddrSpace AddrSpace,
324  SIMemOp Op,
325  bool IsCrossAddrSpaceOrdering,
326  Position Pos) const = 0;
327 
328  /// Inserts any necessary instructions at position \p Pos relative to
329  /// instruction \p MI to ensure any subsequent memory instructions of this
330  /// thread with address spaces \p AddrSpace will observe the previous memory
331  /// operations by any thread for memory scopes up to memory scope \p Scope .
332  /// Returns true iff any instructions inserted.
333  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
334  SIAtomicScope Scope,
335  SIAtomicAddrSpace AddrSpace,
336  Position Pos) const = 0;
337 
338  /// Inserts any necessary instructions at position \p Pos relative to
339  /// instruction \p MI to ensure previous memory instructions by this thread
340  /// with address spaces \p AddrSpace have completed and can be observed by
341  /// subsequent memory instructions by any thread executing in memory scope \p
342  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
343  /// between address spaces. Returns true iff any instructions inserted.
344  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
345  SIAtomicScope Scope,
346  SIAtomicAddrSpace AddrSpace,
347  bool IsCrossAddrSpaceOrdering,
348  Position Pos) const = 0;
349 
350  /// Virtual destructor to allow derivations to be deleted.
351  virtual ~SICacheControl() = default;
352 
353 };
354 
355 class SIGfx6CacheControl : public SICacheControl {
356 protected:
357 
358  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
359  /// is modified, false otherwise.
360  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
361  return enableNamedBit(MI, AMDGPU::CPol::GLC);
362  }
363 
364  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
365  /// is modified, false otherwise.
366  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
367  return enableNamedBit(MI, AMDGPU::CPol::SLC);
368  }
369 
370 public:
371 
372  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
373 
374  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
375  SIAtomicScope Scope,
376  SIAtomicAddrSpace AddrSpace) const override;
377 
378  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
379  SIAtomicScope Scope,
380  SIAtomicAddrSpace AddrSpace) const override;
381 
382  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
383  SIAtomicScope Scope,
384  SIAtomicAddrSpace AddrSpace) const override;
385 
386  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
387  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
388  bool IsVolatile,
389  bool IsNonTemporal) const override;
390 
391  bool insertWait(MachineBasicBlock::iterator &MI,
392  SIAtomicScope Scope,
393  SIAtomicAddrSpace AddrSpace,
394  SIMemOp Op,
395  bool IsCrossAddrSpaceOrdering,
396  Position Pos) const override;
397 
398  bool insertAcquire(MachineBasicBlock::iterator &MI,
399  SIAtomicScope Scope,
400  SIAtomicAddrSpace AddrSpace,
401  Position Pos) const override;
402 
403  bool insertRelease(MachineBasicBlock::iterator &MI,
404  SIAtomicScope Scope,
405  SIAtomicAddrSpace AddrSpace,
406  bool IsCrossAddrSpaceOrdering,
407  Position Pos) const override;
408 };
409 
410 class SIGfx7CacheControl : public SIGfx6CacheControl {
411 public:
412 
413  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
414 
415  bool insertAcquire(MachineBasicBlock::iterator &MI,
416  SIAtomicScope Scope,
417  SIAtomicAddrSpace AddrSpace,
418  Position Pos) const override;
419 
420 };
421 
422 class SIGfx90ACacheControl : public SIGfx7CacheControl {
423 public:
424 
425  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
426 
427  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
428  SIAtomicScope Scope,
429  SIAtomicAddrSpace AddrSpace) const override;
430 
431  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
432  SIAtomicScope Scope,
433  SIAtomicAddrSpace AddrSpace) const override;
434 
435  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
436  SIAtomicScope Scope,
437  SIAtomicAddrSpace AddrSpace) const override;
438 
439  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
440  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
441  bool IsVolatile,
442  bool IsNonTemporal) const override;
443 
444  bool insertWait(MachineBasicBlock::iterator &MI,
445  SIAtomicScope Scope,
446  SIAtomicAddrSpace AddrSpace,
447  SIMemOp Op,
448  bool IsCrossAddrSpaceOrdering,
449  Position Pos) const override;
450 
451  bool insertAcquire(MachineBasicBlock::iterator &MI,
452  SIAtomicScope Scope,
453  SIAtomicAddrSpace AddrSpace,
454  Position Pos) const override;
455 };
456 
457 class SIGfx10CacheControl : public SIGfx7CacheControl {
458 protected:
459 
460  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
461  /// is modified, false otherwise.
462  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
463  return enableNamedBit(MI, AMDGPU::CPol::DLC);
464  }
465 
466 public:
467 
468  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
469 
470  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
471  SIAtomicScope Scope,
472  SIAtomicAddrSpace AddrSpace) const override;
473 
474  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
475  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
476  bool IsVolatile,
477  bool IsNonTemporal) const override;
478 
479  bool insertWait(MachineBasicBlock::iterator &MI,
480  SIAtomicScope Scope,
481  SIAtomicAddrSpace AddrSpace,
482  SIMemOp Op,
483  bool IsCrossAddrSpaceOrdering,
484  Position Pos) const override;
485 
486  bool insertAcquire(MachineBasicBlock::iterator &MI,
487  SIAtomicScope Scope,
488  SIAtomicAddrSpace AddrSpace,
489  Position Pos) const override;
490 };
491 
492 class SIMemoryLegalizer final : public MachineFunctionPass {
493 private:
494 
495  /// Cache Control.
496  std::unique_ptr<SICacheControl> CC = nullptr;
497 
498  /// List of atomic pseudo instructions.
499  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
500 
501  /// Return true iff instruction \p MI is an atomic instruction that
502  /// returns a result.
503  bool isAtomicRet(const MachineInstr &MI) const {
504  return SIInstrInfo::isAtomicRet(MI);
505  }
506 
507  /// Removes all processed atomic pseudo instructions from the current
508  /// function. Returns true if current function is modified, false otherwise.
509  bool removeAtomicPseudoMIs();
510 
511  /// Expands load operation \p MI. Returns true if instructions are
512  /// added/deleted or \p MI is modified, false otherwise.
513  bool expandLoad(const SIMemOpInfo &MOI,
514  MachineBasicBlock::iterator &MI);
515  /// Expands store operation \p MI. Returns true if instructions are
516  /// added/deleted or \p MI is modified, false otherwise.
517  bool expandStore(const SIMemOpInfo &MOI,
518  MachineBasicBlock::iterator &MI);
519  /// Expands atomic fence operation \p MI. Returns true if
520  /// instructions are added/deleted or \p MI is modified, false otherwise.
521  bool expandAtomicFence(const SIMemOpInfo &MOI,
522  MachineBasicBlock::iterator &MI);
523  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
524  /// instructions are added/deleted or \p MI is modified, false otherwise.
525  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
526  MachineBasicBlock::iterator &MI);
527 
528 public:
529  static char ID;
530 
531  SIMemoryLegalizer() : MachineFunctionPass(ID) {}
532 
533  void getAnalysisUsage(AnalysisUsage &AU) const override {
534  AU.setPreservesCFG();
535  MachineFunctionPass::getAnalysisUsage(AU);
536  }
537 
538  StringRef getPassName() const override {
539  return PASS_NAME;
540  }
541 
542  bool runOnMachineFunction(MachineFunction &MF) override;
543 };
544 
545 } // end namespace anonymous
546 
547 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
548  const char *Msg) const {
549  const Function &Func = MI->getParent()->getParent()->getFunction();
550  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
551  Func.getContext().diagnose(Diag);
552 }
553 
554 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
555 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
556  SIAtomicAddrSpace InstrAddrSpace) const {
557  if (SSID == SyncScope::System)
558  return std::make_tuple(SIAtomicScope::SYSTEM,
559  SIAtomicAddrSpace::ATOMIC,
560  true);
561  if (SSID == MMI->getAgentSSID())
562  return std::make_tuple(SIAtomicScope::AGENT,
563  SIAtomicAddrSpace::ATOMIC,
564  true);
565  if (SSID == MMI->getWorkgroupSSID())
566  return std::make_tuple(SIAtomicScope::WORKGROUP,
567  SIAtomicAddrSpace::ATOMIC,
568  true);
569  if (SSID == MMI->getWavefrontSSID())
570  return std::make_tuple(SIAtomicScope::WAVEFRONT,
571  SIAtomicAddrSpace::ATOMIC,
572  true);
573  if (SSID == SyncScope::SingleThread)
574  return std::make_tuple(SIAtomicScope::SINGLETHREAD,
575  SIAtomicAddrSpace::ATOMIC,
576  true);
577  if (SSID == MMI->getSystemOneAddressSpaceSSID())
578  return std::make_tuple(SIAtomicScope::SYSTEM,
579  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
580  false);
581  if (SSID == MMI->getAgentOneAddressSpaceSSID())
582  return std::make_tuple(SIAtomicScope::AGENT,
583  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
584  false);
585  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
586  return std::make_tuple(SIAtomicScope::WORKGROUP,
587  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
588  false);
589  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
590  return std::make_tuple(SIAtomicScope::WAVEFRONT,
591  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
592  false);
593  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
594  return std::make_tuple(SIAtomicScope::SINGLETHREAD,
595  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
596  false);
597  return None;
598 }
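
// Illustrative note, not part of the original file: the SSIDs checked above
// correspond to the target sync-scope strings described in AMDGPUUsage. For
// example, an IR atomic with syncscope("agent") maps to SIAtomicScope::AGENT
// ordering all atomic address spaces, while the "one-as" variants such as
// syncscope("agent-one-as") order only the address spaces the instruction
// itself accesses, so the third tuple element (cross address space ordering)
// is false for them.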
599 
600 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
601  if (AS == AMDGPUAS::FLAT_ADDRESS)
602  return SIAtomicAddrSpace::FLAT;
603  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
604  return SIAtomicAddrSpace::GLOBAL;
605  if (AS == AMDGPUAS::LOCAL_ADDRESS)
606  return SIAtomicAddrSpace::LDS;
607  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
608  return SIAtomicAddrSpace::SCRATCH;
609  if (AS == AMDGPUAS::REGION_ADDRESS)
610  return SIAtomicAddrSpace::GDS;
611 
612  return SIAtomicAddrSpace::OTHER;
613 }
614 
615 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
616  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
617 }
618 
619 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
620  const MachineBasicBlock::iterator &MI) const {
621  assert(MI->getNumMemOperands() > 0);
622 
625  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
626  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
627  bool IsNonTemporal = true;
628  bool IsVolatile = false;
629 
630  // Validator should check whether or not MMOs cover the entire set of
631  // locations accessed by the memory instruction.
632  for (const auto &MMO : MI->memoperands()) {
633  IsNonTemporal &= MMO->isNonTemporal();
634  IsVolatile |= MMO->isVolatile();
635  InstrAddrSpace |=
636  toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
637  AtomicOrdering OpOrdering = MMO->getOrdering();
638  if (OpOrdering != AtomicOrdering::NotAtomic) {
639  const auto &IsSyncScopeInclusion =
640  MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
641  if (!IsSyncScopeInclusion) {
642  reportUnsupported(MI,
643  "Unsupported non-inclusive atomic synchronization scope");
644  return None;
645  }
646 
647  SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
648  Ordering =
649  isStrongerThan(Ordering, OpOrdering) ?
650  Ordering : MMO->getOrdering();
651  assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
652  MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
653  FailureOrdering =
654  isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
655  FailureOrdering : MMO->getFailureOrdering();
656  }
657  }
658 
659  SIAtomicScope Scope = SIAtomicScope::NONE;
660  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
661  bool IsCrossAddressSpaceOrdering = false;
662  if (Ordering != AtomicOrdering::NotAtomic) {
663  auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
664  if (!ScopeOrNone) {
665  reportUnsupported(MI, "Unsupported atomic synchronization scope");
666  return None;
667  }
668  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
669  ScopeOrNone.getValue();
670  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
671  ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
672  ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
673  reportUnsupported(MI, "Unsupported atomic address space");
674  return None;
675  }
676  }
677  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
678  IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
679  IsNonTemporal);
680 }
681 
682 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
683  const MachineBasicBlock::iterator &MI) const {
684  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
685 
686  if (!(MI->mayLoad() && !MI->mayStore()))
687  return None;
688 
689  // Be conservative if there are no memory operands.
690  if (MI->getNumMemOperands() == 0)
691  return SIMemOpInfo();
692 
693  return constructFromMIWithMMO(MI);
694 }
695 
696 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
697  const MachineBasicBlock::iterator &MI) const {
698  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
699 
700  if (!(!MI->mayLoad() && MI->mayStore()))
701  return None;
702 
703  // Be conservative if there are no memory operands.
704  if (MI->getNumMemOperands() == 0)
705  return SIMemOpInfo();
706 
707  return constructFromMIWithMMO(MI);
708 }
709 
710 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
711  const MachineBasicBlock::iterator &MI) const {
712  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
713 
714  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
715  return None;
716 
717  AtomicOrdering Ordering =
718  static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
719 
720  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
721  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
722  if (!ScopeOrNone) {
723  reportUnsupported(MI, "Unsupported atomic synchronization scope");
724  return None;
725  }
726 
727  SIAtomicScope Scope = SIAtomicScope::NONE;
728  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
729  bool IsCrossAddressSpaceOrdering = false;
730  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
731  ScopeOrNone.getValue();
732 
733  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
734  ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
735  reportUnsupported(MI, "Unsupported atomic address space");
736  return None;
737  }
738 
739  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
740  IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
741 }
742 
743 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
744  const MachineBasicBlock::iterator &MI) const {
745  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
746 
747  if (!(MI->mayLoad() && MI->mayStore()))
748  return None;
749 
750  // Be conservative if there are no memory operands.
751  if (MI->getNumMemOperands() == 0)
752  return SIMemOpInfo();
753 
754  return constructFromMIWithMMO(MI);
755 }
756 
757 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
758  TII = ST.getInstrInfo();
759  IV = getIsaVersion(ST.getCPU());
760  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
761 }
762 
763 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
764  AMDGPU::CPol::CPol Bit) const {
765  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
766  if (!CPol)
767  return false;
768 
769  CPol->setImm(CPol->getImm() | Bit);
770  return true;
771 }
772 
773 /* static */
774 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
775  GCNSubtarget::Generation Generation = ST.getGeneration();
776  if (ST.hasGFX90AInsts())
777  return std::make_unique<SIGfx90ACacheControl>(ST);
778  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
779  return std::make_unique<SIGfx6CacheControl>(ST);
780  if (Generation < AMDGPUSubtarget::GFX10)
781  return std::make_unique<SIGfx7CacheControl>(ST);
782  return std::make_unique<SIGfx10CacheControl>(ST);
783 }
784 
785 bool SIGfx6CacheControl::enableLoadCacheBypass(
786  const MachineBasicBlock::iterator &MI,
787  SIAtomicScope Scope,
788  SIAtomicAddrSpace AddrSpace) const {
789  assert(MI->mayLoad() && !MI->mayStore());
790  bool Changed = false;
791 
792  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
793  switch (Scope) {
794  case SIAtomicScope::SYSTEM:
795  case SIAtomicScope::AGENT:
796  Changed |= enableGLCBit(MI);
797  break;
798  case SIAtomicScope::WORKGROUP:
799  case SIAtomicScope::WAVEFRONT:
800  case SIAtomicScope::SINGLETHREAD:
801  // No cache to bypass.
802  break;
803  default:
804  llvm_unreachable("Unsupported synchronization scope");
805  }
806  }
807 
808  /// The scratch address space does not need the global memory caches
809  /// to be bypassed as all memory operations by the same thread are
810  /// sequentially consistent, and no other thread can access scratch
811  /// memory.
812 
813  /// Other address spaces do not have a cache.
814 
815  return Changed;
816 }
817 
818 bool SIGfx6CacheControl::enableStoreCacheBypass(
819  const MachineBasicBlock::iterator &MI,
820  SIAtomicScope Scope,
821  SIAtomicAddrSpace AddrSpace) const {
822  assert(!MI->mayLoad() && MI->mayStore());
823  bool Changed = false;
824 
825  /// The L1 cache is write through so does not need to be bypassed. There is no
826  /// bypass control for the L2 cache at the isa level.
827 
828  return Changed;
829 }
830 
831 bool SIGfx6CacheControl::enableRMWCacheBypass(
832  const MachineBasicBlock::iterator &MI,
833  SIAtomicScope Scope,
834  SIAtomicAddrSpace AddrSpace) const {
835  assert(MI->mayLoad() && MI->mayStore());
836  bool Changed = false;
837 
838  /// The L1 cache is write through so does not need to be bypassed. There is no
839  /// bypass control for the L2 cache at the isa level.
840 
841  return Changed;
842 }
843 
844 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
845  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
846  bool IsVolatile, bool IsNonTemporal) const {
847  // Only handle load and store, not atomic read-modify-write instructions. The
848  // latter use glc to indicate if the atomic returns a result and so must not
849  // be used for cache control.
850  assert(MI->mayLoad() ^ MI->mayStore());
851 
852  // Only update load and store, not LLVM IR atomic read-modify-write
853  // instructions. The latter are always marked as volatile, so handling that
854  // attribute here would pessimize all atomics; they also do not support the
855  // nontemporal attribute.
856  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
857 
858  bool Changed = false;
859 
860  if (IsVolatile) {
861  if (Op == SIMemOp::LOAD)
862  Changed |= enableGLCBit(MI);
863 
864  // Ensure operation has completed at system scope to cause all volatile
865  // operations to be visible outside the program in a global order. Do not
866  // request cross address space as only the global address space can be
867  // observable outside the program, so no need to cause a waitcnt for LDS
868  // address space operations.
869  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
870  Position::AFTER);
871 
872  return Changed;
873  }
874 
875  if (IsNonTemporal) {
876  // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
877  Changed |= enableGLCBit(MI);
878  Changed |= enableSLCBit(MI);
879  return Changed;
880  }
881 
882  return Changed;
883 }
884 
885 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
886  SIAtomicScope Scope,
887  SIAtomicAddrSpace AddrSpace,
888  SIMemOp Op,
889  bool IsCrossAddrSpaceOrdering,
890  Position Pos) const {
891  bool Changed = false;
892 
893  MachineBasicBlock &MBB = *MI->getParent();
894  DebugLoc DL = MI->getDebugLoc();
895 
896  if (Pos == Position::AFTER)
897  ++MI;
898 
899  bool VMCnt = false;
900  bool LGKMCnt = false;
901 
902  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
903  SIAtomicAddrSpace::NONE) {
904  switch (Scope) {
905  case SIAtomicScope::SYSTEM:
906  case SIAtomicScope::AGENT:
907  VMCnt |= true;
908  break;
909  case SIAtomicScope::WORKGROUP:
910  case SIAtomicScope::WAVEFRONT:
911  case SIAtomicScope::SINGLETHREAD:
912  // The L1 cache keeps all memory operations in order for
913  // wavefronts in the same work-group.
914  break;
915  default:
916  llvm_unreachable("Unsupported synchronization scope");
917  }
918  }
919 
920  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
921  switch (Scope) {
922  case SIAtomicScope::SYSTEM:
923  case SIAtomicScope::AGENT:
924  case SIAtomicScope::WORKGROUP:
925  // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
926  // not needed as LDS operations for all waves are executed in a total
927  // global ordering as observed by all waves. Required if also
928  // synchronizing with global/GDS memory as LDS operations could be
929  // reordered with respect to later global/GDS memory operations of the
930  // same wave.
931  LGKMCnt |= IsCrossAddrSpaceOrdering;
932  break;
933  case SIAtomicScope::WAVEFRONT:
934  case SIAtomicScope::SINGLETHREAD:
935  // The LDS keeps all memory operations in order for
936  // the same wavefront.
937  break;
938  default:
939  llvm_unreachable("Unsupported synchronization scope");
940  }
941  }
942 
943  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
944  switch (Scope) {
945  case SIAtomicScope::SYSTEM:
946  case SIAtomicScope::AGENT:
947  // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
948  // is not needed as GDS operations for all waves are executed in a total
949  // global ordering as observed by all waves. Required if also
950  // synchronizing with global/LDS memory as GDS operations could be
951  // reordered with respect to later global/LDS memory operations of the
952  // same wave.
953  LGKMCnt |= IsCrossAddrSpaceOrdering;
954  break;
955  case SIAtomicScope::WORKGROUP:
956  case SIAtomicScope::WAVEFRONT:
957  case SIAtomicScope::SINGLETHREAD:
958  // The GDS keeps all memory operations in order for
959  // the same work-group.
960  break;
961  default:
962  llvm_unreachable("Unsupported synchronization scope");
963  }
964  }
965 
966  if (VMCnt || LGKMCnt) {
967  unsigned WaitCntImmediate =
968  AMDGPU::encodeWaitcnt(IV,
969  VMCnt ? 0 : getVmcntBitMask(IV),
970  getExpcntBitMask(IV),
971  LGKMCnt ? 0 : getLgkmcntBitMask(IV));
972  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
973  Changed = true;
974  }
975 
976  if (Pos == Position::AFTER)
977  --MI;
978 
979  return Changed;
980 }
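
// Worked example, not original code: for an agent-scope ordering of an
// address space set that includes global memory and LDS with cross address
// space ordering, both VMCnt and LGKMCnt are set above, so the immediate is
//
//   AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0, getExpcntBitMask(IV),
//                         /*Lgkmcnt=*/0);
//
// which prints as "s_waitcnt vmcnt(0) lgkmcnt(0)": wait for all outstanding
// vector memory and LDS/GDS operations while leaving expcnt at its "no wait"
// mask.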
981 
982 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
983  SIAtomicScope Scope,
984  SIAtomicAddrSpace AddrSpace,
985  Position Pos) const {
986  if (!InsertCacheInv)
987  return false;
988 
989  bool Changed = false;
990 
991  MachineBasicBlock &MBB = *MI->getParent();
992  DebugLoc DL = MI->getDebugLoc();
993 
994  if (Pos == Position::AFTER)
995  ++MI;
996 
997  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
998  switch (Scope) {
999  case SIAtomicScope::SYSTEM:
1000  case SIAtomicScope::AGENT:
1001  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1002  Changed = true;
1003  break;
1004  case SIAtomicScope::WORKGROUP:
1005  case SIAtomicScope::WAVEFRONT:
1006  case SIAtomicScope::SINGLETHREAD:
1007  // No cache to invalidate.
1008  break;
1009  default:
1010  llvm_unreachable("Unsupported synchronization scope");
1011  }
1012  }
1013 
1014  /// The scratch address space does not need the global memory cache
1015  /// to be flushed as all memory operations by the same thread are
1016  /// sequentially consistent, and no other thread can access scratch
1017  /// memory.
1018 
1019  /// Other address spaces do not have a cache.
1020 
1021  if (Pos == Position::AFTER)
1022  --MI;
1023 
1024  return Changed;
1025 }
1026 
1027 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1028  SIAtomicScope Scope,
1029  SIAtomicAddrSpace AddrSpace,
1030  bool IsCrossAddrSpaceOrdering,
1031  Position Pos) const {
1032  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1033  IsCrossAddrSpaceOrdering, Pos);
1034 }
1035 
1036 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1037  SIAtomicScope Scope,
1038  SIAtomicAddrSpace AddrSpace,
1039  Position Pos) const {
1040  if (!InsertCacheInv)
1041  return false;
1042 
1043  bool Changed = false;
1044 
1045  MachineBasicBlock &MBB = *MI->getParent();
1046  DebugLoc DL = MI->getDebugLoc();
1047 
1048  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1049 
1050  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1051  ? AMDGPU::BUFFER_WBINVL1
1052  : AMDGPU::BUFFER_WBINVL1_VOL;
1053 
1054  if (Pos == Position::AFTER)
1055  ++MI;
1056 
1057  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1058  switch (Scope) {
1059  case SIAtomicScope::SYSTEM:
1060  case SIAtomicScope::AGENT:
1061  BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1062  Changed = true;
1063  break;
1064  case SIAtomicScope::WORKGROUP:
1065  case SIAtomicScope::WAVEFRONT:
1066  case SIAtomicScope::SINGLETHREAD:
1067  // No cache to invalidate.
1068  break;
1069  default:
1070  llvm_unreachable("Unsupported synchronization scope");
1071  }
1072  }
1073 
1074  /// The scratch address space does not need the global memory cache
1075  /// to be flushed as all memory operations by the same thread are
1076  /// sequentially consistent, and no other thread can access scratch
1077  /// memory.
1078 
1079  /// Other address spaces do not have a cache.
1080 
1081  if (Pos == Position::AFTER)
1082  --MI;
1083 
1084  return Changed;
1085 }
1086 
1087 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1088  const MachineBasicBlock::iterator &MI,
1089  SIAtomicScope Scope,
1090  SIAtomicAddrSpace AddrSpace) const {
1091  assert(MI->mayLoad() && !MI->mayStore());
1092  bool Changed = false;
1093 
1094  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1095  switch (Scope) {
1096  case SIAtomicScope::SYSTEM:
1097  case SIAtomicScope::AGENT:
1098  Changed |= enableGLCBit(MI);
1099  break;
1100  case SIAtomicScope::WORKGROUP:
1101  // In threadgroup split mode the waves of a work-group can be executing on
1102  // different CUs. Therefore need to bypass the L1 which is per CU.
1103  // Otherwise in non-threadgroup split mode all waves of a work-group are
1104  // on the same CU, and so the L1 does not need to be bypassed.
1105  if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI);
1106  break;
1107  case SIAtomicScope::WAVEFRONT:
1108  case SIAtomicScope::SINGLETHREAD:
1109  // No cache to bypass.
1110  break;
1111  default:
1112  llvm_unreachable("Unsupported synchronization scope");
1113  }
1114  }
1115 
1116  /// The scratch address space does not need the global memory caches
1117  /// to be bypassed as all memory operations by the same thread are
1118  /// sequentially consistent, and no other thread can access scratch
1119  /// memory.
1120 
1121  /// Other address spaces do not have a cache.
1122 
1123  return Changed;
1124 }
1125 
1126 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1127  const MachineBasicBlock::iterator &MI,
1128  SIAtomicScope Scope,
1129  SIAtomicAddrSpace AddrSpace) const {
1130  assert(!MI->mayLoad() && MI->mayStore());
1131  bool Changed = false;
1132 
1133  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1134  switch (Scope) {
1135  case SIAtomicScope::SYSTEM:
1136  case SIAtomicScope::AGENT:
1137  /// Do not set glc for store atomic operations as they implicitly write
1138  /// through the L1 cache.
1139  break;
1140  case SIAtomicScope::WORKGROUP:
1141  case SIAtomicScope::WAVEFRONT:
1142  case SIAtomicScope::SINGLETHREAD:
1143  // No cache to bypass. Store atomics implicitly write through the L1
1144  // cache.
1145  break;
1146  default:
1147  llvm_unreachable("Unsupported synchronization scope");
1148  }
1149  }
1150 
1151  /// The scratch address space does not need the global memory caches
1152  /// to be bypassed as all memory operations by the same thread are
1153  /// sequentially consistent, and no other thread can access scratch
1154  /// memory.
1155 
1156  /// Other address spaces do not have a cache.
1157 
1158  return Changed;
1159 }
1160 
1161 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1162  const MachineBasicBlock::iterator &MI,
1163  SIAtomicScope Scope,
1164  SIAtomicAddrSpace AddrSpace) const {
1165  assert(MI->mayLoad() && MI->mayStore());
1166  bool Changed = false;
1167 
1168  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1169  switch (Scope) {
1170  case SIAtomicScope::SYSTEM:
1171  case SIAtomicScope::AGENT:
1172  /// Do not set glc for RMW atomic operations as they implicitly bypass
1173  /// the L1 cache, and the glc bit is instead used to indicate if they are
1174  /// return or no-return.
1175  break;
1176  case SIAtomicScope::WORKGROUP:
1177  case SIAtomicScope::WAVEFRONT:
1178  case SIAtomicScope::SINGLETHREAD:
1179  // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1180  break;
1181  default:
1182  llvm_unreachable("Unsupported synchronization scope");
1183  }
1184  }
1185 
1186  return Changed;
1187 }
1188 
1189 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1190  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1191  bool IsVolatile, bool IsNonTemporal) const {
1192  // Only handle load and store, not atomic read-modify-write instructions. The
1193  // latter use glc to indicate if the atomic returns a result and so must not
1194  // be used for cache control.
1195  assert(MI->mayLoad() ^ MI->mayStore());
1196 
1197  // Only update load and store, not LLVM IR atomic read-modify-write
1198  // instructions. The latter are always marked as volatile, so handling that
1199  // attribute here would pessimize all atomics; they also do not support the
1200  // nontemporal attribute.
1201  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1202 
1203  bool Changed = false;
1204 
1205  if (IsVolatile) {
1206  if (Op == SIMemOp::LOAD) {
1207  Changed |= enableGLCBit(MI);
1208  }
1209 
1210  // Ensure operation has completed at system scope to cause all volatile
1211  // operations to be visible outside the program in a global order. Do not
1212  // request cross address space as only the global address space can be
1213  // observable outside the program, so no need to cause a waitcnt for LDS
1214  // address space operations.
1215  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1216  Position::AFTER);
1217 
1218  return Changed;
1219  }
1220 
1221  if (IsNonTemporal) {
1222  // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
1223  Changed |= enableGLCBit(MI);
1224  Changed |= enableSLCBit(MI);
1225  return Changed;
1226  }
1227 
1228  return Changed;
1229 }
1230 
1231 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1232  SIAtomicScope Scope,
1233  SIAtomicAddrSpace AddrSpace,
1234  SIMemOp Op,
1235  bool IsCrossAddrSpaceOrdering,
1236  Position Pos) const {
1237  if (ST.isTgSplitEnabled()) {
1238  // In threadgroup split mode the waves of a work-group can be executing on
1239  // different CUs. Therefore need to wait for global or GDS memory operations
1240  // to complete to ensure they are visible to waves in the other CUs.
1241  // Otherwise in non-threadgroup split mode all waves of a work-group are on
1242  // the same CU, so no need to wait for global memory as all waves in the
1243  // work-group access the same L1, nor wait for GDS as accesses are ordered
1244  // on a CU.
1245  if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1246  SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1247  (Scope == SIAtomicScope::WORKGROUP)) {
1248  // Same as GFX7 using agent scope.
1249  Scope = SIAtomicScope::AGENT;
1250  }
1251  // In threadgroup split mode LDS cannot be allocated so no need to wait for
1252  // LDS memory operations.
1253  AddrSpace &= ~SIAtomicAddrSpace::LDS;
1254  }
1255  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1256  IsCrossAddrSpaceOrdering, Pos);
1257 }
1258 
1259 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1260  SIAtomicScope Scope,
1261  SIAtomicAddrSpace AddrSpace,
1262  Position Pos) const {
1263  if (!InsertCacheInv)
1264  return false;
1265 
1266  bool Changed = false;
1267 
1268  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1269  switch (Scope) {
1270  case SIAtomicScope::SYSTEM:
1271  case SIAtomicScope::AGENT:
1272  // Same as GFX7.
1273  break;
1274  case SIAtomicScope::WORKGROUP:
1275  // In threadgroup split mode the waves of a work-group can be executing on
1276  // different CUs. Therefore need to invalidate the L1 which is per CU.
1277  // Otherwise in non-threadgroup split mode all waves of a work-group are
1278  // on the same CU, and so the L1 does not need to be invalidated.
1279  if (ST.isTgSplitEnabled()) {
1280  // Same as GFX7 using agent scope.
1281  Scope = SIAtomicScope::AGENT;
1282  }
1283  break;
1284  case SIAtomicScope::WAVEFRONT:
1285  case SIAtomicScope::SINGLETHREAD:
1286  // Same as GFX7.
1287  break;
1288  default:
1289  llvm_unreachable("Unsupported synchronization scope");
1290  }
1291  }
1292 
1293  /// The scratch address space does not need the global memory cache
1294  /// to be flushed as all memory operations by the same thread are
1295  /// sequentially consistent, and no other thread can access scratch
1296  /// memory.
1297 
1298  /// Other address spaces do not have a cache.
1299 
1300  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1301 
1302  return Changed;
1303 }
1304 
1305 bool SIGfx10CacheControl::enableLoadCacheBypass(
1306  const MachineBasicBlock::iterator &MI,
1307  SIAtomicScope Scope,
1308  SIAtomicAddrSpace AddrSpace) const {
1309  assert(MI->mayLoad() && !MI->mayStore());
1310  bool Changed = false;
1311 
1312  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1313  /// TODO Do not set glc for rmw atomic operations as they
1314  /// implicitly bypass the L0/L1 caches.
1315 
1316  switch (Scope) {
1317  case SIAtomicScope::SYSTEM:
1318  case SIAtomicScope::AGENT:
1319  Changed |= enableGLCBit(MI);
1320  Changed |= enableDLCBit(MI);
1321  break;
1322  case SIAtomicScope::WORKGROUP:
1323  // In WGP mode the waves of a work-group can be executing on either CU of
1324  // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1325  // CU mode all waves of a work-group are on the same CU, and so the L0
1326  // does not need to be bypassed.
1327  if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
1328  break;
1329  case SIAtomicScope::WAVEFRONT:
1330  case SIAtomicScope::SINGLETHREAD:
1331  // No cache to bypass.
1332  break;
1333  default:
1334  llvm_unreachable("Unsupported synchronization scope");
1335  }
1336  }
1337 
1338  /// The scratch address space does not need the global memory caches
1339  /// to be bypassed as all memory operations by the same thread are
1340  /// sequentially consistent, and no other thread can access scratch
1341  /// memory.
1342 
1343  /// Other address spaces do not have a cache.
1344 
1345  return Changed;
1346 }
1347 
1348 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1349  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1350  bool IsVolatile, bool IsNonTemporal) const {
1351 
1352  // Only handle load and store, not atomic read-modify-write instructions. The
1353  // latter use glc to indicate if the atomic returns a result and so must not
1354  // be used for cache control.
1355  assert(MI->mayLoad() ^ MI->mayStore());
1356 
1357  // Only update load and store, not LLVM IR atomic read-modify-write
1358  // instructions. The latter are always marked as volatile, so handling that
1359  // attribute here would pessimize all atomics; they also do not support the
1360  // nontemporal attribute.
1361  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1362 
1363  bool Changed = false;
1364 
1365  if (IsVolatile) {
1366 
1367  if (Op == SIMemOp::LOAD) {
1368  Changed |= enableGLCBit(MI);
1369  Changed |= enableDLCBit(MI);
1370  }
1371 
1372  // Ensure operation has completed at system scope to cause all volatile
1373  // operations to be visible outside the program in a global order. Do not
1374  // request cross address space as only the global address space can be
1375  // observable outside the program, so no need to cause a waitcnt for LDS
1376  // address space operations.
1377  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1378  Position::AFTER);
1379  return Changed;
1380  }
1381 
1382  if (IsNonTemporal) {
1383  // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
1384  Changed |= enableSLCBit(MI);
1385  return Changed;
1386  }
1387 
1388  return Changed;
1389 }
1390 
1391 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1392  SIAtomicScope Scope,
1393  SIAtomicAddrSpace AddrSpace,
1394  SIMemOp Op,
1395  bool IsCrossAddrSpaceOrdering,
1396  Position Pos) const {
1397  bool Changed = false;
1398 
1399  MachineBasicBlock &MBB = *MI->getParent();
1400  DebugLoc DL = MI->getDebugLoc();
1401 
1402  if (Pos == Position::AFTER)
1403  ++MI;
1404 
1405  bool VMCnt = false;
1406  bool VSCnt = false;
1407  bool LGKMCnt = false;
1408 
1409  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1410  SIAtomicAddrSpace::NONE) {
1411  switch (Scope) {
1412  case SIAtomicScope::SYSTEM:
1413  case SIAtomicScope::AGENT:
1414  if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1415  VMCnt |= true;
1416  if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1417  VSCnt |= true;
1418  break;
1419  case SIAtomicScope::WORKGROUP:
1420  // In WGP mode the waves of a work-group can be executing on either CU of
1421  // the WGP. Therefore need to wait for operations to complete to ensure
1422  // they are visible to waves in the other CU as the L0 is per CU.
1423  // Otherwise in CU mode all waves of a work-group are on the same CU
1424  // which shares the same L0.
1425  if (!ST.isCuModeEnabled()) {
1426  if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1427  VMCnt |= true;
1428  if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1429  VSCnt |= true;
1430  }
1431  break;
1432  case SIAtomicScope::WAVEFRONT:
1433  case SIAtomicScope::SINGLETHREAD:
1434  // The L0 cache keeps all memory operations in order for
1435  // work-items in the same wavefront.
1436  break;
1437  default:
1438  llvm_unreachable("Unsupported synchronization scope");
1439  }
1440  }
1441 
1442  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1443  switch (Scope) {
1444  case SIAtomicScope::SYSTEM:
1445  case SIAtomicScope::AGENT:
1446  case SIAtomicScope::WORKGROUP:
1447  // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1448  // not needed as LDS operations for all waves are executed in a total
1449  // global ordering as observed by all waves. Required if also
1450  // synchronizing with global/GDS memory as LDS operations could be
1451  // reordered with respect to later global/GDS memory operations of the
1452  // same wave.
1453  LGKMCnt |= IsCrossAddrSpaceOrdering;
1454  break;
1455  case SIAtomicScope::WAVEFRONT:
1456  case SIAtomicScope::SINGLETHREAD:
1457  // The LDS keeps all memory operations in order for
1458  // the same wavefront.
1459  break;
1460  default:
1461  llvm_unreachable("Unsupported synchronization scope");
1462  }
1463  }
1464 
1465  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1466  switch (Scope) {
1467  case SIAtomicScope::SYSTEM:
1468  case SIAtomicScope::AGENT:
1469  // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1470  // is not needed as GDS operations for all waves are executed in a total
1471  // global ordering as observed by all waves. Required if also
1472  // synchronizing with global/LDS memory as GDS operations could be
1473  // reordered with respect to later global/LDS memory operations of the
1474  // same wave.
1475  LGKMCnt |= IsCrossAddrSpaceOrdering;
1476  break;
1477  case SIAtomicScope::WORKGROUP:
1478  case SIAtomicScope::WAVEFRONT:
1479  case SIAtomicScope::SINGLETHREAD:
1480  // The GDS keeps all memory operations in order for
1481  // the same work-group.
1482  break;
1483  default:
1484  llvm_unreachable("Unsupported synchronization scope");
1485  }
1486  }
1487 
1488  if (VMCnt || LGKMCnt) {
1489  unsigned WaitCntImmediate =
1490  AMDGPU::encodeWaitcnt(IV,
1491  VMCnt ? 0 : getVmcntBitMask(IV),
1492  getExpcntBitMask(IV),
1493  LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1494  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1495  Changed = true;
1496  }
1497 
1498  if (VSCnt) {
1499  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1500  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1501  .addImm(0);
1502  Changed = true;
1503  }
1504 
1505  if (Pos == Position::AFTER)
1506  --MI;
1507 
1508  return Changed;
1509 }
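
// Illustrative note, not part of the original file: unlike GFX6-GFX9, GFX10
// counts outstanding vector stores separately from loads, so when VSCnt is
// set the code above additionally emits
//
//   s_waitcnt_vscnt null, 0x0
//
// alongside the ordinary s_waitcnt built from encodeWaitcnt.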
1510 
1511 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1512  SIAtomicScope Scope,
1513  SIAtomicAddrSpace AddrSpace,
1514  Position Pos) const {
1515  if (!InsertCacheInv)
1516  return false;
1517 
1518  bool Changed = false;
1519 
1520  MachineBasicBlock &MBB = *MI->getParent();
1521  DebugLoc DL = MI->getDebugLoc();
1522 
1523  if (Pos == Position::AFTER)
1524  ++MI;
1525 
1526  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1527  switch (Scope) {
1528  case SIAtomicScope::SYSTEM:
1529  case SIAtomicScope::AGENT:
1530  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1531  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1532  Changed = true;
1533  break;
1534  case SIAtomicScope::WORKGROUP:
1535  // In WGP mode the waves of a work-group can be executing on either CU of
1536  // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1537  // in CU mode all waves of a work-group are on the same CU, and so the
1538  // L0 does not need to be invalidated.
1539  if (!ST.isCuModeEnabled()) {
1540  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1541  Changed = true;
1542  }
1543  break;
1544  case SIAtomicScope::WAVEFRONT:
1545  case SIAtomicScope::SINGLETHREAD:
1546  // No cache to invalidate.
1547  break;
1548  default:
1549  llvm_unreachable("Unsupported synchronization scope");
1550  }
1551  }
1552 
1553  /// The scratch address space does not need the global memory cache
1554  /// to be flushed as all memory operations by the same thread are
1555  /// sequentially consistent, and no other thread can access scratch
1556  /// memory.
1557 
1558  /// Other address spaces do not have a cache.
1559 
1560  if (Pos == Position::AFTER)
1561  --MI;
1562 
1563  return Changed;
1564 }
1565 
1566 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1567  if (AtomicPseudoMIs.empty())
1568  return false;
1569 
1570  for (auto &MI : AtomicPseudoMIs)
1571  MI->eraseFromParent();
1572 
1573  AtomicPseudoMIs.clear();
1574  return true;
1575 }
1576 
1577 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1578  MachineBasicBlock::iterator &MI) {
1579  assert(MI->mayLoad() && !MI->mayStore());
1580 
1581  bool Changed = false;
1582 
1583  if (MOI.isAtomic()) {
1584  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1585  MOI.getOrdering() == AtomicOrdering::Acquire ||
1586  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1587  Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1588  MOI.getOrderingAddrSpace());
1589  }
1590 
1591  if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1592  Changed |= CC->insertWait(MI, MOI.getScope(),
1593  MOI.getOrderingAddrSpace(),
1594  SIMemOp::LOAD | SIMemOp::STORE,
1595  MOI.getIsCrossAddressSpaceOrdering(),
1596  Position::BEFORE);
1597 
1598  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1599  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1600  Changed |= CC->insertWait(MI, MOI.getScope(),
1601  MOI.getInstrAddrSpace(),
1602  SIMemOp::LOAD,
1603  MOI.getIsCrossAddressSpaceOrdering(),
1604  Position::AFTER);
1605  Changed |= CC->insertAcquire(MI, MOI.getScope(),
1606  MOI.getOrderingAddrSpace(),
1607  Position::AFTER);
1608  }
1609 
1610  return Changed;
1611  }
1612 
1613  // Atomic instructions already bypass caches to the scope specified by the
1614  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1615  // need additional treatment.
1616  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1617  SIMemOp::LOAD, MOI.isVolatile(),
1618  MOI.isNonTemporal());
1619  return Changed;
1620 }
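
// Illustrative sketch, assuming a GFX7 amdhsa target, not original code: an
// agent-scope acquire load from global memory is expanded by the calls above
// roughly as
//
//   flat_load_dword v0, v[0:1] glc   ; enableLoadCacheBypass sets glc
//   s_waitcnt vmcnt(0)               ; insertWait after the load
//   buffer_wbinvl1_vol               ; insertAcquire invalidates the L1
//
// so later loads by this wave cannot observe stale L1 data.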
1621 
1622 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1623  MachineBasicBlock::iterator &MI) {
1624  assert(!MI->mayLoad() && MI->mayStore());
1625 
1626  bool Changed = false;
1627 
1628  if (MOI.isAtomic()) {
1629  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1630  MOI.getOrdering() == AtomicOrdering::Release ||
1631  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1632  Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
1633  MOI.getOrderingAddrSpace());
1634  }
1635 
1636  if (MOI.getOrdering() == AtomicOrdering::Release ||
1637  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1638  Changed |= CC->insertRelease(MI, MOI.getScope(),
1639  MOI.getOrderingAddrSpace(),
1640  MOI.getIsCrossAddressSpaceOrdering(),
1641  Position::BEFORE);
1642 
1643  return Changed;
1644  }
1645 
1646  // Atomic instructions already bypass caches to the scope specified by the
1647  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1648  // need additional treatment.
1649  Changed |= CC->enableVolatileAndOrNonTemporal(
1650  MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1651  MOI.isNonTemporal());
1652  return Changed;
1653 }
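
Editorial sketch (not part of the source): a release store is the typical input for the release path of expandStore above; it gets a cache bypass for the requested scope and an insertRelease (waitcnt / write-back) before the store. Again this assumes the standard std::atomic lowering to LLVM atomic stores for an amdgcn target.

// Illustrative only.
#include <atomic>

void publish(std::atomic<int> &Flag) {
  // release -> enableStoreCacheBypass + insertRelease(Position::BEFORE).
  Flag.store(1, std::memory_order_release);
}
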
1654 
1655 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1656  MachineBasicBlock::iterator &MI) {
1657  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1658 
1659  AtomicPseudoMIs.push_back(MI);
1660  bool Changed = false;
1661 
1662  if (MOI.isAtomic()) {
1663  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1664  MOI.getOrdering() == AtomicOrdering::Release ||
1665  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1666  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1667  /// TODO: This relies on a barrier always generating a waitcnt
1668  /// for LDS to ensure it is not reordered with the completion of
1669  /// the preceding LDS operations. If the barrier had a memory
1670  /// ordering and memory scope, then the library would not need to
1671  /// generate a fence. Support for the barrier could be added in
1672  /// this file. SIInsertWaitcnt.cpp could then stop unconditionally
1673  /// adding an S_WAITCNT before an S_BARRIER.
1674  Changed |= CC->insertRelease(MI, MOI.getScope(),
1675  MOI.getOrderingAddrSpace(),
1676  MOI.getIsCrossAddressSpaceOrdering(),
1677  Position::BEFORE);
1678 
1679  // TODO: If both a release and an invalidate are happening they could be
1680  // combined to use a single "BUFFER_WBINV*" instruction. This could be done
1681  // by reorganizing this code or as part of optimizing the SIInsertWaitcnt
1682  // pass to track cache invalidate and write-back instructions.
1683 
1684  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1685  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1686  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1687  Changed |= CC->insertAcquire(MI, MOI.getScope(),
1688  MOI.getOrderingAddrSpace(),
1689  Position::BEFORE);
1690 
1691  return Changed;
1692  }
1693 
1694  return Changed;
1695 }
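
Editorial sketch (not part of the source): an acq_rel thread fence becomes an ATOMIC_FENCE pseudo instruction, which expandAtomicFence above surrounds with the release and acquire code and then queues for deletion in removeAtomicPseudoMIs(). The mapping from std::atomic_thread_fence to the ATOMIC_FENCE pseudo is assumed from the assert at the top of the function.

// Illustrative only.
#include <atomic>

void full_fence() {
  // acq_rel -> insertRelease(Position::BEFORE) + insertAcquire(Position::BEFORE),
  // then the fence pseudo itself is erased.
  std::atomic_thread_fence(std::memory_order_acq_rel);
}
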
1696 
1697 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1698  MachineBasicBlock::iterator &MI) {
1699  assert(MI->mayLoad() && MI->mayStore());
1700 
1701  bool Changed = false;
1702 
1703  if (MOI.isAtomic()) {
1704  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1705  MOI.getOrdering() == AtomicOrdering::Acquire ||
1706  MOI.getOrdering() == AtomicOrdering::Release ||
1707  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1708  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1709  Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
1710  MOI.getInstrAddrSpace());
1711  }
1712 
1713  if (MOI.getOrdering() == AtomicOrdering::Release ||
1714  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1715  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1716  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1717  Changed |= CC->insertRelease(MI, MOI.getScope(),
1718  MOI.getOrderingAddrSpace(),
1719  MOI.getIsCrossAddressSpaceOrdering(),
1720  Position::BEFORE);
1721 
1722  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1723  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1724  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1725  MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1726  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1727  Changed |= CC->insertWait(MI, MOI.getScope(),
1728  MOI.getInstrAddrSpace(),
1729  isAtomicRet(*MI) ? SIMemOp::LOAD :
1730  SIMemOp::STORE,
1731  MOI.getIsCrossAddressSpaceOrdering(),
1732  Position::AFTER);
1733  Changed |= CC->insertAcquire(MI, MOI.getScope(),
1734  MOI.getOrderingAddrSpace(),
1735  Position::AFTER);
1736  }
1737 
1738  return Changed;
1739  }
1740 
1741  return Changed;
1742 }
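
Editorial sketch (not part of the source): a compare-exchange with distinct success and failure orderings shows why expandAtomicCmpxchgOrRmw above consults both getOrdering() and getFailureOrdering(), and why isAtomicRet() selects LOAD versus STORE for the trailing wait (a returning atomic must wait for its loaded value).

// Illustrative only.
#include <atomic>

bool try_claim(std::atomic<int> &Lock) {
  int Expected = 0;
  // success = acq_rel, failure = acquire: both orderings feed the
  // release-before and acquire-after decisions above.
  return Lock.compare_exchange_strong(Expected, 1,
                                      std::memory_order_acq_rel,
                                      std::memory_order_acquire);
}
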
1743 
1744 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1745  bool Changed = false;
1746 
1747  SIMemOpAccess MOA(MF);
1748  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1749 
1750  for (auto &MBB : MF) {
1751  for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1752 
1753  // Unbundle instructions after the post-RA scheduler.
1754  if (MI->isBundle() && MI->mayLoadOrStore()) {
1755  MachineBasicBlock::instr_iterator II(MI->getIterator());
1756  for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
1757  I != E && I->isBundledWithPred(); ++I) {
1758  I->unbundleFromPred();
1759  for (MachineOperand &MO : I->operands())
1760  if (MO.isReg())
1761  MO.setIsInternalRead(false);
1762  }
1763 
1764  MI->eraseFromParent();
1765  MI = II->getIterator();
1766  }
1767 
1768  if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1769  continue;
1770 
1771  if (const auto &MOI = MOA.getLoadInfo(MI))
1772  Changed |= expandLoad(MOI.getValue(), MI);
1773  else if (const auto &MOI = MOA.getStoreInfo(MI))
1774  Changed |= expandStore(MOI.getValue(), MI);
1775  else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1776  Changed |= expandAtomicFence(MOI.getValue(), MI);
1777  else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1778  Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1779  }
1780  }
1781 
1782  Changed |= removeAtomicPseudoMIs();
1783  return Changed;
1784 }
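
Editorial note (an assumption about the test setup, not stated in this file): because the pass is registered under the "si-memory-legalizer" name via DEBUG_TYPE, it can typically be run in isolation on MIR input with something like llc -march=amdgcn -run-pass=si-memory-legalizer, which is how the memory-legalizer MIR tests exercise the expansion paths above.
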
1785 
1786 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1787 
1788 char SIMemoryLegalizer::ID = 0;
1789 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1790 
1791 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1792  return new SIMemoryLegalizer();
1793 }