LLVM  15.0.0git
SIMemoryLegalizer.cpp
Go to the documentation of this file.
1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Memory legalizer - implements memory model. More information can be
11 /// found here:
12 /// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
18 #include "GCNSubtarget.h"
20 #include "llvm/ADT/BitmaskEnum.h"
23 #include "llvm/IR/DiagnosticInfo.h"
26 
27 using namespace llvm;
28 using namespace llvm::AMDGPU;
29 
30 #define DEBUG_TYPE "si-memory-legalizer"
31 #define PASS_NAME "SI Memory Legalizer"
32 
34  "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
35  cl::desc("Use this to skip inserting cache invalidating instructions."));
36 
37 namespace {
38 
40 
41 /// Memory operation flags. Can be ORed together.
42 enum class SIMemOp {
43  NONE = 0u,
44  LOAD = 1u << 0,
45  STORE = 1u << 1,
46  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
47 };
48 
49 /// Position to insert a new instruction relative to an existing
50 /// instruction.
51 enum class Position {
52  BEFORE,
53  AFTER
54 };
55 
56 /// The atomic synchronization scopes supported by the AMDGPU target.
57 enum class SIAtomicScope {
58  NONE,
59  SINGLETHREAD,
60  WAVEFRONT,
61  WORKGROUP,
62  AGENT,
63  SYSTEM
64 };
65 
66 /// The distinct address spaces supported by the AMDGPU target for
67 /// atomic memory operation. Can be ORed together.
68 enum class SIAtomicAddrSpace {
69  NONE = 0u,
70  GLOBAL = 1u << 0,
71  LDS = 1u << 1,
72  SCRATCH = 1u << 2,
73  GDS = 1u << 3,
74  OTHER = 1u << 4,
75 
76  /// The address spaces that can be accessed by a FLAT instruction.
77  FLAT = GLOBAL | LDS | SCRATCH,
78 
79  /// The address spaces that support atomic instructions.
80  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
81 
82  /// All address spaces.
83  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
84 
85  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
86 };
87 
88 class SIMemOpInfo final {
89 private:
90 
91  friend class SIMemOpAccess;
92 
94  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
95  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
96  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
97  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
98  bool IsCrossAddressSpaceOrdering = false;
99  bool IsVolatile = false;
100  bool IsNonTemporal = false;
101 
103  SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106  bool IsCrossAddressSpaceOrdering = true,
107  AtomicOrdering FailureOrdering =
109  bool IsVolatile = false,
110  bool IsNonTemporal = false)
111  : Ordering(Ordering), FailureOrdering(FailureOrdering),
112  Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113  InstrAddrSpace(InstrAddrSpace),
114  IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
116  IsNonTemporal(IsNonTemporal) {
117 
118  if (Ordering == AtomicOrdering::NotAtomic) {
120  OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
121  !IsCrossAddressSpaceOrdering &&
122  FailureOrdering == AtomicOrdering::NotAtomic);
123  return;
124  }
125 
127  (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
129  (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
131 
132  // There is also no cross address space ordering if the ordering
133  // address space is the same as the instruction address space and
134  // only contains a single address space.
135  if ((OrderingAddrSpace == InstrAddrSpace) &&
136  isPowerOf2_32(uint32_t(InstrAddrSpace)))
137  this->IsCrossAddressSpaceOrdering = false;
138 
139  // Limit the scope to the maximum supported by the instruction's address
140  // spaces.
141  if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
143  this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
144  } else if ((InstrAddrSpace &
145  ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
147  this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
148  } else if ((InstrAddrSpace &
149  ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
150  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
151  this->Scope = std::min(Scope, SIAtomicScope::AGENT);
152  }
153  }
154 
155 public:
156  /// \returns Atomic synchronization scope of the machine instruction used to
157  /// create this SIMemOpInfo.
158  SIAtomicScope getScope() const {
159  return Scope;
160  }
161 
162  /// \returns Ordering constraint of the machine instruction used to
163  /// create this SIMemOpInfo.
164  AtomicOrdering getOrdering() const {
165  return Ordering;
166  }
167 
168  /// \returns Failure ordering constraint of the machine instruction used to
169  /// create this SIMemOpInfo.
170  AtomicOrdering getFailureOrdering() const {
171  return FailureOrdering;
172  }
173 
174  /// \returns The address spaces be accessed by the machine
175  /// instruction used to create this SiMemOpInfo.
176  SIAtomicAddrSpace getInstrAddrSpace() const {
177  return InstrAddrSpace;
178  }
179 
180  /// \returns The address spaces that must be ordered by the machine
181  /// instruction used to create this SiMemOpInfo.
182  SIAtomicAddrSpace getOrderingAddrSpace() const {
183  return OrderingAddrSpace;
184  }
185 
186  /// \returns Return true iff memory ordering of operations on
187  /// different address spaces is required.
188  bool getIsCrossAddressSpaceOrdering() const {
189  return IsCrossAddressSpaceOrdering;
190  }
191 
192  /// \returns True if memory access of the machine instruction used to
193  /// create this SIMemOpInfo is volatile, false otherwise.
194  bool isVolatile() const {
195  return IsVolatile;
196  }
197 
198  /// \returns True if memory access of the machine instruction used to
199  /// create this SIMemOpInfo is nontemporal, false otherwise.
200  bool isNonTemporal() const {
201  return IsNonTemporal;
202  }
203 
204  /// \returns True if ordering constraint of the machine instruction used to
205  /// create this SIMemOpInfo is unordered or higher, false otherwise.
206  bool isAtomic() const {
207  return Ordering != AtomicOrdering::NotAtomic;
208  }
209 
210 };
211 
212 class SIMemOpAccess final {
213 private:
214  AMDGPUMachineModuleInfo *MMI = nullptr;
215 
216  /// Reports unsupported message \p Msg for \p MI to LLVM context.
217  void reportUnsupported(const MachineBasicBlock::iterator &MI,
218  const char *Msg) const;
219 
220  /// Inspects the target synchronization scope \p SSID and determines
221  /// the SI atomic scope it corresponds to, the address spaces it
222  /// covers, and whether the memory ordering applies between address
223  /// spaces.
225  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
226 
227  /// \return Return a bit set of the address spaces accessed by \p AS.
228  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
229 
230  /// \returns Info constructed from \p MI, which has at least machine memory
231  /// operand.
232  Optional<SIMemOpInfo> constructFromMIWithMMO(
233  const MachineBasicBlock::iterator &MI) const;
234 
235 public:
236  /// Construct class to support accessing the machine memory operands
237  /// of instructions in the machine function \p MF.
238  SIMemOpAccess(MachineFunction &MF);
239 
240  /// \returns Load info if \p MI is a load operation, "None" otherwise.
242  const MachineBasicBlock::iterator &MI) const;
243 
244  /// \returns Store info if \p MI is a store operation, "None" otherwise.
245  Optional<SIMemOpInfo> getStoreInfo(
246  const MachineBasicBlock::iterator &MI) const;
247 
248  /// \returns Atomic fence info if \p MI is an atomic fence operation,
249  /// "None" otherwise.
250  Optional<SIMemOpInfo> getAtomicFenceInfo(
251  const MachineBasicBlock::iterator &MI) const;
252 
253  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
254  /// rmw operation, "None" otherwise.
255  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
256  const MachineBasicBlock::iterator &MI) const;
257 };
258 
259 class SICacheControl {
260 protected:
261 
262  /// AMDGPU subtarget info.
263  const GCNSubtarget &ST;
264 
265  /// Instruction info.
266  const SIInstrInfo *TII = nullptr;
267 
268  IsaVersion IV;
269 
270  /// Whether to insert cache invalidating instructions.
271  bool InsertCacheInv;
272 
273  SICacheControl(const GCNSubtarget &ST);
274 
275  /// Sets named bit \p BitName to "true" if present in instruction \p MI.
276  /// \returns Returns true if \p MI is modified, false otherwise.
277  bool enableNamedBit(const MachineBasicBlock::iterator MI,
278  AMDGPU::CPol::CPol Bit) const;
279 
280 public:
281 
282  /// Create a cache control for the subtarget \p ST.
283  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
284 
285  /// Update \p MI memory load instruction to bypass any caches up to
286  /// the \p Scope memory scope for address spaces \p
287  /// AddrSpace. Return true iff the instruction was modified.
288  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
289  SIAtomicScope Scope,
290  SIAtomicAddrSpace AddrSpace) const = 0;
291 
292  /// Update \p MI memory store instruction to bypass any caches up to
293  /// the \p Scope memory scope for address spaces \p
294  /// AddrSpace. Return true iff the instruction was modified.
295  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
296  SIAtomicScope Scope,
297  SIAtomicAddrSpace AddrSpace) const = 0;
298 
299  /// Update \p MI memory read-modify-write instruction to bypass any caches up
300  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
301  /// iff the instruction was modified.
302  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
303  SIAtomicScope Scope,
304  SIAtomicAddrSpace AddrSpace) const = 0;
305 
306  /// Update \p MI memory instruction of kind \p Op associated with address
307  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
308  /// true iff the instruction was modified.
309  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
310  SIAtomicAddrSpace AddrSpace,
311  SIMemOp Op, bool IsVolatile,
312  bool IsNonTemporal) const = 0;
313 
314  /// Inserts any necessary instructions at position \p Pos relative
315  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
316  /// \p Op associated with address spaces \p AddrSpace have completed. Used
317  /// between memory instructions to enforce the order they become visible as
318  /// observed by other memory instructions executing in memory scope \p Scope.
319  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
320  /// address spaces. Returns true iff any instructions inserted.
321  virtual bool insertWait(MachineBasicBlock::iterator &MI,
322  SIAtomicScope Scope,
323  SIAtomicAddrSpace AddrSpace,
324  SIMemOp Op,
325  bool IsCrossAddrSpaceOrdering,
326  Position Pos) const = 0;
327 
328  /// Inserts any necessary instructions at position \p Pos relative to
329  /// instruction \p MI to ensure any subsequent memory instructions of this
330  /// thread with address spaces \p AddrSpace will observe the previous memory
331  /// operations by any thread for memory scopes up to memory scope \p Scope .
332  /// Returns true iff any instructions inserted.
333  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
334  SIAtomicScope Scope,
335  SIAtomicAddrSpace AddrSpace,
336  Position Pos) const = 0;
337 
338  /// Inserts any necessary instructions at position \p Pos relative to
339  /// instruction \p MI to ensure previous memory instructions by this thread
340  /// with address spaces \p AddrSpace have completed and can be observed by
341  /// subsequent memory instructions by any thread executing in memory scope \p
342  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
343  /// between address spaces. Returns true iff any instructions inserted.
344  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
345  SIAtomicScope Scope,
346  SIAtomicAddrSpace AddrSpace,
347  bool IsCrossAddrSpaceOrdering,
348  Position Pos) const = 0;
349 
350  /// Virtual destructor to allow derivations to be deleted.
351  virtual ~SICacheControl() = default;
352 
353 };
354 
355 class SIGfx6CacheControl : public SICacheControl {
356 protected:
357 
358  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
359  /// is modified, false otherwise.
360  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
361  return enableNamedBit(MI, AMDGPU::CPol::GLC);
362  }
363 
364  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
365  /// is modified, false otherwise.
366  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
367  return enableNamedBit(MI, AMDGPU::CPol::SLC);
368  }
369 
370 public:
371 
372  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
373 
374  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
375  SIAtomicScope Scope,
376  SIAtomicAddrSpace AddrSpace) const override;
377 
378  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
379  SIAtomicScope Scope,
380  SIAtomicAddrSpace AddrSpace) const override;
381 
382  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
383  SIAtomicScope Scope,
384  SIAtomicAddrSpace AddrSpace) const override;
385 
386  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
387  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
388  bool IsVolatile,
389  bool IsNonTemporal) const override;
390 
391  bool insertWait(MachineBasicBlock::iterator &MI,
392  SIAtomicScope Scope,
393  SIAtomicAddrSpace AddrSpace,
394  SIMemOp Op,
395  bool IsCrossAddrSpaceOrdering,
396  Position Pos) const override;
397 
398  bool insertAcquire(MachineBasicBlock::iterator &MI,
399  SIAtomicScope Scope,
400  SIAtomicAddrSpace AddrSpace,
401  Position Pos) const override;
402 
403  bool insertRelease(MachineBasicBlock::iterator &MI,
404  SIAtomicScope Scope,
405  SIAtomicAddrSpace AddrSpace,
406  bool IsCrossAddrSpaceOrdering,
407  Position Pos) const override;
408 };
409 
410 class SIGfx7CacheControl : public SIGfx6CacheControl {
411 public:
412 
413  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
414 
415  bool insertAcquire(MachineBasicBlock::iterator &MI,
416  SIAtomicScope Scope,
417  SIAtomicAddrSpace AddrSpace,
418  Position Pos) const override;
419 
420 };
421 
422 class SIGfx90ACacheControl : public SIGfx7CacheControl {
423 public:
424 
425  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
426 
427  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
428  SIAtomicScope Scope,
429  SIAtomicAddrSpace AddrSpace) const override;
430 
431  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
432  SIAtomicScope Scope,
433  SIAtomicAddrSpace AddrSpace) const override;
434 
435  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
436  SIAtomicScope Scope,
437  SIAtomicAddrSpace AddrSpace) const override;
438 
439  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
440  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
441  bool IsVolatile,
442  bool IsNonTemporal) const override;
443 
444  bool insertWait(MachineBasicBlock::iterator &MI,
445  SIAtomicScope Scope,
446  SIAtomicAddrSpace AddrSpace,
447  SIMemOp Op,
448  bool IsCrossAddrSpaceOrdering,
449  Position Pos) const override;
450 
451  bool insertAcquire(MachineBasicBlock::iterator &MI,
452  SIAtomicScope Scope,
453  SIAtomicAddrSpace AddrSpace,
454  Position Pos) const override;
455 
456  bool insertRelease(MachineBasicBlock::iterator &MI,
457  SIAtomicScope Scope,
458  SIAtomicAddrSpace AddrSpace,
459  bool IsCrossAddrSpaceOrdering,
460  Position Pos) const override;
461 };
462 
463 class SIGfx940CacheControl : public SIGfx90ACacheControl {
464 protected:
465 
466  /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
467  /// is modified, false otherwise.
468  bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
469  return enableNamedBit(MI, AMDGPU::CPol::SC0);
470  }
471 
472  /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
473  /// is modified, false otherwise.
474  bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
475  return enableNamedBit(MI, AMDGPU::CPol::SC1);
476  }
477 
478  /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
479  /// is modified, false otherwise.
480  bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
481  return enableNamedBit(MI, AMDGPU::CPol::NT);
482  }
483 
484 public:
485 
486  SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
487 
488  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
489  SIAtomicScope Scope,
490  SIAtomicAddrSpace AddrSpace) const override;
491 
492  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
493  SIAtomicScope Scope,
494  SIAtomicAddrSpace AddrSpace) const override;
495 
496  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
497  SIAtomicScope Scope,
498  SIAtomicAddrSpace AddrSpace) const override;
499 
500  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
501  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
502  bool IsVolatile,
503  bool IsNonTemporal) const override;
504 
505  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
506  SIAtomicAddrSpace AddrSpace, Position Pos) const override;
507 
508  bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
509  SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
510  Position Pos) const override;
511 };
512 
513 class SIGfx10CacheControl : public SIGfx7CacheControl {
514 protected:
515 
516  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
517  /// is modified, false otherwise.
518  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
519  return enableNamedBit(MI, AMDGPU::CPol::DLC);
520  }
521 
522 public:
523 
524  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
525 
526  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
527  SIAtomicScope Scope,
528  SIAtomicAddrSpace AddrSpace) const override;
529 
530  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
531  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
532  bool IsVolatile,
533  bool IsNonTemporal) const override;
534 
535  bool insertWait(MachineBasicBlock::iterator &MI,
536  SIAtomicScope Scope,
537  SIAtomicAddrSpace AddrSpace,
538  SIMemOp Op,
539  bool IsCrossAddrSpaceOrdering,
540  Position Pos) const override;
541 
542  bool insertAcquire(MachineBasicBlock::iterator &MI,
543  SIAtomicScope Scope,
544  SIAtomicAddrSpace AddrSpace,
545  Position Pos) const override;
546 };
547 
548 class SIGfx11CacheControl : public SIGfx10CacheControl {
549 public:
550  SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
551 
552  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
553  SIAtomicScope Scope,
554  SIAtomicAddrSpace AddrSpace) const override;
555 
556  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
557  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
558  bool IsVolatile,
559  bool IsNonTemporal) const override;
560 };
561 
562 class SIMemoryLegalizer final : public MachineFunctionPass {
563 private:
564 
565  /// Cache Control.
566  std::unique_ptr<SICacheControl> CC = nullptr;
567 
568  /// List of atomic pseudo instructions.
569  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
570 
571  /// Return true iff instruction \p MI is a atomic instruction that
572  /// returns a result.
573  bool isAtomicRet(const MachineInstr &MI) const {
575  }
576 
577  /// Removes all processed atomic pseudo instructions from the current
578  /// function. Returns true if current function is modified, false otherwise.
579  bool removeAtomicPseudoMIs();
580 
581  /// Expands load operation \p MI. Returns true if instructions are
582  /// added/deleted or \p MI is modified, false otherwise.
583  bool expandLoad(const SIMemOpInfo &MOI,
585  /// Expands store operation \p MI. Returns true if instructions are
586  /// added/deleted or \p MI is modified, false otherwise.
587  bool expandStore(const SIMemOpInfo &MOI,
589  /// Expands atomic fence operation \p MI. Returns true if
590  /// instructions are added/deleted or \p MI is modified, false otherwise.
591  bool expandAtomicFence(const SIMemOpInfo &MOI,
593  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
594  /// instructions are added/deleted or \p MI is modified, false otherwise.
595  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
597 
598 public:
599  static char ID;
600 
601  SIMemoryLegalizer() : MachineFunctionPass(ID) {}
602 
603  void getAnalysisUsage(AnalysisUsage &AU) const override {
604  AU.setPreservesCFG();
606  }
607 
608  StringRef getPassName() const override {
609  return PASS_NAME;
610  }
611 
612  bool runOnMachineFunction(MachineFunction &MF) override;
613 };
614 
615 } // end namespace anonymous
616 
617 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
618  const char *Msg) const {
619  const Function &Func = MI->getParent()->getParent()->getFunction();
620  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
621  Func.getContext().diagnose(Diag);
622 }
623 
625 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
626  SIAtomicAddrSpace InstrAddrSpace) const {
627  if (SSID == SyncScope::System)
628  return std::make_tuple(SIAtomicScope::SYSTEM,
629  SIAtomicAddrSpace::ATOMIC,
630  true);
631  if (SSID == MMI->getAgentSSID())
632  return std::make_tuple(SIAtomicScope::AGENT,
633  SIAtomicAddrSpace::ATOMIC,
634  true);
635  if (SSID == MMI->getWorkgroupSSID())
636  return std::make_tuple(SIAtomicScope::WORKGROUP,
637  SIAtomicAddrSpace::ATOMIC,
638  true);
639  if (SSID == MMI->getWavefrontSSID())
640  return std::make_tuple(SIAtomicScope::WAVEFRONT,
641  SIAtomicAddrSpace::ATOMIC,
642  true);
643  if (SSID == SyncScope::SingleThread)
644  return std::make_tuple(SIAtomicScope::SINGLETHREAD,
645  SIAtomicAddrSpace::ATOMIC,
646  true);
647  if (SSID == MMI->getSystemOneAddressSpaceSSID())
648  return std::make_tuple(SIAtomicScope::SYSTEM,
649  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
650  false);
651  if (SSID == MMI->getAgentOneAddressSpaceSSID())
652  return std::make_tuple(SIAtomicScope::AGENT,
653  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
654  false);
655  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
656  return std::make_tuple(SIAtomicScope::WORKGROUP,
657  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
658  false);
659  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
660  return std::make_tuple(SIAtomicScope::WAVEFRONT,
661  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
662  false);
663  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
664  return std::make_tuple(SIAtomicScope::SINGLETHREAD,
665  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
666  false);
667  return None;
668 }
669 
670 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
671  if (AS == AMDGPUAS::FLAT_ADDRESS)
673  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
675  if (AS == AMDGPUAS::LOCAL_ADDRESS)
676  return SIAtomicAddrSpace::LDS;
677  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
678  return SIAtomicAddrSpace::SCRATCH;
679  if (AS == AMDGPUAS::REGION_ADDRESS)
680  return SIAtomicAddrSpace::GDS;
681 
682  return SIAtomicAddrSpace::OTHER;
683 }
684 
685 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
687 }
688 
689 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
690  const MachineBasicBlock::iterator &MI) const {
691  assert(MI->getNumMemOperands() > 0);
692 
695  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
696  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
697  bool IsNonTemporal = true;
698  bool IsVolatile = false;
699 
700  // Validator should check whether or not MMOs cover the entire set of
701  // locations accessed by the memory instruction.
702  for (const auto &MMO : MI->memoperands()) {
703  IsNonTemporal &= MMO->isNonTemporal();
704  IsVolatile |= MMO->isVolatile();
705  InstrAddrSpace |=
706  toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
707  AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
708  if (OpOrdering != AtomicOrdering::NotAtomic) {
709  const auto &IsSyncScopeInclusion =
710  MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
711  if (!IsSyncScopeInclusion) {
712  reportUnsupported(MI,
713  "Unsupported non-inclusive atomic synchronization scope");
714  return None;
715  }
716 
717  SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
718  Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
719  assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
720  MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
721  FailureOrdering =
722  getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
723  }
724  }
725 
726  SIAtomicScope Scope = SIAtomicScope::NONE;
727  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
728  bool IsCrossAddressSpaceOrdering = false;
729  if (Ordering != AtomicOrdering::NotAtomic) {
730  auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
731  if (!ScopeOrNone) {
732  reportUnsupported(MI, "Unsupported atomic synchronization scope");
733  return None;
734  }
735  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
736  *ScopeOrNone;
737  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
738  ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
739  ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
740  reportUnsupported(MI, "Unsupported atomic address space");
741  return None;
742  }
743  }
744  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
745  IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
746  IsNonTemporal);
747 }
748 
750  const MachineBasicBlock::iterator &MI) const {
751  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
752 
753  if (!(MI->mayLoad() && !MI->mayStore()))
754  return None;
755 
756  // Be conservative if there are no memory operands.
757  if (MI->getNumMemOperands() == 0)
758  return SIMemOpInfo();
759 
760  return constructFromMIWithMMO(MI);
761 }
762 
763 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
764  const MachineBasicBlock::iterator &MI) const {
765  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
766 
767  if (!(!MI->mayLoad() && MI->mayStore()))
768  return None;
769 
770  // Be conservative if there are no memory operands.
771  if (MI->getNumMemOperands() == 0)
772  return SIMemOpInfo();
773 
774  return constructFromMIWithMMO(MI);
775 }
776 
777 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
778  const MachineBasicBlock::iterator &MI) const {
779  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
780 
781  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
782  return None;
783 
784  AtomicOrdering Ordering =
785  static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
786 
787  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
788  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
789  if (!ScopeOrNone) {
790  reportUnsupported(MI, "Unsupported atomic synchronization scope");
791  return None;
792  }
793 
794  SIAtomicScope Scope = SIAtomicScope::NONE;
795  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
796  bool IsCrossAddressSpaceOrdering = false;
797  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
798  *ScopeOrNone;
799 
800  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
801  ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
802  reportUnsupported(MI, "Unsupported atomic address space");
803  return None;
804  }
805 
806  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
807  IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
808 }
809 
810 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
811  const MachineBasicBlock::iterator &MI) const {
812  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
813 
814  if (!(MI->mayLoad() && MI->mayStore()))
815  return None;
816 
817  // Be conservative if there are no memory operands.
818  if (MI->getNumMemOperands() == 0)
819  return SIMemOpInfo();
820 
821  return constructFromMIWithMMO(MI);
822 }
823 
824 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
825  TII = ST.getInstrInfo();
826  IV = getIsaVersion(ST.getCPU());
827  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
828 }
829 
830 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
831  AMDGPU::CPol::CPol Bit) const {
832  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
833  if (!CPol)
834  return false;
835 
836  CPol->setImm(CPol->getImm() | Bit);
837  return true;
838 }
839 
840 /* static */
841 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
842  GCNSubtarget::Generation Generation = ST.getGeneration();
843  if (ST.hasGFX940Insts())
844  return std::make_unique<SIGfx940CacheControl>(ST);
845  if (ST.hasGFX90AInsts())
846  return std::make_unique<SIGfx90ACacheControl>(ST);
847  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
848  return std::make_unique<SIGfx6CacheControl>(ST);
849  if (Generation < AMDGPUSubtarget::GFX10)
850  return std::make_unique<SIGfx7CacheControl>(ST);
851  if (Generation < AMDGPUSubtarget::GFX11)
852  return std::make_unique<SIGfx10CacheControl>(ST);
853  return std::make_unique<SIGfx11CacheControl>(ST);
854 }
855 
856 bool SIGfx6CacheControl::enableLoadCacheBypass(
858  SIAtomicScope Scope,
859  SIAtomicAddrSpace AddrSpace) const {
860  assert(MI->mayLoad() && !MI->mayStore());
861  bool Changed = false;
862 
863  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
864  switch (Scope) {
865  case SIAtomicScope::SYSTEM:
866  case SIAtomicScope::AGENT:
867  // Set L1 cache policy to MISS_EVICT.
868  // Note: there is no L2 cache bypass policy at the ISA level.
869  Changed |= enableGLCBit(MI);
870  break;
871  case SIAtomicScope::WORKGROUP:
872  case SIAtomicScope::WAVEFRONT:
873  case SIAtomicScope::SINGLETHREAD:
874  // No cache to bypass.
875  break;
876  default:
877  llvm_unreachable("Unsupported synchronization scope");
878  }
879  }
880 
881  /// The scratch address space does not need the global memory caches
882  /// to be bypassed as all memory operations by the same thread are
883  /// sequentially consistent, and no other thread can access scratch
884  /// memory.
885 
886  /// Other address spaces do not have a cache.
887 
888  return Changed;
889 }
890 
891 bool SIGfx6CacheControl::enableStoreCacheBypass(
893  SIAtomicScope Scope,
894  SIAtomicAddrSpace AddrSpace) const {
895  assert(!MI->mayLoad() && MI->mayStore());
896  bool Changed = false;
897 
898  /// The L1 cache is write through so does not need to be bypassed. There is no
899  /// bypass control for the L2 cache at the isa level.
900 
901  return Changed;
902 }
903 
904 bool SIGfx6CacheControl::enableRMWCacheBypass(
906  SIAtomicScope Scope,
907  SIAtomicAddrSpace AddrSpace) const {
908  assert(MI->mayLoad() && MI->mayStore());
909  bool Changed = false;
910 
911  /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
912  /// bypassed, and the GLC bit is instead used to indicate if they are
913  /// return or no-return.
914  /// Note: there is no L2 cache coherent bypass control at the ISA level.
915 
916  return Changed;
917 }
918 
919 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
920  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
921  bool IsVolatile, bool IsNonTemporal) const {
922  // Only handle load and store, not atomic read-modify-write insructions. The
923  // latter use glc to indicate if the atomic returns a result and so must not
924  // be used for cache control.
925  assert(MI->mayLoad() ^ MI->mayStore());
926 
927  // Only update load and store, not LLVM IR atomic read-modify-write
928  // instructions. The latter are always marked as volatile so cannot sensibly
929  // handle it as do not want to pessimize all atomics. Also they do not support
930  // the nontemporal attribute.
932 
933  bool Changed = false;
934 
935  if (IsVolatile) {
936  // Set L1 cache policy to be MISS_EVICT for load instructions
937  // and MISS_LRU for store instructions.
938  // Note: there is no L2 cache bypass policy at the ISA level.
939  if (Op == SIMemOp::LOAD)
940  Changed |= enableGLCBit(MI);
941 
942  // Ensure operation has completed at system scope to cause all volatile
943  // operations to be visible outside the program in a global order. Do not
944  // request cross address space as only the global address space can be
945  // observable outside the program, so no need to cause a waitcnt for LDS
946  // address space operations.
947  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
948  Position::AFTER);
949 
950  return Changed;
951  }
952 
953  if (IsNonTemporal) {
954  // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
955  // for both loads and stores, and the L2 cache policy to STREAM.
956  Changed |= enableGLCBit(MI);
957  Changed |= enableSLCBit(MI);
958  return Changed;
959  }
960 
961  return Changed;
962 }
963 
964 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
965  SIAtomicScope Scope,
966  SIAtomicAddrSpace AddrSpace,
967  SIMemOp Op,
968  bool IsCrossAddrSpaceOrdering,
969  Position Pos) const {
970  bool Changed = false;
971 
972  MachineBasicBlock &MBB = *MI->getParent();
973  DebugLoc DL = MI->getDebugLoc();
974 
975  if (Pos == Position::AFTER)
976  ++MI;
977 
978  bool VMCnt = false;
979  bool LGKMCnt = false;
980 
981  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
983  switch (Scope) {
984  case SIAtomicScope::SYSTEM:
985  case SIAtomicScope::AGENT:
986  VMCnt |= true;
987  break;
988  case SIAtomicScope::WORKGROUP:
989  case SIAtomicScope::WAVEFRONT:
990  case SIAtomicScope::SINGLETHREAD:
991  // The L1 cache keeps all memory operations in order for
992  // wavefronts in the same work-group.
993  break;
994  default:
995  llvm_unreachable("Unsupported synchronization scope");
996  }
997  }
998 
999  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1000  switch (Scope) {
1001  case SIAtomicScope::SYSTEM:
1002  case SIAtomicScope::AGENT:
1003  case SIAtomicScope::WORKGROUP:
1004  // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1005  // not needed as LDS operations for all waves are executed in a total
1006  // global ordering as observed by all waves. Required if also
1007  // synchronizing with global/GDS memory as LDS operations could be
1008  // reordered with respect to later global/GDS memory operations of the
1009  // same wave.
1010  LGKMCnt |= IsCrossAddrSpaceOrdering;
1011  break;
1012  case SIAtomicScope::WAVEFRONT:
1013  case SIAtomicScope::SINGLETHREAD:
1014  // The LDS keeps all memory operations in order for
1015  // the same wavefront.
1016  break;
1017  default:
1018  llvm_unreachable("Unsupported synchronization scope");
1019  }
1020  }
1021 
1022  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1023  switch (Scope) {
1024  case SIAtomicScope::SYSTEM:
1025  case SIAtomicScope::AGENT:
1026  // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1027  // is not needed as GDS operations for all waves are executed in a total
1028  // global ordering as observed by all waves. Required if also
1029  // synchronizing with global/LDS memory as GDS operations could be
1030  // reordered with respect to later global/LDS memory operations of the
1031  // same wave.
1032  LGKMCnt |= IsCrossAddrSpaceOrdering;
1033  break;
1034  case SIAtomicScope::WORKGROUP:
1035  case SIAtomicScope::WAVEFRONT:
1036  case SIAtomicScope::SINGLETHREAD:
1037  // The GDS keeps all memory operations in order for
1038  // the same work-group.
1039  break;
1040  default:
1041  llvm_unreachable("Unsupported synchronization scope");
1042  }
1043  }
1044 
1045  if (VMCnt || LGKMCnt) {
1046  unsigned WaitCntImmediate =
1048  VMCnt ? 0 : getVmcntBitMask(IV),
1050  LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1051  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1052  Changed = true;
1053  }
1054 
1055  if (Pos == Position::AFTER)
1056  --MI;
1057 
1058  return Changed;
1059 }
1060 
1061 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1062  SIAtomicScope Scope,
1063  SIAtomicAddrSpace AddrSpace,
1064  Position Pos) const {
1065  if (!InsertCacheInv)
1066  return false;
1067 
1068  bool Changed = false;
1069 
1070  MachineBasicBlock &MBB = *MI->getParent();
1071  DebugLoc DL = MI->getDebugLoc();
1072 
1073  if (Pos == Position::AFTER)
1074  ++MI;
1075 
1076  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1077  switch (Scope) {
1078  case SIAtomicScope::SYSTEM:
1079  case SIAtomicScope::AGENT:
1080  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1081  Changed = true;
1082  break;
1083  case SIAtomicScope::WORKGROUP:
1084  case SIAtomicScope::WAVEFRONT:
1085  case SIAtomicScope::SINGLETHREAD:
1086  // No cache to invalidate.
1087  break;
1088  default:
1089  llvm_unreachable("Unsupported synchronization scope");
1090  }
1091  }
1092 
1093  /// The scratch address space does not need the global memory cache
1094  /// to be flushed as all memory operations by the same thread are
1095  /// sequentially consistent, and no other thread can access scratch
1096  /// memory.
1097 
1098  /// Other address spaces do not have a cache.
1099 
1100  if (Pos == Position::AFTER)
1101  --MI;
1102 
1103  return Changed;
1104 }
1105 
1106 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1107  SIAtomicScope Scope,
1108  SIAtomicAddrSpace AddrSpace,
1109  bool IsCrossAddrSpaceOrdering,
1110  Position Pos) const {
1111  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1112  IsCrossAddrSpaceOrdering, Pos);
1113 }
1114 
1115 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1116  SIAtomicScope Scope,
1117  SIAtomicAddrSpace AddrSpace,
1118  Position Pos) const {
1119  if (!InsertCacheInv)
1120  return false;
1121 
1122  bool Changed = false;
1123 
1124  MachineBasicBlock &MBB = *MI->getParent();
1125  DebugLoc DL = MI->getDebugLoc();
1126 
1127  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1128 
1129  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1130  ? AMDGPU::BUFFER_WBINVL1
1131  : AMDGPU::BUFFER_WBINVL1_VOL;
1132 
1133  if (Pos == Position::AFTER)
1134  ++MI;
1135 
1136  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1137  switch (Scope) {
1138  case SIAtomicScope::SYSTEM:
1139  case SIAtomicScope::AGENT:
1140  BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1141  Changed = true;
1142  break;
1143  case SIAtomicScope::WORKGROUP:
1144  case SIAtomicScope::WAVEFRONT:
1145  case SIAtomicScope::SINGLETHREAD:
1146  // No cache to invalidate.
1147  break;
1148  default:
1149  llvm_unreachable("Unsupported synchronization scope");
1150  }
1151  }
1152 
1153  /// The scratch address space does not need the global memory cache
1154  /// to be flushed as all memory operations by the same thread are
1155  /// sequentially consistent, and no other thread can access scratch
1156  /// memory.
1157 
1158  /// Other address spaces do not have a cache.
1159 
1160  if (Pos == Position::AFTER)
1161  --MI;
1162 
1163  return Changed;
1164 }
1165 
1166 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1168  SIAtomicScope Scope,
1169  SIAtomicAddrSpace AddrSpace) const {
1170  assert(MI->mayLoad() && !MI->mayStore());
1171  bool Changed = false;
1172 
1173  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1174  switch (Scope) {
1175  case SIAtomicScope::SYSTEM:
1176  case SIAtomicScope::AGENT:
1177  // Set the L1 cache policy to MISS_LRU.
1178  // Note: there is no L2 cache bypass policy at the ISA level.
1179  Changed |= enableGLCBit(MI);
1180  break;
1181  case SIAtomicScope::WORKGROUP:
1182  // In threadgroup split mode the waves of a work-group can be executing on
1183  // different CUs. Therefore need to bypass the L1 which is per CU.
1184  // Otherwise in non-threadgroup split mode all waves of a work-group are
1185  // on the same CU, and so the L1 does not need to be bypassed.
1186  if (ST.isTgSplitEnabled())
1187  Changed |= enableGLCBit(MI);
1188  break;
1189  case SIAtomicScope::WAVEFRONT:
1190  case SIAtomicScope::SINGLETHREAD:
1191  // No cache to bypass.
1192  break;
1193  default:
1194  llvm_unreachable("Unsupported synchronization scope");
1195  }
1196  }
1197 
1198  /// The scratch address space does not need the global memory caches
1199  /// to be bypassed as all memory operations by the same thread are
1200  /// sequentially consistent, and no other thread can access scratch
1201  /// memory.
1202 
1203  /// Other address spaces do not have a cache.
1204 
1205  return Changed;
1206 }
1207 
1208 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1210  SIAtomicScope Scope,
1211  SIAtomicAddrSpace AddrSpace) const {
1212  assert(!MI->mayLoad() && MI->mayStore());
1213  bool Changed = false;
1214 
1215  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1216  switch (Scope) {
1217  case SIAtomicScope::SYSTEM:
1218  case SIAtomicScope::AGENT:
1219  /// Do not set glc for store atomic operations as they implicitly write
1220  /// through the L1 cache.
1221  break;
1222  case SIAtomicScope::WORKGROUP:
1223  case SIAtomicScope::WAVEFRONT:
1224  case SIAtomicScope::SINGLETHREAD:
1225  // No cache to bypass. Store atomics implicitly write through the L1
1226  // cache.
1227  break;
1228  default:
1229  llvm_unreachable("Unsupported synchronization scope");
1230  }
1231  }
1232 
1233  /// The scratch address space does not need the global memory caches
1234  /// to be bypassed as all memory operations by the same thread are
1235  /// sequentially consistent, and no other thread can access scratch
1236  /// memory.
1237 
1238  /// Other address spaces do not have a cache.
1239 
1240  return Changed;
1241 }
1242 
1243 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1245  SIAtomicScope Scope,
1246  SIAtomicAddrSpace AddrSpace) const {
1247  assert(MI->mayLoad() && MI->mayStore());
1248  bool Changed = false;
1249 
1250  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1251  switch (Scope) {
1252  case SIAtomicScope::SYSTEM:
1253  case SIAtomicScope::AGENT:
1254  /// Do not set glc for RMW atomic operations as they implicitly bypass
1255  /// the L1 cache, and the glc bit is instead used to indicate if they are
1256  /// return or no-return.
1257  break;
1258  case SIAtomicScope::WORKGROUP:
1259  case SIAtomicScope::WAVEFRONT:
1260  case SIAtomicScope::SINGLETHREAD:
1261  // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1262  break;
1263  default:
1264  llvm_unreachable("Unsupported synchronization scope");
1265  }
1266  }
1267 
1268  return Changed;
1269 }
1270 
1271 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1272  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1273  bool IsVolatile, bool IsNonTemporal) const {
1274  // Only handle load and store, not atomic read-modify-write insructions. The
1275  // latter use glc to indicate if the atomic returns a result and so must not
1276  // be used for cache control.
1277  assert(MI->mayLoad() ^ MI->mayStore());
1278 
1279  // Only update load and store, not LLVM IR atomic read-modify-write
1280  // instructions. The latter are always marked as volatile so cannot sensibly
1281  // handle it as do not want to pessimize all atomics. Also they do not support
1282  // the nontemporal attribute.
1284 
1285  bool Changed = false;
1286 
1287  if (IsVolatile) {
1288  // Set L1 cache policy to be MISS_EVICT for load instructions
1289  // and MISS_LRU for store instructions.
1290  // Note: there is no L2 cache bypass policy at the ISA level.
1291  if (Op == SIMemOp::LOAD)
1292  Changed |= enableGLCBit(MI);
1293 
1294  // Ensure operation has completed at system scope to cause all volatile
1295  // operations to be visible outside the program in a global order. Do not
1296  // request cross address space as only the global address space can be
1297  // observable outside the program, so no need to cause a waitcnt for LDS
1298  // address space operations.
1299  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1300  Position::AFTER);
1301 
1302  return Changed;
1303  }
1304 
1305  if (IsNonTemporal) {
1306  // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1307  // for both loads and stores, and the L2 cache policy to STREAM.
1308  Changed |= enableGLCBit(MI);
1309  Changed |= enableSLCBit(MI);
1310  return Changed;
1311  }
1312 
1313  return Changed;
1314 }
1315 
1316 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1317  SIAtomicScope Scope,
1318  SIAtomicAddrSpace AddrSpace,
1319  SIMemOp Op,
1320  bool IsCrossAddrSpaceOrdering,
1321  Position Pos) const {
1322  if (ST.isTgSplitEnabled()) {
1323  // In threadgroup split mode the waves of a work-group can be executing on
1324  // different CUs. Therefore need to wait for global or GDS memory operations
1325  // to complete to ensure they are visible to waves in the other CUs.
1326  // Otherwise in non-threadgroup split mode all waves of a work-group are on
1327  // the same CU, so no need to wait for global memory as all waves in the
1328  // work-group access the same the L1, nor wait for GDS as access are ordered
1329  // on a CU.
1330  if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1331  SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1332  (Scope == SIAtomicScope::WORKGROUP)) {
1333  // Same as GFX7 using agent scope.
1334  Scope = SIAtomicScope::AGENT;
1335  }
1336  // In threadgroup split mode LDS cannot be allocated so no need to wait for
1337  // LDS memory operations.
1338  AddrSpace &= ~SIAtomicAddrSpace::LDS;
1339  }
1340  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1341  IsCrossAddrSpaceOrdering, Pos);
1342 }
1343 
1344 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1345  SIAtomicScope Scope,
1346  SIAtomicAddrSpace AddrSpace,
1347  Position Pos) const {
1348  if (!InsertCacheInv)
1349  return false;
1350 
1351  bool Changed = false;
1352 
1353  MachineBasicBlock &MBB = *MI->getParent();
1354  DebugLoc DL = MI->getDebugLoc();
1355 
1356  if (Pos == Position::AFTER)
1357  ++MI;
1358 
1359  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1360  switch (Scope) {
1361  case SIAtomicScope::SYSTEM:
1362  // Ensures that following loads will not see stale remote VMEM data or
1363  // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1364  // CC will never be stale due to the local memory probes.
1365  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1366  // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1367  // hardware does not reorder memory operations by the same wave with
1368  // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1369  // remove any cache lines of earlier writes by the same wave and ensures
1370  // later reads by the same wave will refetch the cache lines.
1371  Changed = true;
1372  break;
1373  case SIAtomicScope::AGENT:
1374  // Same as GFX7.
1375  break;
1376  case SIAtomicScope::WORKGROUP:
1377  // In threadgroup split mode the waves of a work-group can be executing on
1378  // different CUs. Therefore need to invalidate the L1 which is per CU.
1379  // Otherwise in non-threadgroup split mode all waves of a work-group are
1380  // on the same CU, and so the L1 does not need to be invalidated.
1381  if (ST.isTgSplitEnabled()) {
1382  // Same as GFX7 using agent scope.
1383  Scope = SIAtomicScope::AGENT;
1384  }
1385  break;
1386  case SIAtomicScope::WAVEFRONT:
1387  case SIAtomicScope::SINGLETHREAD:
1388  // Same as GFX7.
1389  break;
1390  default:
1391  llvm_unreachable("Unsupported synchronization scope");
1392  }
1393  }
1394 
1395  /// The scratch address space does not need the global memory cache
1396  /// to be flushed as all memory operations by the same thread are
1397  /// sequentially consistent, and no other thread can access scratch
1398  /// memory.
1399 
1400  /// Other address spaces do not have a cache.
1401 
1402  if (Pos == Position::AFTER)
1403  --MI;
1404 
1405  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1406 
1407  return Changed;
1408 }
1409 
1410 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1411  SIAtomicScope Scope,
1412  SIAtomicAddrSpace AddrSpace,
1413  bool IsCrossAddrSpaceOrdering,
1414  Position Pos) const {
1415  bool Changed = false;
1416 
1417  MachineBasicBlock &MBB = *MI->getParent();
1418  DebugLoc DL = MI->getDebugLoc();
1419 
1420  if (Pos == Position::AFTER)
1421  ++MI;
1422 
1423  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1424  switch (Scope) {
1425  case SIAtomicScope::SYSTEM:
1426  // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1427  // hardware does not reorder memory operations by the same wave with
1428  // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1429  // to initiate writeback of any dirty cache lines of earlier writes by the
1430  // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1431  // writeback has completed.
1432  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1433  // Set SC bits to indicate system scope.
1435  // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1436  // vmcnt(0)" needed by the "BUFFER_WBL2".
1437  Changed = true;
1438  break;
1439  case SIAtomicScope::AGENT:
1440  case SIAtomicScope::WORKGROUP:
1441  case SIAtomicScope::WAVEFRONT:
1442  case SIAtomicScope::SINGLETHREAD:
1443  // Same as GFX7.
1444  break;
1445  default:
1446  llvm_unreachable("Unsupported synchronization scope");
1447  }
1448  }
1449 
1450  if (Pos == Position::AFTER)
1451  --MI;
1452 
1453  Changed |=
1454  SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1455  IsCrossAddrSpaceOrdering, Pos);
1456 
1457  return Changed;
1458 }
1459 
1460 bool SIGfx940CacheControl::enableLoadCacheBypass(
1461  const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1462  SIAtomicAddrSpace AddrSpace) const {
1463  assert(MI->mayLoad() && !MI->mayStore());
1464  bool Changed = false;
1465 
1466  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1467  switch (Scope) {
1468  case SIAtomicScope::SYSTEM:
1469  // Set SC bits to indicate system scope.
1470  Changed |= enableSC0Bit(MI);
1471  Changed |= enableSC1Bit(MI);
1472  break;
1473  case SIAtomicScope::AGENT:
1474  // Set SC bits to indicate agent scope.
1475  Changed |= enableSC1Bit(MI);
1476  break;
1477  case SIAtomicScope::WORKGROUP:
1478  // In threadgroup split mode the waves of a work-group can be executing on
1479  // different CUs. Therefore need to bypass the L1 which is per CU.
1480  // Otherwise in non-threadgroup split mode all waves of a work-group are
1481  // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1482  // bits to indicate work-group scope will do this automatically.
1483  Changed |= enableSC0Bit(MI);
1484  break;
1485  case SIAtomicScope::WAVEFRONT:
1486  case SIAtomicScope::SINGLETHREAD:
1487  // Leave SC bits unset to indicate wavefront scope.
1488  break;
1489  default:
1490  llvm_unreachable("Unsupported synchronization scope");
1491  }
1492  }
1493 
1494  /// The scratch address space does not need the global memory caches
1495  /// to be bypassed as all memory operations by the same thread are
1496  /// sequentially consistent, and no other thread can access scratch
1497  /// memory.
1498 
1499  /// Other address spaces do not have a cache.
1500 
1501  return Changed;
1502 }
1503 
1504 bool SIGfx940CacheControl::enableStoreCacheBypass(
1506  SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1507  assert(!MI->mayLoad() && MI->mayStore());
1508  bool Changed = false;
1509 
1510  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1511  switch (Scope) {
1512  case SIAtomicScope::SYSTEM:
1513  // Set SC bits to indicate system scope.
1514  Changed |= enableSC0Bit(MI);
1515  Changed |= enableSC1Bit(MI);
1516  break;
1517  case SIAtomicScope::AGENT:
1518  // Set SC bits to indicate agent scope.
1519  Changed |= enableSC1Bit(MI);
1520  break;
1521  case SIAtomicScope::WORKGROUP:
1522  // Set SC bits to indicate workgroup scope.
1523  Changed |= enableSC0Bit(MI);
1524  break;
1525  case SIAtomicScope::WAVEFRONT:
1526  case SIAtomicScope::SINGLETHREAD:
1527  // Leave SC bits unset to indicate wavefront scope.
1528  break;
1529  default:
1530  llvm_unreachable("Unsupported synchronization scope");
1531  }
1532  }
1533 
1534  /// The scratch address space does not need the global memory caches
1535  /// to be bypassed as all memory operations by the same thread are
1536  /// sequentially consistent, and no other thread can access scratch
1537  /// memory.
1538 
1539  /// Other address spaces do not have a cache.
1540 
1541  return Changed;
1542 }
1543 
1544 bool SIGfx940CacheControl::enableRMWCacheBypass(
1545  const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1546  SIAtomicAddrSpace AddrSpace) const {
1547  assert(MI->mayLoad() && MI->mayStore());
1548  bool Changed = false;
1549 
1550  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1551  switch (Scope) {
1552  case SIAtomicScope::SYSTEM:
1553  // Set SC1 bit to indicate system scope.
1554  Changed |= enableSC1Bit(MI);
1555  break;
1556  case SIAtomicScope::AGENT:
1557  case SIAtomicScope::WORKGROUP:
1558  case SIAtomicScope::WAVEFRONT:
1559  case SIAtomicScope::SINGLETHREAD:
1560  // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1561  // to indicate system or agent scope. The SC0 bit is used to indicate if
1562  // they are return or no-return. Leave SC1 bit unset to indicate agent
1563  // scope.
1564  break;
1565  default:
1566  llvm_unreachable("Unsupported synchronization scope");
1567  }
1568  }
1569 
1570  return Changed;
1571 }
1572 
1573 bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1574  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1575  bool IsVolatile, bool IsNonTemporal) const {
1576  // Only handle load and store, not atomic read-modify-write insructions. The
1577  // latter use glc to indicate if the atomic returns a result and so must not
1578  // be used for cache control.
1579  assert(MI->mayLoad() ^ MI->mayStore());
1580 
1581  // Only update load and store, not LLVM IR atomic read-modify-write
1582  // instructions. The latter are always marked as volatile so cannot sensibly
1583  // handle it as do not want to pessimize all atomics. Also they do not support
1584  // the nontemporal attribute.
1586 
1587  bool Changed = false;
1588 
1589  if (IsVolatile) {
1590  // Set SC bits to indicate system scope.
1591  Changed |= enableSC0Bit(MI);
1592  Changed |= enableSC1Bit(MI);
1593 
1594  // Ensure operation has completed at system scope to cause all volatile
1595  // operations to be visible outside the program in a global order. Do not
1596  // request cross address space as only the global address space can be
1597  // observable outside the program, so no need to cause a waitcnt for LDS
1598  // address space operations.
1599  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1600  Position::AFTER);
1601 
1602  return Changed;
1603  }
1604 
1605  if (IsNonTemporal) {
1606  Changed |= enableNTBit(MI);
1607  return Changed;
1608  }
1609 
1610  return Changed;
1611 }
1612 
1613 bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1614  SIAtomicScope Scope,
1615  SIAtomicAddrSpace AddrSpace,
1616  Position Pos) const {
1617  if (!InsertCacheInv)
1618  return false;
1619 
1620  bool Changed = false;
1621 
1622  MachineBasicBlock &MBB = *MI->getParent();
1623  DebugLoc DL = MI->getDebugLoc();
1624 
1625  if (Pos == Position::AFTER)
1626  ++MI;
1627 
1628  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1629  switch (Scope) {
1630  case SIAtomicScope::SYSTEM:
1631  // Ensures that following loads will not see stale remote VMEM data or
1632  // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1633  // CC will never be stale due to the local memory probes.
1634  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1635  // Set SC bits to indicate system scope.
1637  // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1638  // hardware does not reorder memory operations by the same wave with
1639  // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1640  // remove any cache lines of earlier writes by the same wave and ensures
1641  // later reads by the same wave will refetch the cache lines.
1642  Changed = true;
1643  break;
1644  case SIAtomicScope::AGENT:
1645  // Ensures that following loads will not see stale remote date or local
1646  // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1647  // due to the memory probes.
1648  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1649  // Set SC bits to indicate agent scope.
1651  // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1652  // does not reorder memory operations with respect to preceeding buffer
1653  // invalidate. The invalidate is guaranteed to remove any cache lines of
1654  // earlier writes and ensures later writes will refetch the cache lines.
1655  Changed = true;
1656  break;
1657  case SIAtomicScope::WORKGROUP:
1658  // In threadgroup split mode the waves of a work-group can be executing on
1659  // different CUs. Therefore need to invalidate the L1 which is per CU.
1660  // Otherwise in non-threadgroup split mode all waves of a work-group are
1661  // on the same CU, and so the L1 does not need to be invalidated.
1662  if (ST.isTgSplitEnabled()) {
1663  // Ensures L1 is invalidated if in threadgroup split mode. In
1664  // non-threadgroup split mode it is a NOP, but no point generating it in
1665  // that case if know not in that mode.
1666  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1667  // Set SC bits to indicate work-group scope.
1669  // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1670  // does not reorder memory operations with respect to preceeding buffer
1671  // invalidate. The invalidate is guaranteed to remove any cache lines of
1672  // earlier writes and ensures later writes will refetch the cache lines.
1673  Changed = true;
1674  }
1675  break;
1676  case SIAtomicScope::WAVEFRONT:
1677  case SIAtomicScope::SINGLETHREAD:
1678  // Could generate "BUFFER_INV" but it would do nothing as there are no
1679  // caches to invalidate.
1680  break;
1681  default:
1682  llvm_unreachable("Unsupported synchronization scope");
1683  }
1684  }
1685 
1686  /// The scratch address space does not need the global memory cache
1687  /// to be flushed as all memory operations by the same thread are
1688  /// sequentially consistent, and no other thread can access scratch
1689  /// memory.
1690 
1691  /// Other address spaces do not have a cache.
1692 
1693  if (Pos == Position::AFTER)
1694  --MI;
1695 
1696  return Changed;
1697 }
1698 
1699 bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1700  SIAtomicScope Scope,
1701  SIAtomicAddrSpace AddrSpace,
1702  bool IsCrossAddrSpaceOrdering,
1703  Position Pos) const {
1704  bool Changed = false;
1705 
1706  MachineBasicBlock &MBB = *MI->getParent();
1707  DebugLoc DL = MI->getDebugLoc();
1708 
1709  if (Pos == Position::AFTER)
1710  ++MI;
1711 
1712  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1713  switch (Scope) {
1714  case SIAtomicScope::SYSTEM:
1715  // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1716  // hardware does not reorder memory operations by the same wave with
1717  // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1718  // to initiate writeback of any dirty cache lines of earlier writes by the
1719  // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1720  // writeback has completed.
1721  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1722  // Set SC bits to indicate system scope.
1724  // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1725  // SIAtomicScope::SYSTEM, the following insertWait will generate the
1726  // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1727  Changed = true;
1728  break;
1729  case SIAtomicScope::AGENT:
1730  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1731  // Set SC bits to indicate agent scope.
1733 
1734  // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1735  // SIAtomicScope::AGENT, the following insertWait will generate the
1736  // required "S_WAITCNT vmcnt(0)".
1737  Changed = true;
1738  break;
1739  case SIAtomicScope::WORKGROUP:
1740  case SIAtomicScope::WAVEFRONT:
1741  case SIAtomicScope::SINGLETHREAD:
1742  // Do not generate "BUFFER_WBL2" as there are no caches it would
1743  // writeback, and would require an otherwise unnecessary
1744  // "S_WAITCNT vmcnt(0)".
1745  break;
1746  default:
1747  llvm_unreachable("Unsupported synchronization scope");
1748  }
1749  }
1750 
1751  if (Pos == Position::AFTER)
1752  --MI;
1753 
1754  // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1755  // S_WAITCNT needed.
1756  Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1757  IsCrossAddrSpaceOrdering, Pos);
1758 
1759  return Changed;
1760 }
1761 
1762 bool SIGfx10CacheControl::enableLoadCacheBypass(
1764  SIAtomicScope Scope,
1765  SIAtomicAddrSpace AddrSpace) const {
1766  assert(MI->mayLoad() && !MI->mayStore());
1767  bool Changed = false;
1768 
1769  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1770  switch (Scope) {
1771  case SIAtomicScope::SYSTEM:
1772  case SIAtomicScope::AGENT:
1773  // Set the L0 and L1 cache policies to MISS_EVICT.
1774  // Note: there is no L2 cache coherent bypass control at the ISA level.
1775  Changed |= enableGLCBit(MI);
1776  Changed |= enableDLCBit(MI);
1777  break;
1778  case SIAtomicScope::WORKGROUP:
1779  // In WGP mode the waves of a work-group can be executing on either CU of
1780  // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1781  // CU mode all waves of a work-group are on the same CU, and so the L0
1782  // does not need to be bypassed.
1783  if (!ST.isCuModeEnabled())
1784  Changed |= enableGLCBit(MI);
1785  break;
1786  case SIAtomicScope::WAVEFRONT:
1787  case SIAtomicScope::SINGLETHREAD:
1788  // No cache to bypass.
1789  break;
1790  default:
1791  llvm_unreachable("Unsupported synchronization scope");
1792  }
1793  }
1794 
1795  /// The scratch address space does not need the global memory caches
1796  /// to be bypassed as all memory operations by the same thread are
1797  /// sequentially consistent, and no other thread can access scratch
1798  /// memory.
1799 
1800  /// Other address spaces do not have a cache.
1801 
1802  return Changed;
1803 }
1804 
1805 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1806  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1807  bool IsVolatile, bool IsNonTemporal) const {
1808 
1809  // Only handle load and store, not atomic read-modify-write insructions. The
1810  // latter use glc to indicate if the atomic returns a result and so must not
1811  // be used for cache control.
1812  assert(MI->mayLoad() ^ MI->mayStore());
1813 
1814  // Only update load and store, not LLVM IR atomic read-modify-write
1815  // instructions. The latter are always marked as volatile so cannot sensibly
1816  // handle it as do not want to pessimize all atomics. Also they do not support
1817  // the nontemporal attribute.
1819 
1820  bool Changed = false;
1821 
1822  if (IsVolatile) {
1823  // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1824  // and MISS_LRU for store instructions.
1825  // Note: there is no L2 cache coherent bypass control at the ISA level.
1826  if (Op == SIMemOp::LOAD) {
1827  Changed |= enableGLCBit(MI);
1828  Changed |= enableDLCBit(MI);
1829  }
1830 
1831  // Ensure operation has completed at system scope to cause all volatile
1832  // operations to be visible outside the program in a global order. Do not
1833  // request cross address space as only the global address space can be
1834  // observable outside the program, so no need to cause a waitcnt for LDS
1835  // address space operations.
1836  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1837  Position::AFTER);
1838  return Changed;
1839  }
1840 
1841  if (IsNonTemporal) {
1842  // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1843  // and L2 cache policy to STREAM.
1844  // For stores setting both GLC and SLC configures L0 and L1 cache policy
1845  // to MISS_EVICT and the L2 cache policy to STREAM.
1846  if (Op == SIMemOp::STORE)
1847  Changed |= enableGLCBit(MI);
1848  Changed |= enableSLCBit(MI);
1849 
1850  return Changed;
1851  }
1852 
1853  return Changed;
1854 }
1855 
1856 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1857  SIAtomicScope Scope,
1858  SIAtomicAddrSpace AddrSpace,
1859  SIMemOp Op,
1860  bool IsCrossAddrSpaceOrdering,
1861  Position Pos) const {
1862  bool Changed = false;
1863 
1864  MachineBasicBlock &MBB = *MI->getParent();
1865  DebugLoc DL = MI->getDebugLoc();
1866 
1867  if (Pos == Position::AFTER)
1868  ++MI;
1869 
1870  bool VMCnt = false;
1871  bool VSCnt = false;
1872  bool LGKMCnt = false;
1873 
1874  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1876  switch (Scope) {
1877  case SIAtomicScope::SYSTEM:
1878  case SIAtomicScope::AGENT:
1879  if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1880  VMCnt |= true;
1881  if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1882  VSCnt |= true;
1883  break;
1884  case SIAtomicScope::WORKGROUP:
1885  // In WGP mode the waves of a work-group can be executing on either CU of
1886  // the WGP. Therefore need to wait for operations to complete to ensure
1887  // they are visible to waves in the other CU as the L0 is per CU.
1888  // Otherwise in CU mode and all waves of a work-group are on the same CU
1889  // which shares the same L0.
1890  if (!ST.isCuModeEnabled()) {
1891  if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1892  VMCnt |= true;
1893  if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1894  VSCnt |= true;
1895  }
1896  break;
1897  case SIAtomicScope::WAVEFRONT:
1898  case SIAtomicScope::SINGLETHREAD:
1899  // The L0 cache keeps all memory operations in order for
1900  // work-items in the same wavefront.
1901  break;
1902  default:
1903  llvm_unreachable("Unsupported synchronization scope");
1904  }
1905  }
1906 
1907  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1908  switch (Scope) {
1909  case SIAtomicScope::SYSTEM:
1910  case SIAtomicScope::AGENT:
1911  case SIAtomicScope::WORKGROUP:
1912  // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1913  // not needed as LDS operations for all waves are executed in a total
1914  // global ordering as observed by all waves. Required if also
1915  // synchronizing with global/GDS memory as LDS operations could be
1916  // reordered with respect to later global/GDS memory operations of the
1917  // same wave.
1918  LGKMCnt |= IsCrossAddrSpaceOrdering;
1919  break;
1920  case SIAtomicScope::WAVEFRONT:
1921  case SIAtomicScope::SINGLETHREAD:
1922  // The LDS keeps all memory operations in order for
1923  // the same wavefront.
1924  break;
1925  default:
1926  llvm_unreachable("Unsupported synchronization scope");
1927  }
1928  }
1929 
1930  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1931  switch (Scope) {
1932  case SIAtomicScope::SYSTEM:
1933  case SIAtomicScope::AGENT:
1934  // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1935  // is not needed as GDS operations for all waves are executed in a total
1936  // global ordering as observed by all waves. Required if also
1937  // synchronizing with global/LDS memory as GDS operations could be
1938  // reordered with respect to later global/LDS memory operations of the
1939  // same wave.
1940  LGKMCnt |= IsCrossAddrSpaceOrdering;
1941  break;
1942  case SIAtomicScope::WORKGROUP:
1943  case SIAtomicScope::WAVEFRONT:
1944  case SIAtomicScope::SINGLETHREAD:
1945  // The GDS keeps all memory operations in order for
1946  // the same work-group.
1947  break;
1948  default:
1949  llvm_unreachable("Unsupported synchronization scope");
1950  }
1951  }
1952 
1953  if (VMCnt || LGKMCnt) {
1954  unsigned WaitCntImmediate =
1956  VMCnt ? 0 : getVmcntBitMask(IV),
1958  LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1959  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1960  Changed = true;
1961  }
1962 
1963  if (VSCnt) {
1964  BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1965  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1966  .addImm(0);
1967  Changed = true;
1968  }
1969 
1970  if (Pos == Position::AFTER)
1971  --MI;
1972 
1973  return Changed;
1974 }
1975 
1976 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1977  SIAtomicScope Scope,
1978  SIAtomicAddrSpace AddrSpace,
1979  Position Pos) const {
1980  if (!InsertCacheInv)
1981  return false;
1982 
1983  bool Changed = false;
1984 
1985  MachineBasicBlock &MBB = *MI->getParent();
1986  DebugLoc DL = MI->getDebugLoc();
1987 
1988  if (Pos == Position::AFTER)
1989  ++MI;
1990 
1991  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1992  switch (Scope) {
1993  case SIAtomicScope::SYSTEM:
1994  case SIAtomicScope::AGENT:
1995  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1996  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1997  Changed = true;
1998  break;
1999  case SIAtomicScope::WORKGROUP:
2000  // In WGP mode the waves of a work-group can be executing on either CU of
2001  // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2002  // in CU mode and all waves of a work-group are on the same CU, and so the
2003  // L0 does not need to be invalidated.
2004  if (!ST.isCuModeEnabled()) {
2005  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2006  Changed = true;
2007  }
2008  break;
2009  case SIAtomicScope::WAVEFRONT:
2010  case SIAtomicScope::SINGLETHREAD:
2011  // No cache to invalidate.
2012  break;
2013  default:
2014  llvm_unreachable("Unsupported synchronization scope");
2015  }
2016  }
2017 
2018  /// The scratch address space does not need the global memory cache
2019  /// to be flushed as all memory operations by the same thread are
2020  /// sequentially consistent, and no other thread can access scratch
2021  /// memory.
2022 
2023  /// Other address spaces do not have a cache.
2024 
2025  if (Pos == Position::AFTER)
2026  --MI;
2027 
2028  return Changed;
2029 }
2030 
2031 bool SIGfx11CacheControl::enableLoadCacheBypass(
2032  const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2033  SIAtomicAddrSpace AddrSpace) const {
2034  assert(MI->mayLoad() && !MI->mayStore());
2035  bool Changed = false;
2036 
2037  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2038  switch (Scope) {
2039  case SIAtomicScope::SYSTEM:
2040  case SIAtomicScope::AGENT:
2041  // Set the L0 and L1 cache policies to MISS_EVICT.
2042  // Note: there is no L2 cache coherent bypass control at the ISA level.
2043  Changed |= enableGLCBit(MI);
2044  break;
2045  case SIAtomicScope::WORKGROUP:
2046  // In WGP mode the waves of a work-group can be executing on either CU of
2047  // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2048  // CU mode all waves of a work-group are on the same CU, and so the L0
2049  // does not need to be bypassed.
2050  if (!ST.isCuModeEnabled())
2051  Changed |= enableGLCBit(MI);
2052  break;
2053  case SIAtomicScope::WAVEFRONT:
2054  case SIAtomicScope::SINGLETHREAD:
2055  // No cache to bypass.
2056  break;
2057  default:
2058  llvm_unreachable("Unsupported synchronization scope");
2059  }
2060  }
2061 
2062  /// The scratch address space does not need the global memory caches
2063  /// to be bypassed as all memory operations by the same thread are
2064  /// sequentially consistent, and no other thread can access scratch
2065  /// memory.
2066 
2067  /// Other address spaces do not have a cache.
2068 
2069  return Changed;
2070 }
2071 
2072 bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2073  MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2074  bool IsVolatile, bool IsNonTemporal) const {
2075 
2076  // Only handle load and store, not atomic read-modify-write insructions. The
2077  // latter use glc to indicate if the atomic returns a result and so must not
2078  // be used for cache control.
2079  assert(MI->mayLoad() ^ MI->mayStore());
2080 
2081  // Only update load and store, not LLVM IR atomic read-modify-write
2082  // instructions. The latter are always marked as volatile so cannot sensibly
2083  // handle it as do not want to pessimize all atomics. Also they do not support
2084  // the nontemporal attribute.
2086 
2087  bool Changed = false;
2088 
2089  if (IsVolatile) {
2090  // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2091  // and MISS_LRU for store instructions.
2092  // Note: there is no L2 cache coherent bypass control at the ISA level.
2093  if (Op == SIMemOp::LOAD)
2094  Changed |= enableGLCBit(MI);
2095 
2096  // Set MALL NOALLOC for load and store instructions.
2097  Changed |= enableDLCBit(MI);
2098 
2099  // Ensure operation has completed at system scope to cause all volatile
2100  // operations to be visible outside the program in a global order. Do not
2101  // request cross address space as only the global address space can be
2102  // observable outside the program, so no need to cause a waitcnt for LDS
2103  // address space operations.
2104  Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2105  Position::AFTER);
2106  return Changed;
2107  }
2108 
2109  if (IsNonTemporal) {
2110  // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2111  // and L2 cache policy to STREAM.
2112  // For stores setting both GLC and SLC configures L0 and L1 cache policy
2113  // to MISS_EVICT and the L2 cache policy to STREAM.
2114  if (Op == SIMemOp::STORE)
2115  Changed |= enableGLCBit(MI);
2116  Changed |= enableSLCBit(MI);
2117 
2118  // Set MALL NOALLOC for load and store instructions.
2119  Changed |= enableDLCBit(MI);
2120  return Changed;
2121  }
2122 
2123  return Changed;
2124 }
2125 
2126 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2127  if (AtomicPseudoMIs.empty())
2128  return false;
2129 
2130  for (auto &MI : AtomicPseudoMIs)
2131  MI->eraseFromParent();
2132 
2133  AtomicPseudoMIs.clear();
2134  return true;
2135 }
2136 
2137 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2139  assert(MI->mayLoad() && !MI->mayStore());
2140 
2141  bool Changed = false;
2142 
2143  if (MOI.isAtomic()) {
2144  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2145  MOI.getOrdering() == AtomicOrdering::Acquire ||
2146  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2147  Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2148  MOI.getOrderingAddrSpace());
2149  }
2150 
2151  if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2152  Changed |= CC->insertWait(MI, MOI.getScope(),
2153  MOI.getOrderingAddrSpace(),
2155  MOI.getIsCrossAddressSpaceOrdering(),
2156  Position::BEFORE);
2157 
2158  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2159  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2160  Changed |= CC->insertWait(MI, MOI.getScope(),
2161  MOI.getInstrAddrSpace(),
2162  SIMemOp::LOAD,
2163  MOI.getIsCrossAddressSpaceOrdering(),
2164  Position::AFTER);
2165  Changed |= CC->insertAcquire(MI, MOI.getScope(),
2166  MOI.getOrderingAddrSpace(),
2167  Position::AFTER);
2168  }
2169 
2170  return Changed;
2171  }
2172 
2173  // Atomic instructions already bypass caches to the scope specified by the
2174  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2175  // need additional treatment.
2176  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2177  SIMemOp::LOAD, MOI.isVolatile(),
2178  MOI.isNonTemporal());
2179  return Changed;
2180 }
2181 
2182 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2184  assert(!MI->mayLoad() && MI->mayStore());
2185 
2186  bool Changed = false;
2187 
2188  if (MOI.isAtomic()) {
2189  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2190  MOI.getOrdering() == AtomicOrdering::Release ||
2191  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2192  Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2193  MOI.getOrderingAddrSpace());
2194  }
2195 
2196  if (MOI.getOrdering() == AtomicOrdering::Release ||
2197  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2198  Changed |= CC->insertRelease(MI, MOI.getScope(),
2199  MOI.getOrderingAddrSpace(),
2200  MOI.getIsCrossAddressSpaceOrdering(),
2201  Position::BEFORE);
2202 
2203  return Changed;
2204  }
2205 
2206  // Atomic instructions already bypass caches to the scope specified by the
2207  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2208  // need additional treatment.
2209  Changed |= CC->enableVolatileAndOrNonTemporal(
2210  MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2211  MOI.isNonTemporal());
2212  return Changed;
2213 }
2214 
2215 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2217  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2218 
2219  AtomicPseudoMIs.push_back(MI);
2220  bool Changed = false;
2221 
2222  if (MOI.isAtomic()) {
2223  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2224  MOI.getOrdering() == AtomicOrdering::Release ||
2225  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2226  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2227  /// TODO: This relies on a barrier always generating a waitcnt
2228  /// for LDS to ensure it is not reordered with the completion of
2229  /// the proceeding LDS operations. If barrier had a memory
2230  /// ordering and memory scope, then library does not need to
2231  /// generate a fence. Could add support in this file for
2232  /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2233  /// adding S_WAITCNT before a S_BARRIER.
2234  Changed |= CC->insertRelease(MI, MOI.getScope(),
2235  MOI.getOrderingAddrSpace(),
2236  MOI.getIsCrossAddressSpaceOrdering(),
2237  Position::BEFORE);
2238 
2239  // TODO: If both release and invalidate are happening they could be combined
2240  // to use the single "BUFFER_WBINV*" instruction. This could be done by
2241  // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2242  // track cache invalidate and write back instructions.
2243 
2244  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2245  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2246  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2247  Changed |= CC->insertAcquire(MI, MOI.getScope(),
2248  MOI.getOrderingAddrSpace(),
2249  Position::BEFORE);
2250 
2251  return Changed;
2252  }
2253 
2254  return Changed;
2255 }
2256 
2257 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2259  assert(MI->mayLoad() && MI->mayStore());
2260 
2261  bool Changed = false;
2262 
2263  if (MOI.isAtomic()) {
2264  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2265  MOI.getOrdering() == AtomicOrdering::Acquire ||
2266  MOI.getOrdering() == AtomicOrdering::Release ||
2267  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2268  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2269  Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2270  MOI.getInstrAddrSpace());
2271  }
2272 
2273  if (MOI.getOrdering() == AtomicOrdering::Release ||
2274  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2275  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2276  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2277  Changed |= CC->insertRelease(MI, MOI.getScope(),
2278  MOI.getOrderingAddrSpace(),
2279  MOI.getIsCrossAddressSpaceOrdering(),
2280  Position::BEFORE);
2281 
2282  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
2283  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
2284  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
2285  MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2286  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2287  Changed |= CC->insertWait(MI, MOI.getScope(),
2288  MOI.getInstrAddrSpace(),
2289  isAtomicRet(*MI) ? SIMemOp::LOAD :
2291  MOI.getIsCrossAddressSpaceOrdering(),
2292  Position::AFTER);
2293  Changed |= CC->insertAcquire(MI, MOI.getScope(),
2294  MOI.getOrderingAddrSpace(),
2295  Position::AFTER);
2296  }
2297 
2298  return Changed;
2299  }
2300 
2301  return Changed;
2302 }
2303 
2304 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
2305  bool Changed = false;
2306 
2307  SIMemOpAccess MOA(MF);
2308  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
2309 
2310  for (auto &MBB : MF) {
2311  for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2312 
2313  // Unbundle instructions after the post-RA scheduler.
2314  if (MI->isBundle() && MI->mayLoadOrStore()) {
2315  MachineBasicBlock::instr_iterator II(MI->getIterator());
2317  I != E && I->isBundledWithPred(); ++I) {
2318  I->unbundleFromPred();
2319  for (MachineOperand &MO : I->operands())
2320  if (MO.isReg())
2321  MO.setIsInternalRead(false);
2322  }
2323 
2324  MI->eraseFromParent();
2325  MI = II->getIterator();
2326  }
2327 
2328  if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2329  continue;
2330 
2331  if (const auto &MOI = MOA.getLoadInfo(MI))
2332  Changed |= expandLoad(MOI.getValue(), MI);
2333  else if (const auto &MOI = MOA.getStoreInfo(MI))
2334  Changed |= expandStore(MOI.getValue(), MI);
2335  else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2336  Changed |= expandAtomicFence(MOI.getValue(), MI);
2337  else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2338  Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
2339  }
2340  }
2341 
2342  Changed |= removeAtomicPseudoMIs();
2343  return Changed;
2344 }
2345 
2346 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
2347 
2348 char SIMemoryLegalizer::ID = 0;
2349 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
2350 
2352  return new SIMemoryLegalizer();
2353 }
llvm::AtomicOrdering::AcquireRelease
@ AcquireRelease
llvm::SIInstrInfo::isAtomicRet
static bool isAtomicRet(const MachineInstr &MI)
Definition: SIInstrInfo.h:577
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:104
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition: MachineInstrBuilder.h:131
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
GFX10
@ GFX10
Definition: SIInstrInfo.cpp:7850
llvm::createSIMemoryLegalizerPass
FunctionPass * createSIMemoryLegalizerPass()
Definition: SIMemoryLegalizer.cpp:2351
llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition: DiagnosticInfo.h:1009
AtomicOrdering.h
llvm::AMDGPU::getIsaVersion
IsaVersion getIsaVersion(StringRef GPU)
Definition: TargetParser.cpp:193
llvm::Function
Definition: Function.h:60
LLVM_MARK_AS_BITMASK_ENUM
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:41
llvm::ALL
@ ALL
Definition: Attributor.h:5034
llvm::getMergedAtomicOrdering
AtomicOrdering getMergedAtomicOrdering(AtomicOrdering AO, AtomicOrdering Other)
Return a single atomic ordering that is at least as strong as both the AO and Other orderings for an ...
Definition: AtomicOrdering.h:138
AmdgcnSkipCacheInvalidations
static cl::opt< bool > AmdgcnSkipCacheInvalidations("amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, cl::desc("Use this to skip inserting cache invalidating instructions."))
llvm::MachineFunctionPass
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
Definition: MachineFunctionPass.h:30
llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
MachineBasicBlock.h
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
llvm::Optional
Definition: APInt.h:33
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::AMDGPU::getVmcntBitMask
unsigned getVmcntBitMask(const IsaVersion &Version)
Definition: AMDGPUBaseInfo.cpp:1001
getLoadInfo
static Optional< LoadInfo > getLoadInfo(const MachineInstr &MI)
Definition: AArch64FalkorHWPFFix.cpp:237
TargetParser.h
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:491
llvm::MachineFunctionPass::getAnalysisUsage
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
Definition: MachineFunctionPass.cpp:103
llvm::AMDGPU::IsaVersion
Instruction set architecture version.
Definition: TargetParser.h:113
llvm::AMDGPU::CPol::CPol
CPol
Definition: SIDefines.h:303
GCNSubtarget.h
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::AMDGPU::CPol::DLC
@ DLC
Definition: SIDefines.h:306
llvm::AMDGPUSubtarget::isMesa3DOS
bool isMesa3DOS() const
Definition: AMDGPUSubtarget.h:128
INITIALIZE_PASS
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:37
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:125
llvm::AMDGPU::CPol::NT
@ NT
Definition: SIDefines.h:310
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:48
llvm::AMDGPU
Definition: AMDGPUMetadataVerifier.h:34
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:369
llvm::AMDGPU::CPol::SC0
@ SC0
Definition: SIDefines.h:308
llvm::SyncScope::System
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:58
llvm::None
const NoneType None
Definition: None.h:24
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::SIMemoryLegalizerID
char & SIMemoryLegalizerID
Definition: SIMemoryLegalizer.cpp:2349
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:94
llvm::MachineFunction::getMMI
MachineModuleInfo & getMMI() const
Definition: MachineFunction.h:607
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:656
llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition: AtomicOrdering.h:56
llvm::cl::opt< bool >
llvm::AMDGPU::getExpcntBitMask
unsigned getExpcntBitMask(const IsaVersion &Version)
Definition: AMDGPUBaseInfo.cpp:1007
AMDGPUMCTargetDesc.h
llvm::AMDGPU::CPol::GLC
@ GLC
Definition: SIDefines.h:304
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:66
llvm::ISD::ATOMIC_FENCE
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1155
llvm::ISD::LOAD
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:966
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::SyncScope::ID
uint8_t ID
Definition: LLVMContext.h:47
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
llvm::AMDGPUISD::LDS
@ LDS
Definition: AMDGPUISelLowering.h:483
MachineFunctionPass.h
AMDGPUMachineModuleInfo.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition: MachineBasicBlock.h:234
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:97
DEBUG_TYPE
#define DEBUG_TYPE
Definition: SIMemoryLegalizer.cpp:30
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:370
llvm::MachineBasicBlock::instr_end
instr_iterator instr_end()
Definition: MachineBasicBlock.h:264
llvm::MachineFunction
Definition: MachineFunction.h:257
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:263
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:365
AMDGPU.h
llvm::CSKYAttrs::NONE
@ NONE
Definition: CSKYAttributes.h:76
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
BitmaskEnum.h
uint32_t
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::SIInstrFlags::FLAT
@ FLAT
Definition: SIDefines.h:59
llvm::AMDGPU::HSAMD::Kernel::Arg::Key::IsVolatile
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
Definition: AMDGPUMetadata.h:199
llvm::ifs::IFSSymbolType::Func
@ Func
llvm::AMDGPU::SendMsg::Msg
const CustomOperand< const MCSubtargetInfo & > Msg[]
Definition: AMDGPUAsmUtils.cpp:39
llvm::AtomicOrdering::Release
@ Release
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::AMDGPU::encodeWaitcnt
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
Definition: AMDGPUBaseInfo.cpp:1081
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:348
llvm::ilist_iterator
Iterator for intrusive lists based on ilist_node.
Definition: ilist_iterator.h:57
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:366
llvm::AMDGPU::CPol::SLC
@ SLC
Definition: SIDefines.h:305
DiagnosticInfo.h
GFX11
@ GFX11
Definition: SIInstrInfo.cpp:7854
llvm::ISD::STORE
@ STORE
Definition: ISDOpcodes.h:967
llvm::AMDGPU::getLgkmcntBitMask
unsigned getLgkmcntBitMask(const IsaVersion &Version)
Definition: AMDGPUBaseInfo.cpp:1011
llvm::NVPTX::PTXLdStInstCode::GLOBAL
@ GLOBAL
Definition: NVPTX.h:109
llvm::SIInstrInfo
Definition: SIInstrInfo.h:44
llvm::MachineBasicBlock::begin
iterator begin()
Definition: MachineBasicBlock.h:278
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:328
llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:364
llvm::tgtok::Bit
@ Bit
Definition: TGLexer.h:50
IV
static const uint32_t IV[8]
Definition: blake3_impl.h:85
PASS_NAME
#define PASS_NAME
Definition: SIMemoryLegalizer.cpp:31
llvm::SIInstrFlags::maybeAtomic
@ maybeAtomic
Definition: SIDefines.h:85
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
llvm::AMDGPU::CPol::SC1
@ SC1
Definition: SIDefines.h:309
llvm::AMDGPUMachineModuleInfo
Definition: AMDGPUMachineModuleInfo.h:22
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::cl::desc
Definition: CommandLine.h:405
llvm::MachineInstrBundleIterator< MachineInstr >
LDS
AMDGPU promote alloca to vector or LDS
Definition: AMDGPUPromoteAlloca.cpp:137
getScope
static SPIRV::Scope getScope(SyncScope::ID Ord)
Definition: SPIRVInstructionSelector.cpp:407
llvm::MachineBasicBlock::end
iterator end()
Definition: MachineBasicBlock.h:280
llvm::MachineModuleInfo::getObjFileInfo
Ty & getObjFileInfo()
Keep track of various per-module pieces of information for backends that would like to do so.
Definition: MachineModuleInfo.h:171
llvm::AtomicOrdering::NotAtomic
@ NotAtomic
llvm::AMDGPUSubtarget::isAmdPalOS
bool isAmdPalOS() const
Definition: AMDGPUSubtarget.h:124
llvm::SyncScope::SingleThread
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:55
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
llvm::RegState::Undef
@ Undef
Value of the register doesn't matter.
Definition: MachineInstrBuilder.h:52