//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

static cl::opt<bool> AmdgcnSkipCacheInvalidations(
    "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
    cl::desc("Use this to skip inserting cache invalidating instructions."));

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

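/// Summarizes the memory-model properties of a single memory operation: its
/// ordering, synchronization scope, the address spaces it orders and
/// accesses, and whether it is volatile or nontemporal.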
class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsVolatile = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsVolatile = false,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsVolatile(IsVolatile),
        IsNonTemporal(IsNonTemporal) {

    if (Ordering == AtomicOrdering::NotAtomic) {
      assert(Scope == SIAtomicScope::NONE &&
             OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
             !IsCrossAddressSpaceOrdering &&
             FailureOrdering == AtomicOrdering::NotAtomic);
      return;
    }

    assert(Scope != SIAtomicScope::NONE &&
           (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE &&
           (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
               SIAtomicAddrSpace::NONE);

    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;

    // Limit the scope to the maximum supported by the instruction's address
    // spaces.
    if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
        SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
               SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
    } else if ((InstrAddrSpace &
                ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
                  SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
      this->Scope = std::min(Scope, SIAtomicScope::AGENT);
    }
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is volatile, false otherwise.
  bool isVolatile() const {
    return IsVolatile;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is nontemporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }

};

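/// Builds SIMemOpInfo for a machine instruction by inspecting its machine
/// memory operands and mapping LLVM sync scopes and address spaces onto the
/// SI atomic scopes and address spaces above.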
class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;

  /// \returns The SI atomic address spaces corresponding to the LLVM address
  /// space \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

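/// Interface for inserting the cache-control bits and synchronization
/// instructions a memory operation requires; subclassed per hardware
/// generation since the cache hierarchy and wait/invalidate instructions
/// differ between generations.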
class SICacheControl {
protected:

  /// AMDGPU subtarget info.
  const GCNSubtarget &ST;

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  /// Whether to insert cache invalidating instructions.
  bool InsertCacheInv;

  SICacheControl(const GCNSubtarget &ST);

  /// Sets the named bit \p Bit to "true" if present in instruction \p MI.
  /// \returns Returns true if \p MI is modified, false otherwise.
  bool enableNamedBit(const MachineBasicBlock::iterator MI,
                      AMDGPU::CPol::CPol Bit) const;

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory store instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory read-modify-write instruction to bypass any caches up
  /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
  /// iff the instruction was modified.
  virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction of kind \p Op associated with address
  /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
  /// true iff the instruction was modified.
  virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
                                              SIAtomicAddrSpace AddrSpace,
                                              SIMemOp Op, bool IsVolatile,
                                              bool IsNonTemporal) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions before \p Pos of kind
  /// \p Op associated with address spaces \p AddrSpace have completed. Used
  /// between memory instructions to enforce the order they become visible as
  /// observed by other memory instructions executing in memory scope \p Scope.
  /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
  /// address spaces. Returns true iff any instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure any subsequent memory instructions of this
  /// thread with address spaces \p AddrSpace will observe the previous memory
  /// operations by any thread for memory scopes up to memory scope \p Scope.
  /// Returns true iff any instructions inserted.
  virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative to
  /// instruction \p MI to ensure previous memory instructions by this thread
  /// with address spaces \p AddrSpace have completed and can be observed by
  /// subsequent memory instructions by any thread executing in memory scope \p
  /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
  /// between address spaces. Returns true iff any instructions inserted.
  virtual bool insertRelease(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             bool IsCrossAddrSpaceOrdering,
                             Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;

};

353 
354 class SIGfx6CacheControl : public SICacheControl {
355 protected:
356 
357  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
358  /// is modified, false otherwise.
359  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
360  return enableNamedBit(MI, AMDGPU::CPol::GLC);
361  }
362 
363  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
364  /// is modified, false otherwise.
365  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
366  return enableNamedBit(MI, AMDGPU::CPol::SLC);
367  }
368 
369 public:
370 
371  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
372 
373  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
374  SIAtomicScope Scope,
375  SIAtomicAddrSpace AddrSpace) const override;
376 
377  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
378  SIAtomicScope Scope,
379  SIAtomicAddrSpace AddrSpace) const override;
380 
381  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
382  SIAtomicScope Scope,
383  SIAtomicAddrSpace AddrSpace) const override;
384 
385  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
386  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
387  bool IsVolatile,
388  bool IsNonTemporal) const override;
389 
390  bool insertWait(MachineBasicBlock::iterator &MI,
391  SIAtomicScope Scope,
392  SIAtomicAddrSpace AddrSpace,
393  SIMemOp Op,
394  bool IsCrossAddrSpaceOrdering,
395  Position Pos) const override;
396 
397  bool insertAcquire(MachineBasicBlock::iterator &MI,
398  SIAtomicScope Scope,
399  SIAtomicAddrSpace AddrSpace,
400  Position Pos) const override;
401 
402  bool insertRelease(MachineBasicBlock::iterator &MI,
403  SIAtomicScope Scope,
404  SIAtomicAddrSpace AddrSpace,
405  bool IsCrossAddrSpaceOrdering,
406  Position Pos) const override;
407 };
408 
409 class SIGfx7CacheControl : public SIGfx6CacheControl {
410 public:
411 
412  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
413 
414  bool insertAcquire(MachineBasicBlock::iterator &MI,
415  SIAtomicScope Scope,
416  SIAtomicAddrSpace AddrSpace,
417  Position Pos) const override;
418 
419 };
420 
421 class SIGfx90ACacheControl : public SIGfx7CacheControl {
422 public:
423 
424  SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
425 
426  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
427  SIAtomicScope Scope,
428  SIAtomicAddrSpace AddrSpace) const override;
429 
430  bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
431  SIAtomicScope Scope,
432  SIAtomicAddrSpace AddrSpace) const override;
433 
434  bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
435  SIAtomicScope Scope,
436  SIAtomicAddrSpace AddrSpace) const override;
437 
438  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
439  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
440  bool IsVolatile,
441  bool IsNonTemporal) const override;
442 
443  bool insertWait(MachineBasicBlock::iterator &MI,
444  SIAtomicScope Scope,
445  SIAtomicAddrSpace AddrSpace,
446  SIMemOp Op,
447  bool IsCrossAddrSpaceOrdering,
448  Position Pos) const override;
449 
450  bool insertAcquire(MachineBasicBlock::iterator &MI,
451  SIAtomicScope Scope,
452  SIAtomicAddrSpace AddrSpace,
453  Position Pos) const override;
454 
455  bool insertRelease(MachineBasicBlock::iterator &MI,
456  SIAtomicScope Scope,
457  SIAtomicAddrSpace AddrSpace,
458  bool IsCrossAddrSpaceOrdering,
459  Position Pos) const override;
460 };
461 
462 class SIGfx10CacheControl : public SIGfx7CacheControl {
463 protected:
464 
465  /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
466  /// is modified, false otherwise.
467  bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
468  return enableNamedBit(MI, AMDGPU::CPol::DLC);
469  }
470 
471 public:
472 
473  SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
474 
475  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
476  SIAtomicScope Scope,
477  SIAtomicAddrSpace AddrSpace) const override;
478 
479  bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
480  SIAtomicAddrSpace AddrSpace, SIMemOp Op,
481  bool IsVolatile,
482  bool IsNonTemporal) const override;
483 
484  bool insertWait(MachineBasicBlock::iterator &MI,
485  SIAtomicScope Scope,
486  SIAtomicAddrSpace AddrSpace,
487  SIMemOp Op,
488  bool IsCrossAddrSpaceOrdering,
489  Position Pos) const override;
490 
491  bool insertAcquire(MachineBasicBlock::iterator &MI,
492  SIAtomicScope Scope,
493  SIAtomicAddrSpace AddrSpace,
494  Position Pos) const override;
495 };
496 
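/// The pass itself: visits every instruction that may access memory and,
/// driven by the SIMemOpInfo extracted for it, asks the SICacheControl to
/// legalize it according to the AMDGPU memory model.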
class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return SIInstrInfo::isAtomicRet(MI);
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end namespace anonymous

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

558 
560 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
561  SIAtomicAddrSpace InstrAddrSpace) const {
562  if (SSID == SyncScope::System)
563  return std::make_tuple(SIAtomicScope::SYSTEM,
564  SIAtomicAddrSpace::ATOMIC,
565  true);
566  if (SSID == MMI->getAgentSSID())
567  return std::make_tuple(SIAtomicScope::AGENT,
568  SIAtomicAddrSpace::ATOMIC,
569  true);
570  if (SSID == MMI->getWorkgroupSSID())
571  return std::make_tuple(SIAtomicScope::WORKGROUP,
572  SIAtomicAddrSpace::ATOMIC,
573  true);
574  if (SSID == MMI->getWavefrontSSID())
575  return std::make_tuple(SIAtomicScope::WAVEFRONT,
576  SIAtomicAddrSpace::ATOMIC,
577  true);
578  if (SSID == SyncScope::SingleThread)
579  return std::make_tuple(SIAtomicScope::SINGLETHREAD,
580  SIAtomicAddrSpace::ATOMIC,
581  true);
582  if (SSID == MMI->getSystemOneAddressSpaceSSID())
583  return std::make_tuple(SIAtomicScope::SYSTEM,
584  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
585  false);
586  if (SSID == MMI->getAgentOneAddressSpaceSSID())
587  return std::make_tuple(SIAtomicScope::AGENT,
588  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
589  false);
590  if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
591  return std::make_tuple(SIAtomicScope::WORKGROUP,
592  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
593  false);
594  if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
595  return std::make_tuple(SIAtomicScope::WAVEFRONT,
596  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
597  false);
598  if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
599  return std::make_tuple(SIAtomicScope::SINGLETHREAD,
600  SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
601  false);
602  return None;
603 }
604 
SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;
  bool IsVolatile = false;

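  // Note the accumulation directions below: IsNonTemporal starts true and is
  // AND-ed across operands (every operand must be nontemporal), while
  // IsVolatile starts false and is OR-ed (any volatile operand suffices).
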
  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    IsVolatile |= MMO->isVolatile();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
        ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
                     AtomicOrdering::NotAtomic);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
  InsertCacheInv = !AmdgcnSkipCacheInvalidations;
}

bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
                                    AMDGPU::CPol::CPol Bit) const {
  MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
  if (!CPol)
    return false;

  CPol->setImm(CPol->getImm() | Bit);
  return true;
}

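// Note that gfx90a must be tested before the generation comparisons below:
// it reports a GFX9 generation, so falling through would otherwise select
// the GFX7 cache control for it.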
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (ST.hasGFX90AInsts())
    return std::make_unique<SIGfx90ACacheControl>(ST);
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return std::make_unique<SIGfx6CacheControl>(ST);
  if (Generation < AMDGPUSubtarget::GFX10)
    return std::make_unique<SIGfx7CacheControl>(ST);
  return std::make_unique<SIGfx10CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  /// The L1 cache is write through, so it does not need to be bypassed. There
  /// is no bypass control for the L2 cache at the ISA level.

  return Changed;
}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, so it must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD)
      Changed |= enableGLCBit(MI);

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
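    // Encode a wait: a count of zero forces the counter to drain, while a
    // counter's maximal bit mask value means that counter is not waited on.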
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       bool IsCrossAddrSpaceOrdering,
                                       Position Pos) const {
  return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
                    IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                       SIAtomicScope Scope,
                                       SIAtomicAddrSpace AddrSpace,
                                       Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
                                    ? AMDGPU::BUFFER_WBINVL1
                                    : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx90ACacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to bypass the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be bypassed.
      if (ST.isTgSplitEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableStoreCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(!MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for store atomic operations as they implicitly write
      /// through the L1 cache.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. Store atomics implicitly write through the L1
      // cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx90ACacheControl::enableRMWCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      /// Do not set glc for RMW atomic operations as they implicitly bypass
      /// the L1 cache, and the glc bit is instead used to indicate if they are
      /// return or no-return.
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  return Changed;
}

bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {
  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, so it must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {
    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);

    return Changed;
  }

  if (IsNonTemporal) {
    // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableGLCBit(MI);
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace,
                                      SIMemOp Op,
                                      bool IsCrossAddrSpaceOrdering,
                                      Position Pos) const {
  if (ST.isTgSplitEnabled()) {
    // In threadgroup split mode the waves of a work-group can be executing on
    // different CUs. Therefore need to wait for global or GDS memory
    // operations to complete to ensure they are visible to waves in the other
    // CUs. Otherwise in non-threadgroup split mode all waves of a work-group
    // are on the same CU, so there is no need to wait for global memory as
    // all waves in the work-group access the same L1, nor for GDS as accesses
    // are ordered on a CU.
    if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
                       SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
        (Scope == SIAtomicScope::WORKGROUP)) {
      // Same as GFX7 using agent scope.
      Scope = SIAtomicScope::AGENT;
    }
    // In threadgroup split mode LDS cannot be allocated so no need to wait for
    // LDS memory operations.
    AddrSpace &= ~SIAtomicAddrSpace::LDS;
  }
  return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
                                        IsCrossAddrSpaceOrdering, Pos);
}

bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         Position Pos) const {
  if (!InsertCacheInv)
    return false;

  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Ensures that following loads will not see stale remote VMEM data or
      // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
      // CC will never be stale due to the local memory probes.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
      // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
      // remove any cache lines of earlier writes by the same wave and ensures
      // later reads by the same wave will refetch the cache lines.
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
      // Same as GFX7.
      break;
    case SIAtomicScope::WORKGROUP:
      // In threadgroup split mode the waves of a work-group can be executing
      // on different CUs. Therefore need to invalidate the L1 which is per CU.
      // Otherwise in non-threadgroup split mode all waves of a work-group are
      // on the same CU, and so the L1 does not need to be invalidated.
      if (ST.isTgSplitEnabled()) {
        // Same as GFX7 using agent scope.
        Scope = SIAtomicScope::AGENT;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);

  return Changed;
}

bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace,
                                         bool IsCrossAddrSpaceOrdering,
                                         Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
      // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
      // hardware does not reorder memory operations by the same wave with
      // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
      // to initiate writeback of any dirty cache lines of earlier writes by
      // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
      // writeback has completed.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
      // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
      // vmcnt(0)" needed by the "BUFFER_WBL2".
      Changed = true;
      break;
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // Same as GFX7.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (Pos == Position::AFTER)
    --MI;

  Changed |=
      SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
                                        IsCrossAddrSpaceOrdering, Pos);

  return Changed;
}

bool SIGfx10CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L0/L1 caches.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore need to bypass the L0 which is per CU.
      // Otherwise in CU mode all waves of a work-group are on the same CU,
      // and so the L0 does not need to be bypassed.
      if (!ST.isCuModeEnabled())
        Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
    MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
    bool IsVolatile, bool IsNonTemporal) const {

  // Only handle load and store, not atomic read-modify-write instructions.
  // The latter use glc to indicate if the atomic returns a result, so it must
  // not be used for cache control.
  assert(MI->mayLoad() ^ MI->mayStore());

  // Only update load and store, not LLVM IR atomic read-modify-write
  // instructions. The latter are always marked as volatile, so they cannot
  // sensibly be handled here without pessimizing all atomics. They also do
  // not support the nontemporal attribute.
  assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);

  bool Changed = false;

  if (IsVolatile) {

    if (Op == SIMemOp::LOAD) {
      Changed |= enableGLCBit(MI);
      Changed |= enableDLCBit(MI);
    }

    // Ensure operation has completed at system scope to cause all volatile
    // operations to be visible outside the program in a global order. Do not
    // request cross address space as only the global address space can be
    // observable outside the program, so no need to cause a waitcnt for LDS
    // address space operations.
    Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
                          Position::AFTER);
    return Changed;
  }

  if (IsNonTemporal) {
    // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
    Changed |= enableSLCBit(MI);
    return Changed;
  }

  return Changed;
}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     SIMemOp Op,
                                     bool IsCrossAddrSpaceOrdering,
                                     Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool VSCnt = false;
  bool LGKMCnt = false;

  if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
      SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
        VMCnt |= true;
      if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
        VSCnt |= true;
      break;
    case SIAtomicScope::WORKGROUP:
      // In WGP mode the waves of a work-group can be executing on either CU
      // of the WGP. Therefore need to wait for operations to complete to
      // ensure they are visible to waves in the other CU as the L0 is per CU.
      // Otherwise, in CU mode, all waves of a work-group are on the same CU
      // and share the same L0.
      if (!ST.isCuModeEnabled()) {
        if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
          VMCnt |= true;
        if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
          VSCnt |= true;
      }
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L0 cache keeps all memory operations in order for
      // work-items in the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
      // not needed as LDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/GDS memory as LDS operations could be
      // reordered with respect to later global/GDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
      // is not needed as GDS operations for all waves are executed in a total
      // global ordering as observed by all waves. Required if also
      // synchronizing with global/LDS memory as GDS operations could be
      // reordered with respect to later global/LDS memory operations of the
      // same wave.
      LGKMCnt |= IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

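  // GFX10 counts outstanding stores separately in vscnt, which S_WAITCNT
  // cannot express, so a store wait needs a separate S_WAITCNT_VSCNT of zero.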
  if (VSCnt) {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
        .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
        .addImm(0);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

1581 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1582  SIAtomicScope Scope,
1583  SIAtomicAddrSpace AddrSpace,
1584  Position Pos) const {
1585  if (!InsertCacheInv)
1586  return false;
1587 
1588  bool Changed = false;
1589 
1590  MachineBasicBlock &MBB = *MI->getParent();
1591  DebugLoc DL = MI->getDebugLoc();
1592 
1593  if (Pos == Position::AFTER)
1594  ++MI;
1595 
1596  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1597  switch (Scope) {
1598  case SIAtomicScope::SYSTEM:
1599  case SIAtomicScope::AGENT:
1600  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1601  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1602  Changed = true;
1603  break;
1604  case SIAtomicScope::WORKGROUP:
1605  // In WGP mode the waves of a work-group can be executing on either CU of
1606  // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1607  // in CU mode and all waves of a work-group are on the same CU, and so the
1608  // L0 does not need to be invalidated.
1609  if (!ST.isCuModeEnabled()) {
1610  BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1611  Changed = true;
1612  }
1613  break;
1614  case SIAtomicScope::WAVEFRONT:
1615  case SIAtomicScope::SINGLETHREAD:
1616  // No cache to invalidate.
1617  break;
1618  default:
1619  llvm_unreachable("Unsupported synchronization scope");
1620  }
1621  }
1622 
1623  /// The scratch address space does not need the global memory cache
1624  /// to be flushed as all memory operations by the same thread are
1625  /// sequentially consistent, and no other thread can access scratch
1626  /// memory.
1627 
1628  /// Other address spaces do not have a cache.
1629 
1630  if (Pos == Position::AFTER)
1631  --MI;
1632 
1633  return Changed;
1634 }
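
// Illustrative example (editorial sketch, not part of the upstream file): an
// agent-scope acquire on gfx10 yields the two invalidates built above, which
// print as:
//
//   buffer_gl0_inv
//   buffer_gl1_inv
//
// At workgroup scope only buffer_gl0_inv is emitted, and only in WGP mode,
// since the L0 is per-CU; in CU mode no invalidate is needed.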
1635 
1636 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1637  if (AtomicPseudoMIs.empty())
1638  return false;
1639 
1640  for (auto &MI : AtomicPseudoMIs)
1641  MI->eraseFromParent();
1642 
1643  AtomicPseudoMIs.clear();
1644  return true;
1645 }
1646 
1647 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1648  MachineBasicBlock::iterator &MI) {
1649  assert(MI->mayLoad() && !MI->mayStore());
1650 
1651  bool Changed = false;
1652 
1653  if (MOI.isAtomic()) {
1654  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1655  MOI.getOrdering() == AtomicOrdering::Acquire ||
1656  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1657  Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1658  MOI.getOrderingAddrSpace());
1659  }
1660 
1661  if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1662  Changed |= CC->insertWait(MI, MOI.getScope(),
1663  MOI.getOrderingAddrSpace(),
1664  SIMemOp::LOAD | SIMemOp::STORE,
1665  MOI.getIsCrossAddressSpaceOrdering(),
1666  Position::BEFORE);
1667 
1668  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1669  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1670  Changed |= CC->insertWait(MI, MOI.getScope(),
1671  MOI.getInstrAddrSpace(),
1672  SIMemOp::LOAD,
1673  MOI.getIsCrossAddressSpaceOrdering(),
1674  Position::AFTER);
1675  Changed |= CC->insertAcquire(MI, MOI.getScope(),
1676  MOI.getOrderingAddrSpace(),
1677  Position::AFTER);
1678  }
1679 
1680  return Changed;
1681  }
1682 
1683  // Atomic instructions already bypass caches to the scope specified by the
1684  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1685  // need additional treatment.
1686  Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1687  SIMemOp::LOAD, MOI.isVolatile(),
1688  MOI.isNonTemporal());
1689  return Changed;
1690 }
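
// Illustrative example (editorial sketch, not part of the upstream file): an
// IR atomic load such as
//
//   %v = load atomic i32, i32 addrspace(1)* %p syncscope("agent") acquire, align 4
//
// takes the atomic path above: the load's cache-bypass bits are enabled, and
// a wait followed by a cache invalidate is inserted after it. A seq_cst load
// additionally gets a wait inserted before it.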
1691 
1692 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1693  MachineBasicBlock::iterator &MI) {
1694  assert(!MI->mayLoad() && MI->mayStore());
1695 
1696  bool Changed = false;
1697 
1698  if (MOI.isAtomic()) {
1699  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1700  MOI.getOrdering() == AtomicOrdering::Release ||
1701  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1702  Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
1703  MOI.getOrderingAddrSpace());
1704  }
1705 
1706  if (MOI.getOrdering() == AtomicOrdering::Release ||
1707  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1708  Changed |= CC->insertRelease(MI, MOI.getScope(),
1709  MOI.getOrderingAddrSpace(),
1710  MOI.getIsCrossAddressSpaceOrdering(),
1711  Position::BEFORE);
1712 
1713  return Changed;
1714  }
1715 
1716  // Atomic instructions already bypass caches to the scope specified by the
1717  // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1718  // need additional treatment.
1719  Changed |= CC->enableVolatileAndOrNonTemporal(
1720  MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1721  MOI.isNonTemporal());
1722  return Changed;
1723 }
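
// Illustrative example (editorial sketch, not part of the upstream file): an
// IR atomic store such as
//
//   store atomic i32 %v, i32 addrspace(1)* %p syncscope("agent") release, align 4
//
// enables the store's cache-bypass bits and places a release (a wait, plus a
// cache writeback on targets whose caches are write-back) before it.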
1724 
1725 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1726  MachineBasicBlock::iterator &MI) {
1727  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1728 
1729  AtomicPseudoMIs.push_back(MI);
1730  bool Changed = false;
1731 
1732  if (MOI.isAtomic()) {
1733  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1734  MOI.getOrdering() == AtomicOrdering::Release ||
1735  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1736  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1737  /// TODO: This relies on a barrier always generating a waitcnt
1738  /// for LDS to ensure it is not reordered with the completion of
1739  /// the preceding LDS operations. If the barrier had a memory
1740  /// ordering and memory scope, then the library would not need to
1741  /// generate a fence. Could add support in this file for
1742  /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
1743  /// adding S_WAITCNT before an S_BARRIER.
1744  Changed |= CC->insertRelease(MI, MOI.getScope(),
1745  MOI.getOrderingAddrSpace(),
1746  MOI.getIsCrossAddressSpaceOrdering(),
1747  Position::BEFORE);
1748 
1749  // TODO: If both release and invalidate are happening they could be combined
1750  // to use a single "BUFFER_WBINV*" instruction. This could be done by
1751  // reorganizing this code or as part of optimizing the SIInsertWaitcnt pass
1752  // to track cache invalidate and write back instructions.
1753 
1754  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1755  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1756  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1757  Changed |= CC->insertAcquire(MI, MOI.getScope(),
1758  MOI.getOrderingAddrSpace(),
1759  Position::BEFORE);
1760 
1761  return Changed;
1762  }
1763 
1764  return Changed;
1765 }
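
// Illustrative example (editorial sketch, not part of the upstream file): the
// IR instruction
//
//   fence syncscope("workgroup") acquire
//
// reaches this function as an ATOMIC_FENCE pseudo. The pseudo itself is
// queued in AtomicPseudoMIs and later deleted; what remains is the wait
// inserted by insertRelease and, for acquiring orderings, the cache
// invalidate inserted by insertAcquire.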
1766 
1767 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1768  MachineBasicBlock::iterator &MI) {
1769  assert(MI->mayLoad() && MI->mayStore());
1770 
1771  bool Changed = false;
1772 
1773  if (MOI.isAtomic()) {
1774  if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1775  MOI.getOrdering() == AtomicOrdering::Acquire ||
1776  MOI.getOrdering() == AtomicOrdering::Release ||
1777  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1778  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1779  Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
1780  MOI.getInstrAddrSpace());
1781  }
1782 
1783  if (MOI.getOrdering() == AtomicOrdering::Release ||
1784  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1785  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1786  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1787  Changed |= CC->insertRelease(MI, MOI.getScope(),
1788  MOI.getOrderingAddrSpace(),
1789  MOI.getIsCrossAddressSpaceOrdering(),
1790  Position::BEFORE);
1791 
1792  if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1793  MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1794  MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1795  MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1796  MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1797  Changed |= CC->insertWait(MI, MOI.getScope(),
1798  MOI.getInstrAddrSpace(),
1799  isAtomicRet(*MI) ? SIMemOp::LOAD :
1800  SIMemOp::STORE,
1801  MOI.getIsCrossAddressSpaceOrdering(),
1802  Position::AFTER);
1803  Changed |= CC->insertAcquire(MI, MOI.getScope(),
1804  MOI.getOrderingAddrSpace(),
1805  Position::AFTER);
1806  }
1807 
1808  return Changed;
1809  }
1810 
1811  return Changed;
1812 }
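
// Illustrative example (editorial sketch, not part of the upstream file): a
// read-modify-write such as
//
//   %old = atomicrmw add i32 addrspace(1)* %p, i32 1 syncscope("agent") seq_cst
//
// receives both halves above: a release before the instruction, and a wait
// plus cache invalidate after it. The trailing wait covers the LOAD counter
// for returning atomics and the STORE counter otherwise (see isAtomicRet).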
1813 
1814 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1815  bool Changed = false;
1816 
1817  SIMemOpAccess MOA(MF);
1818  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1819 
1820  for (auto &MBB : MF) {
1821  for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1822 
1823  // Unbundle instructions after the post-RA scheduler.
1824  if (MI->isBundle() && MI->mayLoadOrStore()) {
1825  MachineBasicBlock::instr_iterator II(MI->getIterator());
1826  for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
1827  I != E && I->isBundledWithPred(); ++I) {
1828  I->unbundleFromPred();
1829  for (MachineOperand &MO : I->operands())
1830  if (MO.isReg())
1831  MO.setIsInternalRead(false);
1832  }
1833 
1834  MI->eraseFromParent();
1835  MI = II->getIterator();
1836  }
1837 
1838  if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1839  continue;
1840 
1841  if (const auto &MOI = MOA.getLoadInfo(MI))
1842  Changed |= expandLoad(MOI.getValue(), MI);
1843  else if (const auto &MOI = MOA.getStoreInfo(MI))
1844  Changed |= expandStore(MOI.getValue(), MI);
1845  else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1846  Changed |= expandAtomicFence(MOI.getValue(), MI);
1847  else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1848  Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1849  }
1850  }
1851 
1852  Changed |= removeAtomicPseudoMIs();
1853  return Changed;
1854 }
1855 
1856 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1857 
1858 char SIMemoryLegalizer::ID = 0;
1859 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1860 
1861 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1862  return new SIMemoryLegalizer();
1863 }
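
// Usage sketch (editorial, with an assumed input file name): the pass is
// registered under the name "si-memory-legalizer" (DEBUG_TYPE above), so its
// effect on machine IR can be examined in isolation with llc, e.g.:
//
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 \
//       -run-pass=si-memory-legalizer input.mir -o -
//
// where input.mir is a hypothetical MIR test case containing atomic loads,
// stores, fences, or RMW instructions.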