LLVM 23.0.0git
SIMemoryLegalizer.cpp
Go to the documentation of this file.
1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
27#include "llvm/IR/PassManager.h"
30#include "llvm/Support/Debug.h"
32
33using namespace llvm;
34using namespace llvm::AMDGPU;
35
36#define DEBUG_TYPE "si-memory-legalizer"
37#define PASS_NAME "SI Memory Legalizer"
38
40 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
41 cl::desc("Use this to skip inserting cache invalidating instructions."));
42
43namespace {
44
46
47/// Memory operation flags. Can be ORed together.
48enum class SIMemOp {
49 NONE = 0u,
50 LOAD = 1u << 0,
51 STORE = 1u << 1,
52 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
53};
54
/// Where a newly created instruction is placed with respect to an existing
/// one.
enum class Position { BEFORE = 0, AFTER = 1 };
61
/// Atomic synchronization scopes supported by the AMDGPU target.
///
/// The declaration order is significant: code in this file compares scopes
/// (e.g. with std::min), so the enumerators must stay in this order.
enum class SIAtomicScope {
  NONE = 0,
  SINGLETHREAD = 1,
  WAVEFRONT = 2,
  WORKGROUP = 3,
  /// Promoted to AGENT on targets without workgroup clusters.
  CLUSTER = 4,
  AGENT = 5,
  SYSTEM = 6
};
72
73/// The distinct address spaces supported by the AMDGPU target for
74/// atomic memory operation. Can be ORed together.
75enum class SIAtomicAddrSpace {
76 NONE = 0u,
77 GLOBAL = 1u << 0,
78 LDS = 1u << 1,
79 SCRATCH = 1u << 2,
80 GDS = 1u << 3,
81 OTHER = 1u << 4,
82
83 /// The address spaces that can be accessed by a FLAT instruction.
84 FLAT = GLOBAL | LDS | SCRATCH,
85
86 /// The address spaces that support atomic instructions.
87 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
88
89 /// All address spaces.
90 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
91
92 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
93};
94
95#ifndef NDEBUG
96static StringRef toString(SIAtomicScope S) {
97 switch (S) {
98 case SIAtomicScope::NONE:
99 return "none";
100 case SIAtomicScope::SINGLETHREAD:
101 return "singlethread";
102 case SIAtomicScope::WAVEFRONT:
103 return "wavefront";
104 case SIAtomicScope::WORKGROUP:
105 return "workgroup";
106 case SIAtomicScope::CLUSTER:
107 return "cluster";
108 case SIAtomicScope::AGENT:
109 return "agent";
110 case SIAtomicScope::SYSTEM:
111 return "system";
112 }
113 llvm_unreachable("unknown atomic scope");
114}
115
116static raw_ostream &operator<<(raw_ostream &OS, SIAtomicAddrSpace AS) {
117 if (AS == SIAtomicAddrSpace::NONE) {
118 OS << "none";
119 return OS;
120 }
121 ListSeparator LS("|");
122 if ((AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE)
123 OS << LS << "global";
124 if ((AS & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE)
125 OS << LS << "lds";
126 if ((AS & SIAtomicAddrSpace::SCRATCH) != SIAtomicAddrSpace::NONE)
127 OS << LS << "scratch";
128 if ((AS & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE)
129 OS << LS << "gds";
130 if ((AS & SIAtomicAddrSpace::OTHER) != SIAtomicAddrSpace::NONE)
131 OS << LS << "other";
132 return OS;
133}
134#endif
135
136class SIMemOpInfo final {
137private:
138
139 friend class SIMemOpAccess;
140
141 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
142 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
143 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
144 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
145 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
146 bool IsCrossAddressSpaceOrdering = false;
147 bool IsVolatile = false;
148 bool IsNonTemporal = false;
149 bool IsLastUse = false;
150 bool IsCooperative = false;
151
152 // TODO: Should we assume Cooperative=true if no MMO is present?
153 SIMemOpInfo(
154 const GCNSubtarget &ST,
155 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
156 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
157 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
158 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
159 bool IsCrossAddressSpaceOrdering = true,
160 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
161 bool IsVolatile = false, bool IsNonTemporal = false,
162 bool IsLastUse = false, bool IsCooperative = false,
163 bool CanDemoteWorkgroupToWavefront = false)
164 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
165 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
166 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
167 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
168 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
169
170 if (Ordering == AtomicOrdering::NotAtomic) {
171 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
172 assert(Scope == SIAtomicScope::NONE &&
173 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
174 !IsCrossAddressSpaceOrdering &&
175 FailureOrdering == AtomicOrdering::NotAtomic);
176 return;
177 }
178
179 assert(Scope != SIAtomicScope::NONE &&
180 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
181 SIAtomicAddrSpace::NONE &&
182 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
183 SIAtomicAddrSpace::NONE);
184
185 // There is also no cross address space ordering if the ordering
186 // address space is the same as the instruction address space and
187 // only contains a single address space.
188 if ((OrderingAddrSpace == InstrAddrSpace) &&
189 isPowerOf2_32(uint32_t(InstrAddrSpace)))
190 this->IsCrossAddressSpaceOrdering = false;
191
192 // Limit the scope to the maximum supported by the instruction's address
193 // spaces.
194 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
195 SIAtomicAddrSpace::NONE) {
196 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
197 } else if ((InstrAddrSpace &
198 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
199 SIAtomicAddrSpace::NONE) {
200 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
201 } else if ((InstrAddrSpace &
202 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
203 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
204 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
205 }
206
207 // On targets that have no concept of a workgroup cluster, use
208 // AGENT scope as a conservatively correct alternative.
209 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
210 this->Scope = SIAtomicScope::AGENT;
211
212 // When max flat work-group size is at most the wavefront size, the
213 // work-group fits in a single wave, so LLVM workgroup scope matches
214 // wavefront scope. Demote workgroup → wavefront here for fences and for
215 // atomics with ordering stronger than monotonic.
216 if (CanDemoteWorkgroupToWavefront &&
217 this->Scope == SIAtomicScope::WORKGROUP &&
218 (llvm::isStrongerThan(this->Ordering, AtomicOrdering::Monotonic) ||
219 llvm::isStrongerThan(this->FailureOrdering,
220 AtomicOrdering::Monotonic)))
221 this->Scope = SIAtomicScope::WAVEFRONT;
222 }
223
224public:
225 /// \returns Atomic synchronization scope of the machine instruction used to
226 /// create this SIMemOpInfo.
227 SIAtomicScope getScope() const {
228 return Scope;
229 }
230
231 /// \returns Ordering constraint of the machine instruction used to
232 /// create this SIMemOpInfo.
233 AtomicOrdering getOrdering() const {
234 return Ordering;
235 }
236
237 /// \returns Failure ordering constraint of the machine instruction used to
238 /// create this SIMemOpInfo.
239 AtomicOrdering getFailureOrdering() const {
240 return FailureOrdering;
241 }
242
243 /// \returns The address spaces be accessed by the machine
244 /// instruction used to create this SIMemOpInfo.
245 SIAtomicAddrSpace getInstrAddrSpace() const {
246 return InstrAddrSpace;
247 }
248
249 /// \returns The address spaces that must be ordered by the machine
250 /// instruction used to create this SIMemOpInfo.
251 SIAtomicAddrSpace getOrderingAddrSpace() const {
252 return OrderingAddrSpace;
253 }
254
255 /// \returns Return true iff memory ordering of operations on
256 /// different address spaces is required.
257 bool getIsCrossAddressSpaceOrdering() const {
258 return IsCrossAddressSpaceOrdering;
259 }
260
261 /// \returns True if memory access of the machine instruction used to
262 /// create this SIMemOpInfo is volatile, false otherwise.
263 bool isVolatile() const {
264 return IsVolatile;
265 }
266
267 /// \returns True if memory access of the machine instruction used to
268 /// create this SIMemOpInfo is nontemporal, false otherwise.
269 bool isNonTemporal() const {
270 return IsNonTemporal;
271 }
272
273 /// \returns True if memory access of the machine instruction used to
274 /// create this SIMemOpInfo is last use, false otherwise.
275 bool isLastUse() const { return IsLastUse; }
276
277 /// \returns True if this is a cooperative load or store atomic.
278 bool isCooperative() const { return IsCooperative; }
279
280 /// \returns True if ordering constraint of the machine instruction used to
281 /// create this SIMemOpInfo is unordered or higher, false otherwise.
282 bool isAtomic() const {
283 return Ordering != AtomicOrdering::NotAtomic;
284 }
285
286};
287
288class SIMemOpAccess final {
289private:
290 const AMDGPUMachineModuleInfo *MMI = nullptr;
291 const GCNSubtarget &ST;
292 const bool CanDemoteWorkgroupToWavefront;
293
294 /// Reports unsupported message \p Msg for \p MI to LLVM context.
295 void reportUnsupported(const MachineBasicBlock::iterator &MI,
296 const char *Msg) const;
297
298 /// Inspects the target synchronization scope \p SSID and determines
299 /// the SI atomic scope it corresponds to, the address spaces it
300 /// covers, and whether the memory ordering applies between address
301 /// spaces.
302 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
303 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
304
305 /// \return Return a bit set of the address spaces accessed by \p AS.
306 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
307
308 /// \returns Info constructed from \p MI, which has at least machine memory
309 /// operand.
310 std::optional<SIMemOpInfo>
311 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
312
313public:
314 /// Construct class to support accessing the machine memory operands
315 /// of instructions in the machine function \p MF.
316 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST,
317 const Function &F);
318
319 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
320 std::optional<SIMemOpInfo>
322
323 /// \returns Store info if \p MI is a store operation, "std::nullopt"
324 /// otherwise.
325 std::optional<SIMemOpInfo>
326 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
327
328 /// \returns Atomic fence info if \p MI is an atomic fence operation,
329 /// "std::nullopt" otherwise.
330 std::optional<SIMemOpInfo>
331 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
332
333 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
334 /// rmw operation, "std::nullopt" otherwise.
335 std::optional<SIMemOpInfo>
336 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
337
338 /// \returns DMA to LDS info if \p MI is as a direct-to/from-LDS load/store,
339 /// along with an indication of whether this is a load or store. If it is not
340 /// a direct-to-LDS operation, returns std::nullopt.
341 std::optional<SIMemOpInfo>
342 getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
343};
344
345class SICacheControl {
346protected:
347
348 /// AMDGPU subtarget info.
349 const GCNSubtarget &ST;
350
351 /// Instruction info.
352 const SIInstrInfo *TII = nullptr;
353
354 IsaVersion IV;
355
356 /// Whether to insert cache invalidating instructions.
357 bool InsertCacheInv;
358
359 SICacheControl(const GCNSubtarget &ST);
360
361 /// Sets CPol \p Bits to "true" if present in instruction \p MI.
362 /// \returns Returns true if \p MI is modified, false otherwise.
363 bool enableCPolBits(const MachineBasicBlock::iterator MI,
364 unsigned Bits) const;
365
366 /// Check if any atomic operation on AS can affect memory accessible via the
367 /// global address space.
368 bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
369
370public:
371 using CPol = AMDGPU::CPol::CPol;
372
373 /// Create a cache control for the subtarget \p ST.
374 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
375
376 /// Update \p MI memory load instruction to bypass any caches up to
377 /// the \p Scope memory scope for address spaces \p
378 /// AddrSpace. Return true iff the instruction was modified.
379 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
380 SIAtomicScope Scope,
381 SIAtomicAddrSpace AddrSpace) const = 0;
382
383 /// Update \p MI memory store instruction to bypass any caches up to
384 /// the \p Scope memory scope for address spaces \p
385 /// AddrSpace. Return true iff the instruction was modified.
386 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
387 SIAtomicScope Scope,
388 SIAtomicAddrSpace AddrSpace) const = 0;
389
390 /// Update \p MI memory read-modify-write instruction to bypass any caches up
391 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
392 /// iff the instruction was modified.
393 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
394 SIAtomicScope Scope,
395 SIAtomicAddrSpace AddrSpace) const = 0;
396
397 /// Update \p MI memory instruction of kind \p Op associated with address
398 /// spaces \p AddrSpace to indicate it is volatile and/or
399 /// nontemporal/last-use. Return true iff the instruction was modified.
400 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
401 SIAtomicAddrSpace AddrSpace,
402 SIMemOp Op, bool IsVolatile,
403 bool IsNonTemporal,
404 bool IsLastUse = false) const = 0;
405
406 /// Add final touches to a `mayStore` instruction \p MI, which may be a
407 /// Store or RMW instruction.
408 /// FIXME: This takes a MI because iterators aren't handled properly. When
409 /// this is called, they often point to entirely different insts. Thus we back
410 /// up the inst early and pass it here instead.
411 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
412 return false;
413 };
414
415 /// Handle cooperative load/store atomics.
416 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
418 "cooperative atomics are not available on this architecture");
419 }
420
421 /// Inserts any necessary instructions at position \p Pos relative
422 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
423 /// \p Op associated with address spaces \p AddrSpace have completed. Used
424 /// between memory instructions to enforce the order they become visible as
425 /// observed by other memory instructions executing in memory scope \p Scope.
426 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
427 /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
428 /// that are used by atomic instructions.
429 /// Returns true iff any instructions inserted.
430 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
431 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
432 bool IsCrossAddrSpaceOrdering, Position Pos,
433 AtomicOrdering Order, bool AtomicsOnly) const = 0;
434
435 /// Inserts any necessary instructions at position \p Pos relative to
436 /// instruction \p MI to ensure any subsequent memory instructions of this
437 /// thread with address spaces \p AddrSpace will observe the previous memory
438 /// operations by any thread for memory scopes up to memory scope \p Scope .
439 /// Returns true iff any instructions inserted.
440 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
441 SIAtomicScope Scope,
442 SIAtomicAddrSpace AddrSpace,
443 Position Pos) const = 0;
444
445 /// Inserts any necessary writeback instructions at position \p Pos relative
446 /// to instruction \p MI to make previous memory operations by this thread
447 /// with address spaces \p AddrSpace available to other threads in memory
448 /// scope \p Scope. Does not insert waits; callers must call insertWait
449 /// separately. Returns true iff any instructions inserted.
450 virtual bool insertWriteback(MachineBasicBlock::iterator &MI,
451 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace,
452 Position Pos) const = 0;
453
454 /// Inserts writeback followed by an unconditional wait to implement a
455 /// release operation.
456 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
457 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
458 Position Pos) const {
459 bool Changed = insertWriteback(MI, Scope, AddrSpace, Pos);
460 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
461 IsCrossAddrSpaceOrdering, Pos,
462 AtomicOrdering::Release, /*AtomicsOnly=*/false);
463 return Changed;
464 }
465
466 /// Handle operations that are considered non-volatile.
467 /// See \ref isNonVolatileMemoryAccess
468 virtual bool handleNonVolatile(MachineInstr &MI) const { return false; }
469
470 /// Virtual destructor to allow derivations to be deleted.
471 virtual ~SICacheControl() = default;
472};
473
474/// Generates code sequences for the memory model of all GFX targets below
475/// GFX10.
476class SIGfx6CacheControl final : public SICacheControl {
477public:
478
479 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
480
481 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
482 SIAtomicScope Scope,
483 SIAtomicAddrSpace AddrSpace) const override;
484
485 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
486 SIAtomicScope Scope,
487 SIAtomicAddrSpace AddrSpace) const override;
488
489 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
490 SIAtomicScope Scope,
491 SIAtomicAddrSpace AddrSpace) const override;
492
493 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
494 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
495 bool IsVolatile, bool IsNonTemporal,
496 bool IsLastUse) const override;
497
498 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
499 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
500 bool IsCrossAddrSpaceOrdering, Position Pos,
501 AtomicOrdering Order, bool AtomicsOnly) const override;
502
503 bool insertAcquire(MachineBasicBlock::iterator &MI,
504 SIAtomicScope Scope,
505 SIAtomicAddrSpace AddrSpace,
506 Position Pos) const override;
507
508 bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
509 SIAtomicAddrSpace AddrSpace,
510 Position Pos) const override;
511};
512
513/// Generates code sequences for the memory model of GFX10/11.
514class SIGfx10CacheControl final : public SICacheControl {
515public:
516 SIGfx10CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
517
518 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
519 SIAtomicScope Scope,
520 SIAtomicAddrSpace AddrSpace) const override;
521
522 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
523 SIAtomicScope Scope,
524 SIAtomicAddrSpace AddrSpace) const override {
525 return false;
526 }
527
528 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
529 SIAtomicScope Scope,
530 SIAtomicAddrSpace AddrSpace) const override {
531 return false;
532 }
533
534 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
535 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
536 bool IsVolatile, bool IsNonTemporal,
537 bool IsLastUse) const override;
538
539 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
540 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
541 bool IsCrossAddrSpaceOrdering, Position Pos,
542 AtomicOrdering Order, bool AtomicsOnly) const override;
543
544 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
545 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
546
547 bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
548 SIAtomicAddrSpace AddrSpace,
549 Position Pos) const override {
550 return false;
551 }
552};
553
554class SIGfx12CacheControl final : public SICacheControl {
555protected:
556 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
557 // \returns Returns true if \p MI is modified, false otherwise.
558 bool setTH(const MachineBasicBlock::iterator MI,
560
561 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
562 // MI. \returns Returns true if \p MI is modified, false otherwise.
563 bool setScope(const MachineBasicBlock::iterator MI,
565
566 // Stores with system scope (SCOPE_SYS) need to wait for:
567 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
568 // - non-returning-atomics - wait for STORECNT==0
569 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
570 // since it does not distinguish atomics-with-return from regular stores.
571 // There is no need to wait if memory is cached (mtype != UC).
572 bool
573 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
574
575 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
576 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
577
578public:
579 SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
580 // GFX120x and GFX125x memory models greatly overlap, and in some cases
581 // the behavior is the same if assuming GFX120x in CU mode.
582 assert(!ST.hasGFX1250Insts() || ST.hasGFX13Insts() || ST.isCuModeEnabled());
583 }
584
585 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
586 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
587 bool IsCrossAddrSpaceOrdering, Position Pos,
588 AtomicOrdering Order, bool AtomicsOnly) const override;
589
590 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
591 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
592
593 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
594 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
595 bool IsVolatile, bool IsNonTemporal,
596 bool IsLastUse) const override;
597
598 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
599
600 bool handleCooperativeAtomic(MachineInstr &MI) const override;
601
602 bool insertWriteback(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
603 SIAtomicAddrSpace AddrSpace,
604 Position Pos) const override;
605
606 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
607 SIAtomicScope Scope,
608 SIAtomicAddrSpace AddrSpace) const override {
609 return setAtomicScope(MI, Scope, AddrSpace);
610 }
611
612 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
613 SIAtomicScope Scope,
614 SIAtomicAddrSpace AddrSpace) const override {
615 return setAtomicScope(MI, Scope, AddrSpace);
616 }
617
618 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
619 SIAtomicScope Scope,
620 SIAtomicAddrSpace AddrSpace) const override {
621 return setAtomicScope(MI, Scope, AddrSpace);
622 }
623
624 bool handleNonVolatile(MachineInstr &MI) const override;
625};
626
627class SIMemoryLegalizer final {
628private:
629 const MachineModuleInfo &MMI;
630 /// Cache Control.
631 std::unique_ptr<SICacheControl> CC = nullptr;
632
633 /// List of atomic pseudo instructions.
634 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
635
636 /// Return true iff instruction \p MI is a atomic instruction that
637 /// returns a result.
638 bool isAtomicRet(const MachineInstr &MI) const {
640 }
641
642 /// Removes all processed atomic pseudo instructions from the current
643 /// function. Returns true if current function is modified, false otherwise.
644 bool removeAtomicPseudoMIs();
645
646 /// Expands load operation \p MI. Returns true if instructions are
647 /// added/deleted or \p MI is modified, false otherwise.
648 bool expandLoad(const SIMemOpInfo &MOI,
650 /// Expands store operation \p MI. Returns true if instructions are
651 /// added/deleted or \p MI is modified, false otherwise.
652 bool expandStore(const SIMemOpInfo &MOI,
654 /// Expands atomic fence operation \p MI. Returns true if
655 /// instructions are added/deleted or \p MI is modified, false otherwise.
656 bool expandAtomicFence(const SIMemOpInfo &MOI,
658 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
659 /// instructions are added/deleted or \p MI is modified, false otherwise.
660 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
662 /// Expands LDS DMA operation \p MI. Returns true if instructions are
663 /// added/deleted or \p MI is modified, false otherwise.
664 bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
665
666public:
667 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
668 bool run(MachineFunction &MF);
669};
670
671class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
672public:
673 static char ID;
674
675 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
676
677 void getAnalysisUsage(AnalysisUsage &AU) const override {
678 AU.setPreservesCFG();
680 }
681
682 StringRef getPassName() const override {
683 return PASS_NAME;
684 }
685
686 bool runOnMachineFunction(MachineFunction &MF) override;
687};
688
689static const StringMap<SIAtomicAddrSpace> ASNames = {{
690 {"global", SIAtomicAddrSpace::GLOBAL},
691 {"local", SIAtomicAddrSpace::LDS},
692}};
693
694void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
695 const MachineFunction *MF = MI.getMF();
696 const Function &Fn = MF->getFunction();
698 raw_svector_ostream OS(Str);
699 OS << "unknown address space '" << AS << "'; expected one of ";
701 for (const auto &[Name, Val] : ASNames)
702 OS << LS << '\'' << Name << '\'';
703 Fn.getContext().diagnose(
704 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
705}
706
707/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
708/// If this tag isn't present, or if it has no meaningful values, returns
709/// \p none, otherwise returns the address spaces specified by the MD.
710static std::optional<SIAtomicAddrSpace>
711getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
712 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
713
714 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
715 if (!MMRA)
716 return std::nullopt;
717
718 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
719 for (const auto &[Prefix, Suffix] : MMRA) {
720 if (Prefix != FenceASPrefix)
721 continue;
722
723 if (auto It = ASNames.find(Suffix); It != ASNames.end())
724 Result |= It->second;
725 else
726 diagnoseUnknownMMRAASName(MI, Suffix);
727 }
728
729 if (Result == SIAtomicAddrSpace::NONE)
730 return std::nullopt;
731
732 return Result;
733}
734
735} // end anonymous namespace
736
737void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
738 const char *Msg) const {
739 const Function &Func = MI->getMF()->getFunction();
740 Func.getContext().diagnose(
741 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
742}
743
744std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
745SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
746 SIAtomicAddrSpace InstrAddrSpace) const {
747 if (SSID == SyncScope::System)
748 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
749 if (SSID == MMI->getAgentSSID())
750 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
751 if (SSID == MMI->getClusterSSID())
752 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
753 if (SSID == MMI->getWorkgroupSSID())
754 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
755 true);
756 if (SSID == MMI->getWavefrontSSID())
757 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
758 true);
759 if (SSID == SyncScope::SingleThread)
760 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
761 true);
762 if (SSID == MMI->getSystemOneAddressSpaceSSID())
763 return std::tuple(SIAtomicScope::SYSTEM,
764 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
765 if (SSID == MMI->getAgentOneAddressSpaceSSID())
766 return std::tuple(SIAtomicScope::AGENT,
767 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
768 if (SSID == MMI->getClusterOneAddressSpaceSSID())
769 return std::tuple(SIAtomicScope::CLUSTER,
770 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
771 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
772 return std::tuple(SIAtomicScope::WORKGROUP,
773 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
774 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
775 return std::tuple(SIAtomicScope::WAVEFRONT,
776 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
777 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
778 return std::tuple(SIAtomicScope::SINGLETHREAD,
779 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
780 return std::nullopt;
781}
782
783SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
784 if (AS == AMDGPUAS::FLAT_ADDRESS)
785 return SIAtomicAddrSpace::FLAT;
786 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
787 return SIAtomicAddrSpace::GLOBAL;
788 if (AS == AMDGPUAS::LOCAL_ADDRESS)
789 return SIAtomicAddrSpace::LDS;
791 return SIAtomicAddrSpace::SCRATCH;
792 if (AS == AMDGPUAS::REGION_ADDRESS)
793 return SIAtomicAddrSpace::GDS;
796 return SIAtomicAddrSpace::GLOBAL;
797
798 return SIAtomicAddrSpace::OTHER;
799}
800
801// TODO: Consider moving single-wave workgroup->wavefront scope relaxation to an
802// IR pass (and extending it to other scoped operations), so middle-end
803// optimizations see wavefront scope earlier.
804SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
805 const GCNSubtarget &ST, const Function &F)
806 : MMI(&MMI_), ST(ST),
807 CanDemoteWorkgroupToWavefront(ST.isSingleWavefrontWorkgroup(F)) {}
808
809std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
810 const MachineBasicBlock::iterator &MI) const {
811 assert(MI->getNumMemOperands() > 0);
812
814 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
815 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
816 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
817 bool IsNonTemporal = true;
818 bool IsVolatile = false;
819 bool IsLastUse = false;
820 bool IsCooperative = false;
821
822 // Validator should check whether or not MMOs cover the entire set of
823 // locations accessed by the memory instruction.
824 for (const auto &MMO : MI->memoperands()) {
825 IsNonTemporal &= MMO->isNonTemporal();
826 IsVolatile |= MMO->isVolatile();
827 IsLastUse |= MMO->getFlags() & MOLastUse;
828 IsCooperative |= MMO->getFlags() & MOCooperative;
829 InstrAddrSpace |=
830 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
831 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
832 if (OpOrdering != AtomicOrdering::NotAtomic) {
833 const auto &IsSyncScopeInclusion =
834 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
835 if (!IsSyncScopeInclusion) {
836 reportUnsupported(MI,
837 "Unsupported non-inclusive atomic synchronization scope");
838 return std::nullopt;
839 }
840
841 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
842 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
843 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
844 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
845 FailureOrdering =
846 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
847 }
848 }
849
850 // FIXME: The MMO of buffer atomic instructions does not always have an atomic
851 // ordering. We only need to handle VBUFFER atomics on GFX12+ so we can fix it
852 // here, but the lowering should really be cleaned up at some point.
853 if ((ST.getGeneration() >= GCNSubtarget::GFX12) && SIInstrInfo::isBUF(*MI) &&
854 SIInstrInfo::isAtomic(*MI) && Ordering == AtomicOrdering::NotAtomic)
855 Ordering = AtomicOrdering::Monotonic;
856
857 SIAtomicScope Scope = SIAtomicScope::NONE;
858 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
859 bool IsCrossAddressSpaceOrdering = false;
860 if (Ordering != AtomicOrdering::NotAtomic) {
861 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
862 if (!ScopeOrNone) {
863 reportUnsupported(MI, "Unsupported atomic synchronization scope");
864 return std::nullopt;
865 }
866 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
867 *ScopeOrNone;
868 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
869 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
870 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
871 reportUnsupported(MI, "Unsupported atomic address space");
872 return std::nullopt;
873 }
874 }
875 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
876 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
877 IsNonTemporal, IsLastUse, IsCooperative,
878 CanDemoteWorkgroupToWavefront);
879}
880
881std::optional<SIMemOpInfo>
882SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
883 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
884
885 if (!(MI->mayLoad() && !MI->mayStore()))
886 return std::nullopt;
887
888 // Be conservative if there are no memory operands.
889 if (MI->getNumMemOperands() == 0)
890 return SIMemOpInfo(ST);
891
892 return constructFromMIWithMMO(MI);
893}
894
895std::optional<SIMemOpInfo>
896SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
897 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
898
899 if (!(!MI->mayLoad() && MI->mayStore()))
900 return std::nullopt;
901
902 // Be conservative if there are no memory operands.
903 if (MI->getNumMemOperands() == 0)
904 return SIMemOpInfo(ST);
905
906 return constructFromMIWithMMO(MI);
907}
908
909std::optional<SIMemOpInfo>
910SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
911 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
912
913 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
914 return std::nullopt;
915
917 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
918
919 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
920 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
921 if (!ScopeOrNone) {
922 reportUnsupported(MI, "Unsupported atomic synchronization scope");
923 return std::nullopt;
924 }
925
926 SIAtomicScope Scope = SIAtomicScope::NONE;
927 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
928 bool IsCrossAddressSpaceOrdering = false;
929 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
930 *ScopeOrNone;
931
932 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
933 // We currently expect refineOrderingAS to be the only place that
934 // can refine the AS ordered by the fence.
935 // If that changes, we need to review the semantics of that function
936 // in case it needs to preserve certain address spaces.
937 reportUnsupported(MI, "Unsupported atomic address space");
938 return std::nullopt;
939 }
940
941 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
942 if (SynchronizeAS)
943 OrderingAddrSpace = *SynchronizeAS;
944
945 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
946 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
947 AtomicOrdering::NotAtomic, false, false, false, false,
948 CanDemoteWorkgroupToWavefront);
949}
950
951std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
952 const MachineBasicBlock::iterator &MI) const {
953 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
954
955 if (!(MI->mayLoad() && MI->mayStore()))
956 return std::nullopt;
957
958 // Be conservative if there are no memory operands.
959 if (MI->getNumMemOperands() == 0)
960 return SIMemOpInfo(ST);
961
962 return constructFromMIWithMMO(MI);
963}
964
965std::optional<SIMemOpInfo>
966SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
967 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
968
970 return std::nullopt;
971
972 return constructFromMIWithMMO(MI);
973}
974
975/// \returns true if \p MI has one or more MMO, and all of them are fit for
976/// being marked as non-volatile. This means that either they are accessing the
977/// constant address space, are accessing a known invariant memory location, or
978/// that they are marked with the non-volatile metadata/MMO flag.
980 if (MI.getNumMemOperands() == 0)
981 return false;
982 return all_of(MI.memoperands(), [&](const MachineMemOperand *MMO) {
983 return MMO->getFlags() & (MOThreadPrivate | MachineMemOperand::MOInvariant);
984 });
985}
986
987SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
988 TII = ST.getInstrInfo();
989 IV = getIsaVersion(ST.getCPU());
990 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
991}
992
993bool SICacheControl::enableCPolBits(const MachineBasicBlock::iterator MI,
994 unsigned Bits) const {
995 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
996 if (!CPol)
997 return false;
998
999 CPol->setImm(CPol->getImm() | Bits);
1000 return true;
1001}
1002
1003bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
1004 assert((!ST.hasGloballyAddressableScratch() ||
1005 (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
1006 (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
1007 "scratch instructions should already be replaced by flat "
1008 "instructions if GloballyAddressableScratch is enabled");
1009 return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
1010}
1011
1012/* static */
1013std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
1014 GCNSubtarget::Generation Generation = ST.getGeneration();
1015 if (Generation < AMDGPUSubtarget::GFX10)
1016 return std::make_unique<SIGfx6CacheControl>(ST);
1017 if (Generation < AMDGPUSubtarget::GFX12)
1018 return std::make_unique<SIGfx10CacheControl>(ST);
1019 return std::make_unique<SIGfx12CacheControl>(ST);
1020}
1021
1022bool SIGfx6CacheControl::enableLoadCacheBypass(
1024 SIAtomicScope Scope,
1025 SIAtomicAddrSpace AddrSpace) const {
1026 assert(MI->mayLoad() && !MI->mayStore());
1027
1028 if (!canAffectGlobalAddrSpace(AddrSpace)) {
1029 /// The scratch address space does not need the global memory caches
1030 /// to be bypassed as all memory operations by the same thread are
1031 /// sequentially consistent, and no other thread can access scratch
1032 /// memory.
1033
1034 /// Other address spaces do not have a cache.
1035 return false;
1036 }
1037
1038 bool Changed = false;
1039 switch (Scope) {
1040 case SIAtomicScope::SYSTEM:
1041 if (ST.hasGFX940Insts()) {
1042 // Set SC bits to indicate system scope.
1043 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1044 break;
1045 }
1046 [[fallthrough]];
1047 case SIAtomicScope::AGENT:
1048 if (ST.hasGFX940Insts()) {
1049 // Set SC bits to indicate agent scope.
1050 Changed |= enableCPolBits(MI, CPol::SC1);
1051 } else {
1052 // Set L1 cache policy to MISS_EVICT.
1053 // Note: there is no L2 cache bypass policy at the ISA level.
1054 Changed |= enableCPolBits(MI, CPol::GLC);
1055 }
1056 break;
1057 case SIAtomicScope::WORKGROUP:
1058 if (ST.hasGFX940Insts()) {
1059 // In threadgroup split mode the waves of a work-group can be executing
1060 // on different CUs. Therefore need to bypass the L1 which is per CU.
1061 // Otherwise in non-threadgroup split mode all waves of a work-group are
1062 // on the same CU, and so the L1 does not need to be bypassed. Setting
1063 // SC bits to indicate work-group scope will do this automatically.
1064 Changed |= enableCPolBits(MI, CPol::SC0);
1065 } else if (ST.hasGFX90AInsts()) {
1066 // In threadgroup split mode the waves of a work-group can be executing
1067 // on different CUs. Therefore need to bypass the L1 which is per CU.
1068 // Otherwise in non-threadgroup split mode all waves of a work-group are
1069 // on the same CU, and so the L1 does not need to be bypassed.
1070 if (ST.isTgSplitEnabled())
1071 Changed |= enableCPolBits(MI, CPol::GLC);
1072 }
1073 break;
1074 case SIAtomicScope::WAVEFRONT:
1075 case SIAtomicScope::SINGLETHREAD:
1076 // No cache to bypass.
1077 break;
1078 default:
1079 llvm_unreachable("Unsupported synchronization scope");
1080 }
1081
1082 return Changed;
1083}
1084
1085bool SIGfx6CacheControl::enableStoreCacheBypass(
1087 SIAtomicScope Scope,
1088 SIAtomicAddrSpace AddrSpace) const {
1089 assert(!MI->mayLoad() && MI->mayStore());
1090 bool Changed = false;
1091
1092 /// For targets other than GFX940, the L1 cache is write through so does not
1093 /// need to be bypassed. There is no bypass control for the L2 cache at the
1094 /// isa level.
1095
1096 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1097 switch (Scope) {
1098 case SIAtomicScope::SYSTEM:
1099 // Set SC bits to indicate system scope.
1100 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1101 break;
1102 case SIAtomicScope::AGENT:
1103 // Set SC bits to indicate agent scope.
1104 Changed |= enableCPolBits(MI, CPol::SC1);
1105 break;
1106 case SIAtomicScope::WORKGROUP:
1107 // Set SC bits to indicate workgroup scope.
1108 Changed |= enableCPolBits(MI, CPol::SC0);
1109 break;
1110 case SIAtomicScope::WAVEFRONT:
1111 case SIAtomicScope::SINGLETHREAD:
1112 // Leave SC bits unset to indicate wavefront scope.
1113 break;
1114 default:
1115 llvm_unreachable("Unsupported synchronization scope");
1116 }
1117
1118 /// The scratch address space does not need the global memory caches
1119 /// to be bypassed as all memory operations by the same thread are
1120 /// sequentially consistent, and no other thread can access scratch
1121 /// memory.
1122
1123 /// Other address spaces do not have a cache.
1124 }
1125
1126 return Changed;
1127}
1128
1129bool SIGfx6CacheControl::enableRMWCacheBypass(
1131 SIAtomicScope Scope,
1132 SIAtomicAddrSpace AddrSpace) const {
1133 assert(MI->mayLoad() && MI->mayStore());
1134 bool Changed = false;
1135
1136 /// For targets other than GFX940, do not set GLC for RMW atomic operations as
1137 /// L0/L1 cache is automatically bypassed, and the GLC bit is instead used to
1138 /// indicate if they are return or no-return. Note: there is no L2 cache
1139 /// coherent bypass control at the ISA level.
1140 /// For GFX90A+, RMW atomics implicitly bypass the L1 cache.
1141
1142 if (ST.hasGFX940Insts() && canAffectGlobalAddrSpace(AddrSpace)) {
1143 switch (Scope) {
1144 case SIAtomicScope::SYSTEM:
1145 // Set SC1 bit to indicate system scope.
1146 Changed |= enableCPolBits(MI, CPol::SC1);
1147 break;
1148 case SIAtomicScope::AGENT:
1149 case SIAtomicScope::WORKGROUP:
1150 case SIAtomicScope::WAVEFRONT:
1151 case SIAtomicScope::SINGLETHREAD:
1152 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1153 // to indicate system or agent scope. The SC0 bit is used to indicate if
1154 // they are return or no-return. Leave SC1 bit unset to indicate agent
1155 // scope.
1156 break;
1157 default:
1158 llvm_unreachable("Unsupported synchronization scope");
1159 }
1160 }
1161
1162 return Changed;
1163}
1164
1165bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1166 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1167 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1168 // Only handle load and store, not atomic read-modify-write insructions. The
1169 // latter use glc to indicate if the atomic returns a result and so must not
1170 // be used for cache control.
1171 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1172
1173 // Only update load and store, not LLVM IR atomic read-modify-write
1174 // instructions. The latter are always marked as volatile so cannot sensibly
1175 // handle it as do not want to pessimize all atomics. Also they do not support
1176 // the nontemporal attribute.
1177 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1178
1179 bool Changed = false;
1180
1181 if (IsVolatile) {
1182 if (ST.hasGFX940Insts()) {
1183 // Set SC bits to indicate system scope.
1184 Changed |= enableCPolBits(MI, CPol::SC0 | CPol::SC1);
1185 } else if (Op == SIMemOp::LOAD) {
1186 // Set L1 cache policy to be MISS_EVICT for load instructions
1187 // and MISS_LRU for store instructions.
1188 // Note: there is no L2 cache bypass policy at the ISA level.
1189 Changed |= enableCPolBits(MI, CPol::GLC);
1190 }
1191
1192 // Ensure operation has completed at system scope to cause all volatile
1193 // operations to be visible outside the program in a global order. Do not
1194 // request cross address space as only the global address space can be
1195 // observable outside the program, so no need to cause a waitcnt for LDS
1196 // address space operations.
1197 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1198 Position::AFTER, AtomicOrdering::Unordered,
1199 /*AtomicsOnly=*/false);
1200
1201 return Changed;
1202 }
1203
1204 if (IsNonTemporal) {
1205 if (ST.hasGFX940Insts()) {
1206 Changed |= enableCPolBits(MI, CPol::NT);
1207 } else {
1208 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1209 // for both loads and stores, and the L2 cache policy to STREAM.
1210 Changed |= enableCPolBits(MI, CPol::SLC | CPol::GLC);
1211 }
1212 return Changed;
1213 }
1214
1215 return Changed;
1216}
1217
1218bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1219 SIAtomicScope Scope,
1220 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1221 bool IsCrossAddrSpaceOrdering, Position Pos,
1222 AtomicOrdering Order,
1223 bool AtomicsOnly) const {
1224 bool Changed = false;
1225
1226 MachineBasicBlock &MBB = *MI->getParent();
1227 const DebugLoc &DL = MI->getDebugLoc();
1228
1229 if (Pos == Position::AFTER)
1230 ++MI;
1231
1232 // GFX90A+
1233 if (ST.hasGFX90AInsts() && ST.isTgSplitEnabled()) {
1234 // In threadgroup split mode the waves of a work-group can be executing on
1235 // different CUs. Therefore need to wait for global or GDS memory operations
1236 // to complete to ensure they are visible to waves in the other CUs.
1237 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1238 // the same CU, so no need to wait for global memory as all waves in the
1239 // work-group access the same the L1, nor wait for GDS as access are ordered
1240 // on a CU.
1241 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1242 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1243 (Scope == SIAtomicScope::WORKGROUP)) {
1244 // Same as <GFX90A at AGENT scope;
1245 Scope = SIAtomicScope::AGENT;
1246 }
1247 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1248 // LDS memory operations.
1249 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1250 }
1251
1252 bool VMCnt = false;
1253 bool LGKMCnt = false;
1254
1255 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1256 SIAtomicAddrSpace::NONE) {
1257 switch (Scope) {
1258 case SIAtomicScope::SYSTEM:
1259 case SIAtomicScope::AGENT:
1260 VMCnt |= true;
1261 break;
1262 case SIAtomicScope::WORKGROUP:
1263 case SIAtomicScope::WAVEFRONT:
1264 case SIAtomicScope::SINGLETHREAD:
1265 // The L1 cache keeps all memory operations in order for
1266 // wavefronts in the same work-group.
1267 break;
1268 default:
1269 llvm_unreachable("Unsupported synchronization scope");
1270 }
1271 }
1272
1273 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1274 switch (Scope) {
1275 case SIAtomicScope::SYSTEM:
1276 case SIAtomicScope::AGENT:
1277 case SIAtomicScope::WORKGROUP:
1278 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1279 // not needed as LDS operations for all waves are executed in a total
1280 // global ordering as observed by all waves. Required if also
1281 // synchronizing with global/GDS memory as LDS operations could be
1282 // reordered with respect to later global/GDS memory operations of the
1283 // same wave.
1284 LGKMCnt |= IsCrossAddrSpaceOrdering;
1285 break;
1286 case SIAtomicScope::WAVEFRONT:
1287 case SIAtomicScope::SINGLETHREAD:
1288 // The LDS keeps all memory operations in order for
1289 // the same wavefront.
1290 break;
1291 default:
1292 llvm_unreachable("Unsupported synchronization scope");
1293 }
1294 }
1295
1296 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1297 switch (Scope) {
1298 case SIAtomicScope::SYSTEM:
1299 case SIAtomicScope::AGENT:
1300 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1301 // is not needed as GDS operations for all waves are executed in a total
1302 // global ordering as observed by all waves. Required if also
1303 // synchronizing with global/LDS memory as GDS operations could be
1304 // reordered with respect to later global/LDS memory operations of the
1305 // same wave.
1306 LGKMCnt |= IsCrossAddrSpaceOrdering;
1307 break;
1308 case SIAtomicScope::WORKGROUP:
1309 case SIAtomicScope::WAVEFRONT:
1310 case SIAtomicScope::SINGLETHREAD:
1311 // The GDS keeps all memory operations in order for
1312 // the same work-group.
1313 break;
1314 default:
1315 llvm_unreachable("Unsupported synchronization scope");
1316 }
1317 }
1318
1319 if (VMCnt || LGKMCnt) {
1320 unsigned WaitCntImmediate =
1322 VMCnt ? 0 : getVmcntBitMask(IV),
1324 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1325 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1326 .addImm(WaitCntImmediate);
1327 Changed = true;
1328 }
1329
1330 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1331 // at workgroup-scoped release operations that specify the LDS address space.
1332 // SIInsertWaitcnts will later replace this with a vmcnt().
1333 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1334 Scope == SIAtomicScope::WORKGROUP &&
1335 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1336 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1337 Changed = true;
1338 }
1339
1340 if (Pos == Position::AFTER)
1341 --MI;
1342
1343 return Changed;
1344}
1345
1347 if (ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1348 return false;
1349 return !ST.isAmdPalOS() && !ST.isMesa3DOS();
1350}
1351
1352bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1353 SIAtomicScope Scope,
1354 SIAtomicAddrSpace AddrSpace,
1355 Position Pos) const {
1356 if (!InsertCacheInv)
1357 return false;
1358
1359 bool Changed = false;
1360
1361 MachineBasicBlock &MBB = *MI->getParent();
1362 const DebugLoc &DL = MI->getDebugLoc();
1363
1364 if (Pos == Position::AFTER)
1365 ++MI;
1366
1367 const unsigned InvalidateL1 = canUseBUFFER_WBINVL1_VOL(ST)
1368 ? AMDGPU::BUFFER_WBINVL1_VOL
1369 : AMDGPU::BUFFER_WBINVL1;
1370
1371 if (canAffectGlobalAddrSpace(AddrSpace)) {
1372 switch (Scope) {
1373 case SIAtomicScope::SYSTEM:
1374 if (ST.hasGFX940Insts()) {
1375 // Ensures that following loads will not see stale remote VMEM data or
1376 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1377 // and CC will never be stale due to the local memory probes.
1378 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1379 // Set SC bits to indicate system scope.
1381 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1382 // hardware does not reorder memory operations by the same wave with
1383 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1384 // remove any cache lines of earlier writes by the same wave and ensures
1385 // later reads by the same wave will refetch the cache lines.
1386 Changed = true;
1387 break;
1388 }
1389
1390 if (ST.hasGFX90AInsts()) {
1391 // Ensures that following loads will not see stale remote VMEM data or
1392 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW
1393 // and CC will never be stale due to the local memory probes.
1394 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1395 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1396 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1397 // hardware does not reorder memory operations by the same wave with
1398 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed
1399 // to remove any cache lines of earlier writes by the same wave and
1400 // ensures later reads by the same wave will refetch the cache lines.
1401 Changed = true;
1402 break;
1403 }
1404 [[fallthrough]];
1405 case SIAtomicScope::AGENT:
1406 if (ST.hasGFX940Insts()) {
1407 // Ensures that following loads will not see stale remote date or local
1408 // MTYPE NC global data. Local MTYPE RW and CC memory will never be
1409 // stale due to the memory probes.
1410 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1411 // Set SC bits to indicate agent scope.
1413 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1414 // does not reorder memory operations with respect to preceeding buffer
1415 // invalidate. The invalidate is guaranteed to remove any cache lines of
1416 // earlier writes and ensures later writes will refetch the cache lines.
1417 } else
1418 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1419 Changed = true;
1420 break;
1421 case SIAtomicScope::WORKGROUP:
1422 if (ST.isTgSplitEnabled()) {
1423 if (ST.hasGFX940Insts()) {
1424 // In threadgroup split mode the waves of a work-group can be
1425 // executing on different CUs. Therefore need to invalidate the L1
1426 // which is per CU. Otherwise in non-threadgroup split mode all waves
1427 // of a work-group are on the same CU, and so the L1 does not need to
1428 // be invalidated.
1429
1430 // Ensures L1 is invalidated if in threadgroup split mode. In
1431 // non-threadgroup split mode it is a NOP, but no point generating it
1432 // in that case if know not in that mode.
1433 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1434 // Set SC bits to indicate work-group scope.
1436 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1437 // does not reorder memory operations with respect to preceeding
1438 // buffer invalidate. The invalidate is guaranteed to remove any cache
1439 // lines of earlier writes and ensures later writes will refetch the
1440 // cache lines.
1441 Changed = true;
1442 } else if (ST.hasGFX90AInsts()) {
1443 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1444 Changed = true;
1445 }
1446 }
1447 break;
1448 case SIAtomicScope::WAVEFRONT:
1449 case SIAtomicScope::SINGLETHREAD:
1450 // For GFX940, we could generate "BUFFER_INV" but it would do nothing as
1451 // there are no caches to invalidate. All other targets have no cache to
1452 // invalidate.
1453 break;
1454 default:
1455 llvm_unreachable("Unsupported synchronization scope");
1456 }
1457 }
1458
1459 /// The scratch address space does not need the global memory cache
1460 /// to be flushed as all memory operations by the same thread are
1461 /// sequentially consistent, and no other thread can access scratch
1462 /// memory.
1463
1464 /// Other address spaces do not have a cache.
1465
1466 if (Pos == Position::AFTER)
1467 --MI;
1468
1469 return Changed;
1470}
1471
1472bool SIGfx6CacheControl::insertWriteback(MachineBasicBlock::iterator &MI,
1473 SIAtomicScope Scope,
1474 SIAtomicAddrSpace AddrSpace,
1475 Position Pos) const {
1476 if (!ST.hasGFX90AInsts())
1477 return false;
1478
1479 bool Changed = false;
1480 MachineBasicBlock &MBB = *MI->getParent();
1481 const DebugLoc &DL = MI->getDebugLoc();
1482
1483 if (Pos == Position::AFTER)
1484 ++MI;
1485
1486 if (canAffectGlobalAddrSpace(AddrSpace)) {
1487 switch (Scope) {
1488 case SIAtomicScope::SYSTEM:
1489 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1490 // hardware does not reorder memory operations by the same wave with
1491 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1492 // to initiate writeback of any dirty cache lines of earlier writes by
1493 // the same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1494 // writeback has completed.
1495 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1496 // Set SC bits to indicate system scope.
1498 Changed = true;
1499 break;
1500 case SIAtomicScope::AGENT:
1501 if (ST.hasGFX940Insts()) {
1502 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1503 // Set SC bits to indicate agent scope.
1505 Changed = true;
1506 }
1507 break;
1508 case SIAtomicScope::WORKGROUP:
1509 case SIAtomicScope::WAVEFRONT:
1510 case SIAtomicScope::SINGLETHREAD:
1511 // For GFX940, do not generate "BUFFER_WBL2" as there are no caches it
1512 // would writeback, and would require an otherwise unnecessary
1513 // "S_WAITCNT vmcnt(0)".
1514 break;
1515 default:
1516 llvm_unreachable("Unsupported synchronization scope");
1517 }
1518 }
1519
1520 if (Pos == Position::AFTER)
1521 --MI;
1522
1523 return Changed;
1524}
1525
1526bool SIGfx10CacheControl::enableLoadCacheBypass(
1527 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1528 SIAtomicAddrSpace AddrSpace) const {
1529 assert(MI->mayLoad() && !MI->mayStore());
1530 bool Changed = false;
1531
1532 if (canAffectGlobalAddrSpace(AddrSpace)) {
1533 switch (Scope) {
1534 case SIAtomicScope::SYSTEM:
1535 case SIAtomicScope::AGENT:
1536 // Set the L0 and L1 cache policies to MISS_EVICT.
1537 // Note: there is no L2 cache coherent bypass control at the ISA level.
1538 // For GFX10, set GLC+DLC, for GFX11, only set GLC.
1539 Changed |=
1540 enableCPolBits(MI, CPol::GLC | (AMDGPU::isGFX10(ST) ? CPol::DLC : 0));
1541 break;
1542 case SIAtomicScope::WORKGROUP:
1543 // In WGP mode the waves of a work-group can be executing on either CU of
1544 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1545 // CU mode all waves of a work-group are on the same CU, and so the L0
1546 // does not need to be bypassed.
1547 if (!ST.isCuModeEnabled())
1548 Changed |= enableCPolBits(MI, CPol::GLC);
1549 break;
1550 case SIAtomicScope::WAVEFRONT:
1551 case SIAtomicScope::SINGLETHREAD:
1552 // No cache to bypass.
1553 break;
1554 default:
1555 llvm_unreachable("Unsupported synchronization scope");
1556 }
1557 }
1558
1559 /// The scratch address space does not need the global memory caches
1560 /// to be bypassed as all memory operations by the same thread are
1561 /// sequentially consistent, and no other thread can access scratch
1562 /// memory.
1563
1564 /// Other address spaces do not have a cache.
1565
1566 return Changed;
1567}
1568
1569bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1570 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1571 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1572
1573 // Only handle load and store, not atomic read-modify-write insructions. The
1574 // latter use glc to indicate if the atomic returns a result and so must not
1575 // be used for cache control.
1576 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1577
1578 // Only update load and store, not LLVM IR atomic read-modify-write
1579 // instructions. The latter are always marked as volatile so cannot sensibly
1580 // handle it as do not want to pessimize all atomics. Also they do not support
1581 // the nontemporal attribute.
1582 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1583
1584 bool Changed = false;
1585
1586 if (IsVolatile) {
1587 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1588 // and MISS_LRU for store instructions.
1589 // Note: there is no L2 cache coherent bypass control at the ISA level.
1590 if (Op == SIMemOp::LOAD) {
1591 Changed |= enableCPolBits(MI, CPol::GLC | CPol::DLC);
1592 }
1593
1594 // GFX11: Set MALL NOALLOC for both load and store instructions.
1595 if (AMDGPU::isGFX11(ST))
1596 Changed |= enableCPolBits(MI, CPol::DLC);
1597
1598 // Ensure operation has completed at system scope to cause all volatile
1599 // operations to be visible outside the program in a global order. Do not
1600 // request cross address space as only the global address space can be
1601 // observable outside the program, so no need to cause a waitcnt for LDS
1602 // address space operations.
1603 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1604 Position::AFTER, AtomicOrdering::Unordered,
1605 /*AtomicsOnly=*/false);
1606 return Changed;
1607 }
1608
1609 if (IsNonTemporal) {
1610 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1611 // and L2 cache policy to STREAM.
1612 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1613 // to MISS_EVICT and the L2 cache policy to STREAM.
1614 if (Op == SIMemOp::STORE)
1615 Changed |= enableCPolBits(MI, CPol::GLC);
1616 Changed |= enableCPolBits(MI, CPol::SLC);
1617
1618 // GFX11: Set MALL NOALLOC for both load and store instructions.
1619 if (AMDGPU::isGFX11(ST))
1620 Changed |= enableCPolBits(MI, CPol::DLC);
1621
1622 return Changed;
1623 }
1624
1625 return Changed;
1626}
1627
1628bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1629 SIAtomicScope Scope,
1630 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1631 bool IsCrossAddrSpaceOrdering,
1632 Position Pos, AtomicOrdering Order,
1633 bool AtomicsOnly) const {
1634 bool Changed = false;
1635
1636 MachineBasicBlock &MBB = *MI->getParent();
1637 const DebugLoc &DL = MI->getDebugLoc();
1638
1639 if (Pos == Position::AFTER)
1640 ++MI;
1641
1642 bool VMCnt = false;
1643 bool VSCnt = false;
1644 bool LGKMCnt = false;
1645
1646 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1647 SIAtomicAddrSpace::NONE) {
1648 switch (Scope) {
1649 case SIAtomicScope::SYSTEM:
1650 case SIAtomicScope::AGENT:
1651 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1652 VMCnt |= true;
1653 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1654 VSCnt |= true;
1655 break;
1656 case SIAtomicScope::WORKGROUP:
1657 // In WGP mode the waves of a work-group can be executing on either CU of
1658 // the WGP. Therefore need to wait for operations to complete to ensure
1659 // they are visible to waves in the other CU as the L0 is per CU.
1660 // Otherwise in CU mode and all waves of a work-group are on the same CU
1661 // which shares the same L0. Note that we still need to wait when
1662 // performing a release in this mode to respect the transitivity of
1663 // happens-before, e.g. other waves of the workgroup must be able to
1664 // release the memory from another wave at a wider scope.
1665 if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
1666 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1667 VMCnt |= true;
1668 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1669 VSCnt |= true;
1670 }
1671 break;
1672 case SIAtomicScope::WAVEFRONT:
1673 case SIAtomicScope::SINGLETHREAD:
1674 // The L0 cache keeps all memory operations in order for
1675 // work-items in the same wavefront.
1676 break;
1677 default:
1678 llvm_unreachable("Unsupported synchronization scope");
1679 }
1680 }
1681
1682 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1683 switch (Scope) {
1684 case SIAtomicScope::SYSTEM:
1685 case SIAtomicScope::AGENT:
1686 case SIAtomicScope::WORKGROUP:
1687 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1688 // not needed as LDS operations for all waves are executed in a total
1689 // global ordering as observed by all waves. Required if also
1690 // synchronizing with global/GDS memory as LDS operations could be
1691 // reordered with respect to later global/GDS memory operations of the
1692 // same wave.
1693 LGKMCnt |= IsCrossAddrSpaceOrdering;
1694 break;
1695 case SIAtomicScope::WAVEFRONT:
1696 case SIAtomicScope::SINGLETHREAD:
1697 // The LDS keeps all memory operations in order for
1698 // the same wavefront.
1699 break;
1700 default:
1701 llvm_unreachable("Unsupported synchronization scope");
1702 }
1703 }
1704
1705 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1706 switch (Scope) {
1707 case SIAtomicScope::SYSTEM:
1708 case SIAtomicScope::AGENT:
1709 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1710 // is not needed as GDS operations for all waves are executed in a total
1711 // global ordering as observed by all waves. Required if also
1712 // synchronizing with global/LDS memory as GDS operations could be
1713 // reordered with respect to later global/LDS memory operations of the
1714 // same wave.
1715 LGKMCnt |= IsCrossAddrSpaceOrdering;
1716 break;
1717 case SIAtomicScope::WORKGROUP:
1718 case SIAtomicScope::WAVEFRONT:
1719 case SIAtomicScope::SINGLETHREAD:
1720 // The GDS keeps all memory operations in order for
1721 // the same work-group.
1722 break;
1723 default:
1724 llvm_unreachable("Unsupported synchronization scope");
1725 }
1726 }
1727
1728 if (VMCnt || LGKMCnt) {
1729 unsigned WaitCntImmediate =
1731 VMCnt ? 0 : getVmcntBitMask(IV),
1733 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1734 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1735 .addImm(WaitCntImmediate);
1736 Changed = true;
1737 }
1738
1739 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1740 // at workgroup-scoped release operations that specify the LDS address space.
1741 // SIInsertWaitcnts will later replace this with a vmcnt().
1742 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1743 Scope == SIAtomicScope::WORKGROUP &&
1744 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1745 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1746 Changed = true;
1747 }
1748
1749 if (VSCnt) {
1750 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
1751 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1752 .addImm(0);
1753 Changed = true;
1754 }
1755
1756 if (Pos == Position::AFTER)
1757 --MI;
1758
1759 return Changed;
1760}
1761
1762bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1763 SIAtomicScope Scope,
1764 SIAtomicAddrSpace AddrSpace,
1765 Position Pos) const {
1766 if (!InsertCacheInv)
1767 return false;
1768
1769 bool Changed = false;
1770
1771 MachineBasicBlock &MBB = *MI->getParent();
1772 const DebugLoc &DL = MI->getDebugLoc();
1773
1774 if (Pos == Position::AFTER)
1775 ++MI;
1776
1777 if (canAffectGlobalAddrSpace(AddrSpace)) {
1778 switch (Scope) {
1779 case SIAtomicScope::SYSTEM:
1780 case SIAtomicScope::AGENT:
1781 // The order of invalidates matter here. We must invalidate "outer in"
1782 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
1783 // invalidated.
1784 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1785 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1786 Changed = true;
1787 break;
1788 case SIAtomicScope::WORKGROUP:
1789 // In WGP mode the waves of a work-group can be executing on either CU of
1790 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1791 // in CU mode and all waves of a work-group are on the same CU, and so the
1792 // L0 does not need to be invalidated.
1793 if (!ST.isCuModeEnabled()) {
1794 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1795 Changed = true;
1796 }
1797 break;
1798 case SIAtomicScope::WAVEFRONT:
1799 case SIAtomicScope::SINGLETHREAD:
1800 // No cache to invalidate.
1801 break;
1802 default:
1803 llvm_unreachable("Unsupported synchronization scope");
1804 }
1805 }
1806
1807 /// The scratch address space does not need the global memory cache
1808 /// to be flushed as all memory operations by the same thread are
1809 /// sequentially consistent, and no other thread can access scratch
1810 /// memory.
1811
1812 /// Other address spaces do not have a cache.
1813
1814 if (Pos == Position::AFTER)
1815 --MI;
1816
1817 return Changed;
1818}
1819
1820bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
1821 AMDGPU::CPol::CPol Value) const {
1822 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1823 if (!CPol)
1824 return false;
1825
1826 uint64_t NewTH = Value & AMDGPU::CPol::TH;
1827 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
1828 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
1829 return true;
1830 }
1831
1832 return false;
1833}
1834
1835bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
1836 AMDGPU::CPol::CPol Value) const {
1837 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
1838 if (!CPol)
1839 return false;
1840
1841 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
1842 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
1843 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
1844 return true;
1845 }
1846
1847 return false;
1848}
1849
1850bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
1851 const MachineBasicBlock::iterator MI) const {
1852 // TODO: implement flag for frontend to give us a hint not to insert waits.
1853
1854 MachineBasicBlock &MBB = *MI->getParent();
1855 const DebugLoc &DL = MI->getDebugLoc();
1856
1857 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
1858 if (ST.hasImageInsts()) {
1859 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
1860 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
1861 }
1862 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
1863 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
1864
1865 return true;
1866}
1867
1868bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1869 SIAtomicScope Scope,
1870 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1871 bool IsCrossAddrSpaceOrdering,
1872 Position Pos, AtomicOrdering Order,
1873 bool AtomicsOnly) const {
1874 bool Changed = false;
1875
1876 MachineBasicBlock &MBB = *MI->getParent();
1877 const DebugLoc &DL = MI->getDebugLoc();
1878
1879 bool LOADCnt = false;
1880 bool DSCnt = false;
1881 bool STORECnt = false;
1882
1883 if (Pos == Position::AFTER)
1884 ++MI;
1885
1886 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1887 SIAtomicAddrSpace::NONE) {
1888 switch (Scope) {
1889 case SIAtomicScope::SYSTEM:
1890 case SIAtomicScope::AGENT:
1891 case SIAtomicScope::CLUSTER:
1892 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1893 LOADCnt |= true;
1894 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1895 STORECnt |= true;
1896 break;
1897 case SIAtomicScope::WORKGROUP:
1898 // GFX12.0:
1899 // In WGP mode the waves of a work-group can be executing on either CU
1900 // of the WGP. Therefore need to wait for operations to complete to
1901 // ensure they are visible to waves in the other CU as the L0 is per CU.
1902 //
1903 // Otherwise in CU mode and all waves of a work-group are on the same CU
1904 // which shares the same L0. Note that we still need to wait when
1905 // performing a release in this mode to respect the transitivity of
1906 // happens-before, e.g. other waves of the workgroup must be able to
1907 // release the memory from another wave at a wider scope.
1908 //
1909 // GFX12.5:
1910 // CU$ has two ports. To ensure operations are visible at the workgroup
1911 // level, we need to ensure all operations in this port have completed
1912 // so the other SIMDs in the WG can see them. There is no ordering
1913 // guarantee between the ports.
1914 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
1915 isReleaseOrStronger(Order)) {
1916 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1917 LOADCnt |= true;
1918 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1919 STORECnt |= true;
1920 }
1921 break;
1922 case SIAtomicScope::WAVEFRONT:
1923 case SIAtomicScope::SINGLETHREAD:
1924 // The L0 cache keeps all memory operations in order for
1925 // work-items in the same wavefront.
1926 break;
1927 default:
1928 llvm_unreachable("Unsupported synchronization scope");
1929 }
1930 }
1931
1932 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1933 switch (Scope) {
1934 case SIAtomicScope::SYSTEM:
1935 case SIAtomicScope::AGENT:
1936 case SIAtomicScope::CLUSTER:
1937 case SIAtomicScope::WORKGROUP:
1938 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1939 // not needed as LDS operations for all waves are executed in a total
1940 // global ordering as observed by all waves. Required if also
1941 // synchronizing with global/GDS memory as LDS operations could be
1942 // reordered with respect to later global/GDS memory operations of the
1943 // same wave.
1944 DSCnt |= IsCrossAddrSpaceOrdering;
1945 break;
1946 case SIAtomicScope::WAVEFRONT:
1947 case SIAtomicScope::SINGLETHREAD:
1948 // The LDS keeps all memory operations in order for
1949 // the same wavefront.
1950 break;
1951 default:
1952 llvm_unreachable("Unsupported synchronization scope");
1953 }
1954 }
1955
1956 if (LOADCnt) {
1957 // Acquire sequences only need to wait on the previous atomic operation.
1958 // e.g. a typical sequence looks like
1959 // atomic load
1960 // (wait)
1961 // global_inv
1962 //
1963 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
1964 // to be tracked using loadcnt.
1965 //
1966 // This also applies to fences. Fences cannot pair with an instruction
1967 // tracked with bvh/samplecnt as we don't have any atomics that do that.
1968 if (!AtomicsOnly && ST.hasImageInsts()) {
1969 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
1970 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
1971 }
1972 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
1973 Changed = true;
1974 }
1975
1976 if (STORECnt) {
1977 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
1978 Changed = true;
1979 }
1980
1981 if (DSCnt) {
1982 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
1983 Changed = true;
1984 }
1985
1986 if (Pos == Position::AFTER)
1987 --MI;
1988
1989 return Changed;
1990}
1991
/// Insert a GLOBAL_INV implementing an acquire at \p Scope, before or after
/// \p MI according to \p Pos.
///
/// Returns false without inserting anything when cache invalidation is
/// disabled (InsertCacheInv), when \p AddrSpace cannot affect the global
/// address space, for wavefront/single-thread scope, and for workgroup scope
/// in CU mode. Otherwise returns true. On subtargets that report
/// hasINVWBL2WaitCntRequirement(), scopes wider than CLUSTER additionally get
/// a load wait requested through insertWait() after the invalidate.
1992bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1993 SIAtomicScope Scope,
1994 SIAtomicAddrSpace AddrSpace,
1995 Position Pos) const {
1996 if (!InsertCacheInv)
1997 return false;
1998
1999 MachineBasicBlock &MBB = *MI->getParent();
2000 const DebugLoc &DL = MI->getDebugLoc();
2001
2002 /// The scratch address space does not need the global memory cache
2003 /// to be flushed as all memory operations by the same thread are
2004 /// sequentially consistent, and no other thread can access scratch
2005 /// memory.
2006
2007 /// Other address spaces do not have a cache.
2008 if (!canAffectGlobalAddrSpace(AddrSpace))
2009 return false;
2010
// NOTE(review): listing line 2011 is absent from this extract. ScopeImm is
// used below without a visible declaration, so it was presumably declared
// there -- confirm against the upstream file.
// The switch maps the atomic scope onto the cpol scope immediate of the
// invalidate, or returns early when no invalidate is needed.
2012 switch (Scope) {
2013 case SIAtomicScope::SYSTEM:
2014 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2015 break;
2016 case SIAtomicScope::AGENT:
2017 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2018 break;
2019 case SIAtomicScope::CLUSTER:
2020 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2021 break;
2022 case SIAtomicScope::WORKGROUP:
2023 // GFX12.0:
2024 // In WGP mode the waves of a work-group can be executing on either CU of
2025 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2026 // Otherwise in CU mode all waves of a work-group are on the same CU, and
2027 // so the L0 does not need to be invalidated.
2028 //
2029 // GFX12.5 has a shared WGP$, so no invalidates are required.
2030 if (ST.isCuModeEnabled())
2031 return false;
2032
2033 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2034 break;
2035 case SIAtomicScope::WAVEFRONT:
2036 case SIAtomicScope::SINGLETHREAD:
2037 // No cache to invalidate.
2038 return false;
2039 default:
2040 llvm_unreachable("Unsupported synchronization scope");
2041 }
2042
2043 if (Pos == Position::AFTER)
2044 ++MI;
2045
2046 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2047
// For Position::AFTER this leaves MI on the GLOBAL_INV just inserted.
2048 if (Pos == Position::AFTER)
2049 --MI;
2050
2051 // Target requires a waitcnt to ensure that the preceding INV has completed
2052 // as it may get reordered with following load instructions.
2053 if (ST.hasINVWBL2WaitCntRequirement() && Scope > SIAtomicScope::CLUSTER) {
2054 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD,
2055 /*IsCrossAddrSpaceOrdering=*/false, Pos, AtomicOrdering::Acquire,
2056 /*AtomicsOnly=*/false);
2057
// NOTE(review): insertWait() has already stepped MI back once for
// Position::AFTER, so this is a second decrement -- confirm it is intended
// when more than one wait was inserted.
2058 if (Pos == Position::AFTER)
2059 --MI;
2060 }
2061
2062 return true;
2063}
2064
2065bool SIGfx12CacheControl::insertWriteback(MachineBasicBlock::iterator &MI,
2066 SIAtomicScope Scope,
2067 SIAtomicAddrSpace AddrSpace,
2068 Position Pos) const {
2069 // The scratch address space does not need the global memory cache
2070 // writeback as all memory operations by the same thread are
2071 // sequentially consistent, and no other thread can access scratch
2072 // memory.
2073 if (!canAffectGlobalAddrSpace(AddrSpace))
2074 return false;
2075
2076 bool Changed = false;
2077 MachineBasicBlock &MBB = *MI->getParent();
2078 const DebugLoc &DL = MI->getDebugLoc();
2079
2080 if (Pos == Position::AFTER)
2081 ++MI;
2082
2083 // global_wb is only necessary at system scope for GFX12.0,
2084 // they're also necessary at device scope for GFX12.5 as stores
2085 // cannot report completion earlier than L2.
2086 //
2087 // Emitting it for lower scopes is a slow no-op, so we omit it
2088 // for performance.
2089 std::optional<AMDGPU::CPol::CPol> NeedsWB;
2090 switch (Scope) {
2091 case SIAtomicScope::SYSTEM:
2092 NeedsWB = AMDGPU::CPol::SCOPE_SYS;
2093 break;
2094 case SIAtomicScope::AGENT:
2095 // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
2096 if (ST.hasGFX1250Insts())
2097 NeedsWB = AMDGPU::CPol::SCOPE_DEV;
2098 break;
2099 case SIAtomicScope::CLUSTER:
2100 case SIAtomicScope::WORKGROUP:
2101 case SIAtomicScope::WAVEFRONT:
2102 case SIAtomicScope::SINGLETHREAD:
2103 break;
2104 case SIAtomicScope::NONE:
2105 llvm_unreachable("Unsupported synchronization scope");
2106 break;
2107 }
2108
2109 if (NeedsWB) {
2110 // Target requires a waitcnt to ensure that the proceeding store
2111 // proceeding store/rmw operations have completed in L2 so their data will
2112 // be written back by the WB instruction.
2113 if (ST.hasINVWBL2WaitCntRequirement()) {
2114 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2115 /*IsCrossAddrSpaceOrdering=*/false, Pos,
2116 AtomicOrdering::Release,
2117 /*AtomicsOnly=*/false);
2118 }
2119
2120 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(*NeedsWB);
2121 Changed = true;
2122 }
2123
2124 if (Pos == Position::AFTER)
2125 --MI;
2126
2127 return Changed;
2128}
2129
2130bool SIGfx12CacheControl::handleNonVolatile(MachineInstr &MI) const {
2131 // On GFX12.5, set the NV CPol bit.
2132 if (!ST.hasGFX1250Insts())
2133 return false;
2134 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2135 if (!CPol)
2136 return false;
2137 CPol->setImm(CPol->getImm() | AMDGPU::CPol::NV);
2138 return true;
2139}
2140
/// Apply the volatile / nontemporal / last-use attributes of a plain load or
/// store to \p MI.
///
/// The temporal hint is set to TH_LU for a last use, otherwise to TH_NT for a
/// nontemporal access. A volatile access gets system scope, possibly an
/// XCNT wait before it, and a wait after it for the operation to complete at
/// system scope.
///
/// \param Op  Must be exactly SIMemOp::LOAD or SIMemOp::STORE (asserted).
/// \returns true if \p MI was modified or any instruction was inserted.
2141bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2142 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2143 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2144
2145 // Only handle load and store, not atomic read-modify-write instructions.
2146 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
2147
2148 // Only update load and store, not LLVM IR atomic read-modify-write
2149 // instructions. The latter are always marked as volatile so cannot sensibly
2150 // handle it as do not want to pessimize all atomics. Also they do not support
2151 // the nontemporal attribute.
2152 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2153
2154 bool Changed = false;
2155
// Last-use takes precedence over nontemporal: only one temporal hint is set.
2156 if (IsLastUse) {
2157 // Set last-use hint.
2158 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2159 } else if (IsNonTemporal) {
2160 // Set non-temporal hint for all cache levels.
2161 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2162 }
2163
2164 if (IsVolatile) {
2165 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2166
// NOTE(review): listing line 2168 (the rest of this condition and the opening
// brace) is absent from this extract -- restore it from the upstream file.
2167 if (ST.requiresWaitXCntForSingleAccessInstructions() &&
2169 MachineBasicBlock &MBB = *MI->getParent();
2170 BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2171 Changed = true;
2172 }
2173
2174 // Ensure operation has completed at system scope to cause all volatile
2175 // operations to be visible outside the program in a global order. Do not
2176 // request cross address space as only the global address space can be
2177 // observable outside the program, so no need to cause a waitcnt for LDS
2178 // address space operations.
2179 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2180 Position::AFTER, AtomicOrdering::Unordered,
2181 /*AtomicsOnly=*/false);
2182 }
2183
2184 return Changed;
2185}
2186
/// Apply the final GFX12 fixups to a store or read-modify-write \p MI.
///
/// An atomic access may get an XCNT wait inserted before it. A non-atomic
/// store whose cpol scope is SCOPE_SYS additionally gets the waits from
/// insertWaitsBeforeSystemScopeStore() on subtargets that report
/// requiresWaitsBeforeSystemScopeStores().
///
/// \param Atomic  Whether the access is being legalized as an atomic.
/// \returns true if any instruction was inserted.
2187bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2188 assert(MI.mayStore() && "Not a Store inst");
2189 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2190 bool Changed = false;
2191
// NOTE(review): listing line 2193 (the rest of this condition and the opening
// brace) is absent from this extract -- restore it from the upstream file.
2192 if (Atomic && ST.requiresWaitXCntForSingleAccessInstructions() &&
2194 MachineBasicBlock &MBB = *MI.getParent();
2195 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2196 Changed = true;
2197 }
2198
2199 // Remaining fixes do not apply to RMWs.
2200 if (IsRMW)
2201 return Changed;
2202
2203 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2204 if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2205 return Changed;
// Here `CPol::SCOPE` names the scope mask from the CPol namespace, not the
// local operand pointer of the same name.
2206 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2207
2208 // GFX12.0 only: Extra waits needed before system scope stores.
2209 if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
2210 Scope == CPol::SCOPE_SYS)
2211 Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2212
2213 return Changed;
2214}
2215
2216bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2217 if (!ST.hasGFX1250Insts())
2218 return false;
2219
2220 // Cooperative atomics need to be SCOPE_DEV or higher.
2221 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2222 assert(CPol && "No CPol operand?");
2223 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2224 if (Scope < CPol::SCOPE_DEV)
2225 return setScope(MI, CPol::SCOPE_DEV);
2226 return false;
2227}
2228
2229bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2230 SIAtomicScope Scope,
2231 SIAtomicAddrSpace AddrSpace) const {
2232 bool Changed = false;
2233
2234 if (canAffectGlobalAddrSpace(AddrSpace)) {
2235 switch (Scope) {
2236 case SIAtomicScope::SYSTEM:
2237 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2238 break;
2239 case SIAtomicScope::AGENT:
2240 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2241 break;
2242 case SIAtomicScope::CLUSTER:
2243 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2244 break;
2245 case SIAtomicScope::WORKGROUP:
2246 // In workgroup mode, SCOPE_SE is needed as waves can executes on
2247 // different CUs that access different L0s.
2248 if (!ST.isCuModeEnabled())
2249 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2250 break;
2251 case SIAtomicScope::WAVEFRONT:
2252 case SIAtomicScope::SINGLETHREAD:
2253 // No cache to bypass.
2254 break;
2255 default:
2256 llvm_unreachable("Unsupported synchronization scope");
2257 }
2258 }
2259
2260 // The scratch address space does not need the global memory caches
2261 // to be bypassed as all memory operations by the same thread are
2262 // sequentially consistent, and no other thread can access scratch
2263 // memory.
2264
2265 // Other address spaces do not have a cache.
2266
2267 return Changed;
2268}
2269
2270bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2271 if (AtomicPseudoMIs.empty())
2272 return false;
2273
2274 for (auto &MI : AtomicPseudoMIs)
2275 MI->eraseFromParent();
2276
2277 AtomicPseudoMIs.clear();
2278 return true;
2279}
2280
/// Legalize the load at MI according to the memory-operation info \p MOI.
///
/// Atomic loads: cache bypass is enabled for monotonic, acquire and seq_cst
/// orderings; cooperative atomics get their scope fixup; a seq_cst load is
/// preceded by a wait on earlier loads and stores; acquire and seq_cst loads
/// are followed by a load wait and then an acquire.
/// Non-atomic loads only receive the volatile / nontemporal / last-use
/// treatment.
///
/// \returns true if the instruction stream was changed.
// NOTE(review): listing line 2282 (the remainder of the parameter list, which
// must declare MI, and the opening brace) is absent from this extract --
// restore it from the upstream file.
2281bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2283 assert(MI->mayLoad() && !MI->mayStore());
2284
2285 LLVM_DEBUG(dbgs() << "Expanding load: " << *MI);
2286
2287 bool Changed = false;
2288
2289 if (MOI.isAtomic()) {
2290 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2291 << ", scope=" << toString(MOI.getScope())
2292 << ", ordering-AS=" << MOI.getOrderingAddrSpace()
2293 << ", instr-AS=" << MOI.getInstrAddrSpace() << "\n");
2294 const AtomicOrdering Order = MOI.getOrdering();
2295 if (Order == AtomicOrdering::Monotonic ||
2296 Order == AtomicOrdering::Acquire ||
2297 Order == AtomicOrdering::SequentiallyConsistent) {
2298 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2299 MOI.getOrderingAddrSpace());
2300 }
2301
2302 // Handle cooperative atomics after cache bypass step, as it may override
2303 // the scope of the instruction to a greater scope.
2304 if (MOI.isCooperative())
2305 Changed |= CC->handleCooperativeAtomic(*MI);
2306
// A seq_cst load first waits, before the load, on earlier loads and stores.
2307 if (Order == AtomicOrdering::SequentiallyConsistent)
2308 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2309 SIMemOp::LOAD | SIMemOp::STORE,
2310 MOI.getIsCrossAddressSpaceOrdering(),
2311 Position::BEFORE, Order, /*AtomicsOnly=*/false);
2312
2313 if (Order == AtomicOrdering::Acquire ||
2314 Order == AtomicOrdering::SequentiallyConsistent) {
2315 // The wait below only needs to wait on the prior atomic.
2316 Changed |=
2317 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2318 SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
2319 Position::AFTER, Order, /*AtomicsOnly=*/true);
2320 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2321 MOI.getOrderingAddrSpace(),
2322 Position::AFTER);
2323 }
2324
2325 return Changed;
2326 }
2327
2328 // Atomic instructions already bypass caches to the scope specified by the
2329 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2330 // instructions need additional treatment.
2331 Changed |= CC->enableVolatileAndOrNonTemporal(
2332 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2333 MOI.isNonTemporal(), MOI.isLastUse());
2334
2335 return Changed;
2336}
2337
/// Legalize the store at MI according to the memory-operation info \p MOI.
///
/// Atomic stores: cache bypass is enabled for monotonic, release and seq_cst
/// orderings; cooperative atomics get their scope fixup; release and seq_cst
/// stores are preceded by a release. Non-atomic stores receive the volatile /
/// nontemporal treatment. Both paths finish with CC->finalizeStore().
///
/// \returns true if the instruction stream was changed.
// NOTE(review): listing line 2339 (the remainder of the parameter list, which
// must declare MI, and the opening brace) is absent from this extract --
// restore it from the upstream file.
2338bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2340 assert(!MI->mayLoad() && MI->mayStore());
2341
2342 LLVM_DEBUG(dbgs() << "Expanding store: " << *MI);
2343
2344 bool Changed = false;
2345 // FIXME: Necessary hack because iterator can lose track of the store.
// The store is captured by reference up front so finalizeStore() is applied to
// it even if the calls below move MI.
2346 MachineInstr &StoreMI = *MI;
2347
2348 if (MOI.isAtomic()) {
2349 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2350 << ", scope=" << toString(MOI.getScope())
2351 << ", ordering-AS=" << MOI.getOrderingAddrSpace()
2352 << ", instr-AS=" << MOI.getInstrAddrSpace() << "\n");
2353 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2354 MOI.getOrdering() == AtomicOrdering::Release ||
2355 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2356 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2357 MOI.getOrderingAddrSpace());
2358 }
2359
2360 // Handle cooperative atomics after cache bypass step, as it may override
2361 // the scope of the instruction to a greater scope.
2362 if (MOI.isCooperative())
2363 Changed |= CC->handleCooperativeAtomic(*MI);
2364
2365 if (MOI.getOrdering() == AtomicOrdering::Release ||
2366 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2367 Changed |= CC->insertRelease(MI, MOI.getScope(),
2368 MOI.getOrderingAddrSpace(),
2369 MOI.getIsCrossAddressSpaceOrdering(),
2370 Position::BEFORE);
2371
2372 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2373 return Changed;
2374 }
2375
2376 // Atomic instructions already bypass caches to the scope specified by the
2377 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2378 // need additional treatment.
2379 Changed |= CC->enableVolatileAndOrNonTemporal(
2380 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2381 MOI.isNonTemporal());
2382
2383 // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
2384 // instruction field, do not confuse it with atomic scope.
2385 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2386 return Changed;
2387}
2388
/// Expand the ATOMIC_FENCE pseudo at MI according to \p MOI.
///
/// The pseudo is always queued in AtomicPseudoMIs, from which
/// removeAtomicPseudoMIs() erases it. For an atomic fence, all
/// synchronization code is inserted before the pseudo: a wait for a pure
/// acquire fence, a release for release / acq_rel / seq_cst, and an acquire
/// for acquire / acq_rel / seq_cst.
///
/// \returns true if any instruction was inserted.
// NOTE(review): listing line 2390 (the remainder of the parameter list, which
// must declare MI, and the opening brace) is absent from this extract --
// restore it from the upstream file.
2389bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2391 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2392
2393 LLVM_DEBUG(dbgs() << "Expanding atomic fence: " << *MI);
2394
2395 AtomicPseudoMIs.push_back(MI);
2396 bool Changed = false;
2397
2398 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2399
2400 if (MOI.isAtomic()) {
2401 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2402 << ", scope=" << toString(MOI.getScope())
2403 << ", ordering-AS=" << OrderingAddrSpace << "\n");
2404 const AtomicOrdering Order = MOI.getOrdering();
2405 if (Order == AtomicOrdering::Acquire) {
2406 // Acquire fences only need to wait on the previous atomic they pair with.
2407 Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
2408 SIMemOp::LOAD | SIMemOp::STORE,
2409 MOI.getIsCrossAddressSpaceOrdering(),
2410 Position::BEFORE, Order, /*AtomicsOnly=*/true);
2411 }
2412
2413 if (Order == AtomicOrdering::Release ||
2414 Order == AtomicOrdering::AcquireRelease ||
2415 Order == AtomicOrdering::SequentiallyConsistent)
2416 /// TODO: This relies on a barrier always generating a waitcnt
2417 /// for LDS to ensure it is not reordered with the completion of
2418 /// the preceding LDS operations. If barrier had a memory
2419 /// ordering and memory scope, then library does not need to
2420 /// generate a fence. Could add support in this file for
2421 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2422 /// adding S_WAITCNT before a S_BARRIER.
2423 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2424 MOI.getIsCrossAddressSpaceOrdering(),
2425 Position::BEFORE);
2426
2427 // TODO: If both release and invalidate are happening they could be combined
2428 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2429 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2430 // track cache invalidate and write back instructions.
2431
2432 if (Order == AtomicOrdering::Acquire ||
2433 Order == AtomicOrdering::AcquireRelease ||
2434 Order == AtomicOrdering::SequentiallyConsistent)
2435 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2436 Position::BEFORE);
2437
2438 return Changed;
2439 }
2440
2441 return Changed;
2442}
2443
/// Legalize the atomic cmpxchg / read-modify-write at MI according to
/// \p MOI.
///
/// For an atomic operation: cache bypass is enabled for every ordering from
/// monotonic upwards; a release is inserted before the instruction when the
/// success ordering is release / acq_rel / seq_cst or the failure ordering is
/// seq_cst; a wait plus acquire is inserted after it when the success
/// ordering is acquire / acq_rel / seq_cst or the failure ordering is acquire
/// / seq_cst; finally CC->finalizeStore() is applied. Non-atomic instructions
/// are left untouched.
///
/// \returns true if the instruction stream was changed.
// NOTE(review): listing line 2445 (the remainder of the parameter list, which
// must declare MI, and the opening brace) is absent from this extract --
// restore it from the upstream file.
2444bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2446 assert(MI->mayLoad() && MI->mayStore());
2447
2448 LLVM_DEBUG(dbgs() << "Expanding atomic cmpxchg/rmw: " << *MI);
2449
2450 bool Changed = false;
// Captured up front so finalizeStore() is applied to the RMW itself even after
// the calls below have moved MI.
2451 MachineInstr &RMWMI = *MI;
2452
2453 if (MOI.isAtomic()) {
2454 LLVM_DEBUG(dbgs() << " Atomic: ordering=" << toIRString(MOI.getOrdering())
2455 << ", failure-ordering="
2456 << toIRString(MOI.getFailureOrdering())
2457 << ", scope=" << toString(MOI.getScope())
2458 << ", ordering-AS=" << MOI.getOrderingAddrSpace()
2459 << ", instr-AS=" << MOI.getInstrAddrSpace() << "\n");
2460 const AtomicOrdering Order = MOI.getOrdering();
2461 if (Order == AtomicOrdering::Monotonic ||
2462 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2463 Order == AtomicOrdering::AcquireRelease ||
2464 Order == AtomicOrdering::SequentiallyConsistent) {
2465 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2466 MOI.getInstrAddrSpace());
2467 }
2468
2469 if (Order == AtomicOrdering::Release ||
2470 Order == AtomicOrdering::AcquireRelease ||
2471 Order == AtomicOrdering::SequentiallyConsistent ||
2472 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2473 Changed |= CC->insertRelease(MI, MOI.getScope(),
2474 MOI.getOrderingAddrSpace(),
2475 MOI.getIsCrossAddressSpaceOrdering(),
2476 Position::BEFORE);
2477
2478 if (Order == AtomicOrdering::Acquire ||
2479 Order == AtomicOrdering::AcquireRelease ||
2480 Order == AtomicOrdering::SequentiallyConsistent ||
2481 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2482 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2483 // Only wait on the previous atomic.
// The wait is requested as a load wait when isAtomicRet(*MI) holds and as a
// store wait otherwise.
2484 Changed |=
2485 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2486 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2487 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
2488 Order, /*AtomicsOnly=*/true);
2489 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2490 MOI.getOrderingAddrSpace(),
2491 Position::AFTER);
2492 }
2493
2494 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2495 return Changed;
2496 }
2497
2498 return Changed;
2499}
2500
/// Apply the volatile / nontemporal / last-use markers of \p MOI to the LDS
/// DMA instruction at MI by forwarding to
/// CC->enableVolatileAndOrNonTemporal().
///
/// \returns the result of that call.
// NOTE(review): listing line 2502 (the remainder of the parameter list, which
// must declare MI, and the opening brace) is absent from this extract --
// restore it from the upstream file.
2501bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
2503 assert(MI->mayLoad() && MI->mayStore());
2504
2505 LLVM_DEBUG(dbgs() << "Expanding LDS DMA: " << *MI);
2506
2507 // The volatility or nontemporal-ness of the operation is a
2508 // function of the global memory, not the LDS.
// An instruction that may write LDS through DMA is classified as a LOAD,
// anything else as a STORE.
2509 SIMemOp OpKind =
2510 SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
2511
2512 // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
2513 // stores. The operation is treated as a volatile/nontemporal store
2514 // to its second argument.
// NOTE(review): the sentence above says "store", yet OpKind can be LOAD --
// confirm which wording is intended.
2515 return CC->enableVolatileAndOrNonTemporal(
2516 MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
2517 MOI.isNonTemporal(), MOI.isLastUse());
2518}
2519
2520bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2521 const MachineModuleInfo &MMI =
2522 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2523 return SIMemoryLegalizer(MMI).run(MF);
2524}
2525
// New pass-manager entry point. The visible lines look up a cached
// MachineModuleAnalysis result for the module containing MF, assert that it
// is available, run the legalizer, and report all analyses preserved when the
// legalizer changed nothing.
// NOTE(review): listing lines 2527-2529 (function name, parameters and the
// start of the MMI initializer) and 2535 (the return taken when the function
// was changed) are absent from this extract -- restore them from the upstream
// file.
2526PreservedAnalyses
2530 .getCachedResult<MachineModuleAnalysis>(
2531 *MF.getFunction().getParent());
2532 assert(MMI && "MachineModuleAnalysis must be available");
2533 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2534 return PreservedAnalyses::all();
2536}
2537
/// Run the memory legalizer over every instruction of \p MF.
///
/// A cache-control object is created for the subtarget, memory-accessing
/// bundles are unbundled, and each instruction flagged maybeAtomic is handed
/// to the first expand* routine whose info query succeeds. Atomic pseudo
/// instructions queued along the way are erased at the end.
///
/// \returns true if \p MF was modified.
2538bool SIMemoryLegalizer::run(MachineFunction &MF) {
2539 bool Changed = false;
2540
2541 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2542 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST,
2543 MF.getFunction());
2544 CC = SICacheControl::create(ST);
2545
2546 for (auto &MBB : MF) {
2547 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2548
2549 // Unbundle instructions after the post-RA scheduler.
2550 if (MI->isBundle() && MI->mayLoadOrStore()) {
// Detach every instruction bundled behind the header and clear the
// internal-read flag on its register operands.
2551 MachineBasicBlock::instr_iterator II(MI->getIterator());
2552 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2553 I != E && I->isBundledWithPred(); ++I) {
2554 I->unbundleFromPred();
2555 for (MachineOperand &MO : I->operands())
2556 if (MO.isReg())
2557 MO.setIsInternalRead(false);
2558 }
2559
// Erase the bundle header and continue with the instruction that follows it.
2560 MI = MI->eraseFromParent();
2561 }
2562
// The queries are tried in a fixed order: load, store, LDS DMA, fence, then
// cmpxchg/RMW.
2563 if (MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic) {
2564 if (const auto &MOI = MOA.getLoadInfo(MI))
2565 Changed |= expandLoad(*MOI, MI);
2566 else if (const auto &MOI = MOA.getStoreInfo(MI))
2567 Changed |= expandStore(*MOI, MI);
2568 else if (const auto &MOI = MOA.getLDSDMAInfo(MI))
2569 Changed |= expandLDSDMA(*MOI, MI);
2570 else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2571 Changed |= expandAtomicFence(*MOI, MI);
2572 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2573 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2574 }
2575
// NOTE(review): listing line 2576 is absent from this extract; it presumably
// held the condition guarding the call below -- restore it from the upstream
// file.
2577 Changed |= CC->handleNonVolatile(*MI);
2578 }
2579 }
2580
2581 Changed |= removeAtomicPseudoMIs();
2582 return Changed;
2583}
2584
// Register the legacy pass under DEBUG_TYPE with the description PASS_NAME;
// the two trailing `false` arguments are the macro's cfg and analysis flags.
2585INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2586
// Definition of the legacy pass's ID member, and the llvm::SIMemoryLegalizerID
// reference bound to it.
2587char SIMemoryLegalizerLegacy::ID = 0;
2588char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2589
// Factory body: returns a newly allocated SIMemoryLegalizerLegacy.
// NOTE(review): listing line 2590 (the function header) is absent from this
// extract -- restore it from the upstream file.
2591 return new SIMemoryLegalizerLegacy();
2592}
static std::optional< LoadInfo > getLoadInfo(const MachineInstr &MI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU address space definition.
Provides AMDGPU specific target descriptions.
AMDGPU Machine Module Info.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
This header defines various interfaces for pass management in LLVM.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility for Memory Model Relaxation Annotations (MMRAs).
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static cl::opt< bool > AmdgcnSkipCacheInvalidations("amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, cl::desc("Use this to skip inserting cache invalidating instructions."))
static bool isNonVolatileMemoryAccess(const MachineInstr &MI)
#define PASS_NAME
static bool canUseBUFFER_WBINVL1_VOL(const GCNSubtarget &ST)
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:119
#define PASS_NAME
static const uint32_t IV[8]
Definition blake3_impl.h:83
SyncScope::ID getClusterOneAddressSpaceSSID() const
std::optional< bool > isSyncScopeInclusion(SyncScope::ID A, SyncScope::ID B) const
In AMDGPU target synchronization scopes are inclusive, meaning a larger synchronization scope is incl...
SyncScope::ID getAgentOneAddressSpaceSSID() const
SyncScope::ID getSingleThreadOneAddressSpaceSSID() const
SyncScope::ID getWavefrontOneAddressSpaceSSID() const
SyncScope::ID getSystemOneAddressSpaceSSID() const
SyncScope::ID getWorkgroupOneAddressSpaceSSID() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Diagnostic information for unsupported feature in backend.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
A helper class to return the specified delimiter string after the first invocation of operator String...
Helper class to manipulate !mmra metadata nodes.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
A description of a memory reference used in the backend.
Ty & getObjFileInfo()
Keep track of various per-module pieces of information for backends that would like to do so.
MachineOperand class - Representation of each machine instruction operand.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
static bool isVMEM(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isBUF(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isAtomic(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:882
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
bool isGFX10(const MCSubtargetInfo &STI)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
char & SIMemoryLegalizerID
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
bool isReleaseOrStronger(AtomicOrdering AO)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
const char * toIRString(AtomicOrdering ao)
String used by LLVM IR to represent atomic ordering.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
AtomicOrdering getMergedAtomicOrdering(AtomicOrdering AO, AtomicOrdering Other)
Return a single atomic ordering that is at least as strong as both the AO and Other orderings for an ...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
AtomicOrdering
Atomic ordering for LLVM's memory model.
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
FunctionPass * createSIMemoryLegalizerPass()
bool isStrongerThan(AtomicOrdering AO, AtomicOrdering Other)
Returns true if ao is stronger than other as defined by the AtomicOrdering lattice,...